From 087a4980f895a2e63499e1773a82726e58b0dca9 Mon Sep 17 00:00:00 2001 From: Srinivas Gorur-Shandilya Date: Tue, 3 Feb 2026 13:45:35 -0500 Subject: [PATCH 01/12] feat: initial support for data-platform --- docs/platform/ref/data.md | 52 +++++++ mkdocs.yaml | 1 + src/platform/client.py | 2 + src/platform/data.py | 300 ++++++++++++++++++++++++++++++++++++ tests/mock_server/server.py | 46 ++++++ tests/test_data.py | 117 ++++++++++++++ 6 files changed, 518 insertions(+) create mode 100644 docs/platform/ref/data.md create mode 100644 src/platform/data.py create mode 100644 tests/test_data.py diff --git a/docs/platform/ref/data.md b/docs/platform/ref/data.md new file mode 100644 index 00000000..96b50bc7 --- /dev/null +++ b/docs/platform/ref/data.md @@ -0,0 +1,52 @@ +# Data Platform API. + +The DeepOriginClient can be used to access the data platform API using: + +```{.python notest} +from deeporigin.platform.client import DeepOriginClient + +client = DeepOriginClient() +``` + +Then, the following methods can be used, for example: + +```{.python notest} +# Check the health status of the data platform +health_status = client.data.health() + +# Search ligands joined with tool results +results = client.data.search_ligands_with_results( + limit=10, + experiments=[{"toolId": "deeporigin.docking"}], +) + +# Search an entity (e.g., ligands) +results = client.data.search("ligands") + +# Search ligands using convenience method +results = client.data.search_ligands(limit=10) + +# Search proteins using convenience method +results = client.data.search_proteins(limit=10) + +# List public models +models = client.data.list_models() +``` + + +::: src.platform.data.Data + options: + heading_level: 2 + docstring_style: google + show_root_heading: true + show_category_heading: true + show_object_full_path: false + show_root_toc_entry: false + inherited_members: true + members_order: alphabetical + filters: + - "!^_" # Exclude private members (names starting with "_") + show_signature: true 
+ show_signature_annotations: true + show_if_no_docstring: true + group_by_category: true diff --git a/mkdocs.yaml b/mkdocs.yaml index 3fe49e68..d1b1d235 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -87,6 +87,7 @@ nav: - functions: platform/ref/functions.md - organizations: platform/ref/organizations.md - billing: platform/ref/billing.md + - data: platform/ref/data.md - Developing: - Installing: dev/install.md - Clients: dev/clients.md diff --git a/src/platform/client.py b/src/platform/client.py index c834655a..21550093 100644 --- a/src/platform/client.py +++ b/src/platform/client.py @@ -22,6 +22,7 @@ from deeporigin.exceptions import DeepOriginException from deeporigin.platform.billing import Billing from deeporigin.platform.clusters import Clusters +from deeporigin.platform.data import Data from deeporigin.platform.executions import Executions from deeporigin.platform.files import Files from deeporigin.platform.functions import Functions @@ -306,6 +307,7 @@ def __init__( self.executions = Executions(self) self.organizations = Organizations(self) self.billing = Billing(self) + self.data = Data(self) # Retry configuration self.max_retries = max_retries diff --git a/src/platform/data.py b/src/platform/data.py new file mode 100644 index 00000000..baa86461 --- /dev/null +++ b/src/platform/data.py @@ -0,0 +1,300 @@ +"""Data Platform API wrapper for DeepOriginClient.""" + +from __future__ import annotations + +from functools import lru_cache +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + from deeporigin.platform.client import DeepOriginClient + + +class Data: + """Data Platform API wrapper. + + Provides access to data platform-related endpoints through the DeepOriginClient. + """ + + def __init__(self, client: DeepOriginClient) -> None: + """Initialize Data wrapper. + + Args: + client: The DeepOriginClient instance to use for API calls. + """ + self._c = client + + def health(self) -> dict: + """Check the health status of the data platform. 
+ + Returns: + Dictionary containing the health status response. + """ + return self._c.get_json("/data-platform/health") + + @lru_cache(maxsize=1) # noqa: B019 + def list_models(self) -> dict: + """List public models. + + Returns: + Dictionary containing the list of models. + """ + return self._c.get_json(f"/data-platform/{self._c.org_key}/meta/models") + + def search_ligands_with_results( + self, + *, + cursor: str | None = None, + experiments: list[dict[str, str]] | None = None, + filter: dict[str, Any] | None = None, + limit: int | None = None, + offset: int | None = None, + select: list[str] | None = None, + sort: dict[str, str] | None = None, + ) -> dict: + """Search ligands joined with tool results (wide pivot view). + + Args: + cursor: Cursor for pagination. + experiments: List of experiment filters, each containing toolId and + optionally toolVersion. + filter: Additional filter criteria as a dictionary. + limit: Maximum number of results to return. Defaults to 100. + offset: Number of results to skip. + select: List of fields to select in the response. + sort: Dictionary mapping field names to sort order ("asc" or "desc"). + + Returns: + Dictionary containing the search results. 
+ """ + body: dict[str, Any] = {} + if cursor is not None: + body["cursor"] = cursor + if experiments is not None: + body["experiments"] = experiments + if filter is not None: + body["filter"] = filter + if limit is not None: + body["limit"] = limit + if offset is not None: + body["offset"] = offset + if select is not None: + body["select"] = select + if sort is not None: + body["sort"] = sort + + return self._c.post_json( + f"/data-platform/{self._c.org_key}/ligands_with_results/search", + body=body, + ) + + def search( + self, + entity: str, + *, + cursor: str | None = None, + filter_dict: dict[str, Any] | None = None, + limit: int | None = None, + offset: int | None = None, + select: list[str] | None = None, + sort: dict[str, str] | None = None, + ) -> dict: + """Search an entity (table). + + Args: + entity: Entity (table) name to search (e.g., "ligands"). + cursor: Cursor for pagination. + filter_dict: Additional filter criteria as a dictionary. + limit: Maximum number of results to return. Defaults to 100. + offset: Number of results to skip. + select: List of fields to select in the response. + sort: Dictionary mapping field names to sort order ("asc" or "desc"). + + Returns: + Dictionary containing the search results. + + Raises: + ValueError: If the entity is not a valid table name. + """ + # Validate entity against list of available models + models_response = self.list_models() + valid_table_names = { + model["tableName"] for model in models_response.get("models", []) + } + if entity not in valid_table_names: + raise ValueError( + f"Invalid entity '{entity}'. 
Valid entities are: {', '.join(sorted(valid_table_names))}" + ) + + if filter_dict is None: + filter_dict = {"deleted": False} + else: + filter_dict = filter_dict.copy() + filter_dict["deleted"] = False + + body: dict[str, Any] = {} + if cursor is not None: + body["cursor"] = cursor + + body["filter"] = filter_dict + if limit is not None: + body["limit"] = limit + if offset is not None: + body["offset"] = offset + if select is not None: + body["select"] = select + if sort is not None: + body["sort"] = sort + + return self._c.post_json( + f"/data-platform/{self._c.org_key}/{entity}/search", + body=body, + ) + + def search_ligands( + self, + *, + cursor: str | None = None, + filter: dict[str, Any] | None = None, + min_molecular_weight: float | int | None = None, + max_molecular_weight: float | int | None = None, + limit: int | None = None, + offset: int | None = None, + select: list[str] | None = None, + sort: dict[str, str] | None = None, + ) -> dict: + """Search ligands entity. + + Convenience method that calls search(entity="ligands"). + + Args: + cursor: Cursor for pagination. + filter: Additional filter criteria as a dictionary. + min_molecular_weight: Minimum molecular weight filter (inclusive). + max_molecular_weight: Maximum molecular weight filter (inclusive). + limit: Maximum number of results to return. Defaults to 100. + offset: Number of results to skip. + select: List of fields to select in the response. + sort: Dictionary mapping field names to sort order ("asc" or "desc"). + + Returns: + Dictionary containing the search results. + + Raises: + ValueError: If ligands is not a valid table name (should not happen). 
+ """ + # Build filter dict, starting with provided filter or empty dict + filter_dict = filter.copy() if filter is not None else {} + filter_dict.setdefault("deleted", False) + + # Build molecular weight filters + props = [] + if min_molecular_weight is not None: + props.append( + { + "column": "molecular_weight", + "op": "gte", + "value": min_molecular_weight, + } + ) + if max_molecular_weight is not None: + props.append( + { + "column": "molecular_weight", + "op": "lte", + "value": max_molecular_weight, + } + ) + + if props: + # Merge with existing props if any + existing_props = filter_dict.get("props", []) + filter_dict["props"] = existing_props + props + + return self.search( + "ligands", + cursor=cursor, + filter_dict=filter_dict, + limit=limit, + offset=offset, + select=select, + sort=sort, + ) + + def search_proteins( + self, + *, + cursor: str | None = None, + pdb_id: str | None = None, + min_molecular_weight: float | int | None = None, + max_molecular_weight: float | int | None = None, + sequence: str | None = None, + limit: int | None = None, + offset: int | None = None, + select: list[str] | None = None, + sort: dict[str, str] | None = None, + ) -> dict: + """Search proteins entity. + + Convenience method that calls search(entity="proteins"). + + Args: + cursor: Cursor for pagination. + pdb_id: Filter by PDB ID. + min_molecular_weight: Minimum molecular weight filter (inclusive). + max_molecular_weight: Maximum molecular weight filter (inclusive). + sequence: Filter by FASTA sequence (exact match). + limit: Maximum number of results to return. Defaults to 100. + offset: Number of results to skip. + select: List of fields to select in the response. + sort: Dictionary mapping field names to sort order ("asc" or "desc"). + + Returns: + Dictionary containing the search results. + + Raises: + ValueError: If proteins is not a valid table name (should not happen). 
+ """ + + filter_dict = {"deleted": False} + if pdb_id is not None: + filter_dict["pdb_id"] = pdb_id + + # Build molecular weight filters + props = [] + if min_molecular_weight is not None: + props.append( + { + "column": "molecular_weight", + "op": "gte", + "value": min_molecular_weight, + } + ) + if max_molecular_weight is not None: + props.append( + { + "column": "molecular_weight", + "op": "lte", + "value": max_molecular_weight, + } + ) + if sequence is not None: + props.append( + { + "column": "fasta_sequence", + "op": "eq", + "value": sequence, + } + ) + + if props: + filter_dict["props"] = props + + return self.search( + "proteins", + cursor=cursor, + filter_dict=filter_dict, + limit=limit, + offset=offset, + select=select, + sort=sort, + ) diff --git a/tests/mock_server/server.py b/tests/mock_server/server.py index 5ab6a86f..a382263d 100644 --- a/tests/mock_server/server.py +++ b/tests/mock_server/server.py @@ -957,6 +957,52 @@ def health() -> dict[str, str]: """Health check endpoint.""" return {"status": "ok"} + @self.app.get("/data-platform/health") + def data_platform_health() -> dict[str, str]: + """Data platform health check endpoint.""" + return {"status": "ok"} + + @self.app.post("/data-platform/{org_key}/ligands_with_results/search") + async def search_ligands_with_results( + org_key: str, request: Request + ) -> dict[str, Any]: + """Search ligands joined with tool results.""" + body = await request.json() + # Return a mock response with empty data list + return { + "data": [], + "count": 0, + } + + @self.app.post("/data-platform/{org_key}/{entity}/search") + async def search_entity( + org_key: str, entity: str, request: Request + ) -> dict[str, Any]: + """Search an entity.""" + body = await request.json() + # Return a mock response with empty data list + return { + "data": [], + "count": 0, + } + + @self.app.get("/data-platform/{org_key}/meta/models") + def list_models(org_key: str) -> dict[str, Any]: + """List public models.""" + return { + 
"models": [ + {"tableName": "ligands", "visibility": "public"}, + {"tableName": "proteins", "visibility": "public"}, + {"tableName": "patents", "visibility": "public"}, + {"tableName": "projects", "visibility": "public"}, + {"tableName": "ui_settings", "visibility": "public"}, + {"tableName": "executions", "visibility": "public"}, + {"tableName": "execution_subjects", "visibility": "public"}, + {"tableName": "results", "visibility": "public"}, + {"tableName": "result_table_catalog", "visibility": "public"}, + ] + } + def start(self) -> tuple[str, int]: """Start the test server. diff --git a/tests/test_data.py b/tests/test_data.py new file mode 100644 index 00000000..ac60a819 --- /dev/null +++ b/tests/test_data.py @@ -0,0 +1,117 @@ +"""Tests for the Data Platform API wrapper.""" + +import pytest + +from deeporigin.platform.client import DeepOriginClient + + +def test_data_platform_health_lv1(): + """Test the data platform health endpoint.""" + client = DeepOriginClient() + response = client.data.health() + + assert isinstance(response, dict), "Expected a dictionary response" + assert "status" in response, "Expected 'status' key in response" + assert response["status"] == "ok", "Expected status to be 'ok'" + + +def test_search_ligands_with_results_lv1(): + """Test searching ligands with results.""" + client = DeepOriginClient() + response = client.data.search_ligands_with_results( + limit=10, + experiments=[{"toolId": "test-tool"}], + ) + + assert isinstance(response, dict), "Expected a dictionary response" + assert "data" in response, "Expected 'data' key in response" + assert isinstance(response["data"], list), "Expected 'data' to be a list" + + +def test_search_entity_lv1(): + """Test searching an entity.""" + client = DeepOriginClient() + response = client.data.search("ligands") + + assert isinstance(response, dict), "Expected a dictionary response" + assert "data" in response, "Expected 'data' key in response" + assert isinstance(response["data"], list), 
"Expected 'data' to be a list" + + +def test_search_entity_invalid_entity(): + """Test searching with an invalid entity raises ValueError.""" + client = DeepOriginClient() + with pytest.raises(ValueError, match="Invalid entity 'invalid_table'"): + client.data.search("invalid_table") + + +def test_search_ligands_lv1(): + """Test searching ligands using convenience method.""" + client = DeepOriginClient() + response = client.data.search_ligands() + + assert isinstance(response, dict), "Expected a dictionary response" + assert "data" in response, "Expected 'data' key in response" + assert isinstance(response["data"], list), "Expected 'data' to be a list" + + +def test_search_ligands_molecular_weight(): + """Test searching ligands with molecular weight filters.""" + client = DeepOriginClient() + response = client.data.search_ligands( + min_molecular_weight=250, max_molecular_weight=550 + ) + + assert isinstance(response, dict), "Expected a dictionary response" + assert "data" in response, "Expected 'data' key in response" + assert isinstance(response["data"], list), "Expected 'data' to be a list" + + +def test_search_proteins_lv1(): + """Test searching proteins using convenience method.""" + client = DeepOriginClient() + response = client.data.search_proteins() + + assert isinstance(response, dict), "Expected a dictionary response" + assert "data" in response, "Expected 'data' key in response" + assert isinstance(response["data"], list), "Expected 'data' to be a list" + + +def test_search_proteins_molecular_weight(): + """Test searching proteins with molecular weight filters.""" + client = DeepOriginClient() + response = client.data.search_proteins( + min_molecular_weight=250, max_molecular_weight=550 + ) + + assert isinstance(response, dict), "Expected a dictionary response" + assert "data" in response, "Expected 'data' key in response" + assert isinstance(response["data"], list), "Expected 'data' to be a list" + + +def test_search_proteins_sequence(): + """Test 
searching proteins with sequence filter.""" + client = DeepOriginClient() + response = client.data.search_proteins( + sequence="MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLSGAEKAVQVKVKALPDAQFEVVHSLAKWKRQTLGQHDFSAGEGLYTHMKALRPDEDRLSPLHSVYVDQWDWERVMGDGERQFSTLKSTVEAIWAGIKATEAAVSEEFGLAPFLPDQIHFVHSQELLSRYPDLDAKGRERAIAKDLGAVFLVGIGGKLSDGHRHDVRAPDYDDWSTPSELGHAGLNGDILVWNPVLEDAFELSSMGIRVDADTLKHQLALTGDEDRLELEWHQALLRGEMPQTIGGGIGQSRLTMLLLQLPHIGQVQAGVWPAAVRESVPSLL" + ) + + assert isinstance(response, dict), "Expected a dictionary response" + assert "data" in response, "Expected 'data' key in response" + assert isinstance(response["data"], list), "Expected 'data' to be a list" + + +def test_list_models_lv1(): + """Test listing models.""" + client = DeepOriginClient() + response = client.data.list_models() + + assert isinstance(response, dict), "Expected a dictionary response" + assert "models" in response, "Expected 'models' key in response" + assert isinstance(response["models"], list), "Expected 'models' to be a list" + assert len(response["models"]) > 0, "Expected at least one model" + # Verify structure of first model + model = response["models"][0] + assert "tableName" in model, "Expected 'tableName' key in model" + assert "visibility" in model, "Expected 'visibility' key in model" + assert model["visibility"] == "public", "Expected visibility to be 'public'" From 2deb71008da4ad359b8888a1c87e30860faaf933 Mon Sep 17 00:00:00 2001 From: Srinivas Gorur-Shandilya Date: Sun, 8 Feb 2026 17:04:33 -0500 Subject: [PATCH 02/12] fix: add some tests --- docs/platform/ref/data.md | 19 +++++++ src/platform/data.py | 101 +++++++++++++++++++++++++++++++++--- tests/mock_server/server.py | 45 +++++++++++++++- tests/test_data.py | 42 +++++++++++++++ 4 files changed, 198 insertions(+), 9 deletions(-) diff --git a/docs/platform/ref/data.md b/docs/platform/ref/data.md index 96b50bc7..90a610b0 100644 --- a/docs/platform/ref/data.md +++ b/docs/platform/ref/data.md @@ -29,6 +29,25 @@ results = 
client.data.search_ligands(limit=10) # Search proteins using convenience method results = client.data.search_proteins(limit=10) +# Create a new ligand +ligand = client.data.create_ligand( + project_id="\\x0011223344556677", + canonical_smiles="CCOc1ccc2nc(S(=O)(=O)N3CCN(CC3)C)c(N)c2c1", + inchi_key="BSYNRYMUTXBXSQ-UHFFFAOYSA-N", + inchi="InChI=1S/C20H24N4O4S/.../h1-4,6-9H,5,10-14H2,(H,22,23)", + smiles="CCOc1ccc2nc(S(=O)(=O)N3CCN(CC3)C)c(N)c2c1", + name="Compound-12345", + formal_charge=0, + hbond_donor_count=1, + hbond_acceptor_count=6, + rotatable_bond_count=5, + tpsa=85.12, + molecular_weight=447.5, +) + +# List projects +projects = client.data.list_projects() + # List public models models = client.data.list_models() ``` diff --git a/src/platform/data.py b/src/platform/data.py index baa86461..139a736e 100644 --- a/src/platform/data.py +++ b/src/platform/data.py @@ -2,7 +2,6 @@ from __future__ import annotations -from functools import lru_cache from typing import TYPE_CHECKING, Any if TYPE_CHECKING: @@ -22,6 +21,7 @@ def __init__(self, client: DeepOriginClient) -> None: client: The DeepOriginClient instance to use for API calls. """ self._c = client + self._models: dict | None = None def health(self) -> dict: """Check the health status of the data platform. @@ -31,21 +31,26 @@ def health(self) -> dict: """ return self._c.get_json("/data-platform/health") - @lru_cache(maxsize=1) # noqa: B019 def list_models(self) -> dict: """List public models. + The result is cached per instance. + Returns: Dictionary containing the list of models. 
""" - return self._c.get_json(f"/data-platform/{self._c.org_key}/meta/models") + if self._models is None: + self._models = self._c.get_json( + f"/data-platform/{self._c.org_key}/meta/models" + ) + return self._models def search_ligands_with_results( self, *, cursor: str | None = None, experiments: list[dict[str, str]] | None = None, - filter: dict[str, Any] | None = None, + filter_dict: dict[str, Any] | None = None, limit: int | None = None, offset: int | None = None, select: list[str] | None = None, @@ -57,7 +62,7 @@ def search_ligands_with_results( cursor: Cursor for pagination. experiments: List of experiment filters, each containing toolId and optionally toolVersion. - filter: Additional filter criteria as a dictionary. + filter_dict: Additional filter criteria as a dictionary. limit: Maximum number of results to return. Defaults to 100. offset: Number of results to skip. select: List of fields to select in the response. @@ -71,8 +76,8 @@ def search_ligands_with_results( body["cursor"] = cursor if experiments is not None: body["experiments"] = experiments - if filter is not None: - body["filter"] = filter + if filter_dict is not None: + body["filter"] = filter_dict if limit is not None: body["limit"] = limit if offset is not None: @@ -298,3 +303,85 @@ def search_proteins( select=select, sort=sort, ) + + def create_ligand( + self, + *, + project_id: str, + canonical_smiles: str, + inchi_key: str, + inchi: str, + smiles: str, + name: str, + formal_charge: int = 0, + hbond_donor_count: int | None = None, + hbond_acceptor_count: int | None = None, + rotatable_bond_count: int | None = None, + tpsa: float | None = None, + molecular_weight: float | None = None, + variant_name_tag: str = "", + ) -> dict: + """Create a new ligand. + + Args: + project_id: Project ID for the ligand. + canonical_smiles: Canonical SMILES string. + inchi_key: InChI key. + inchi: InChI string. + smiles: SMILES string. + name: Name of the ligand. + formal_charge: Formal charge. Defaults to 0. 
+ hbond_donor_count: Number of hydrogen bond donors. + hbond_acceptor_count: Number of hydrogen bond acceptors. + rotatable_bond_count: Number of rotatable bonds. + tpsa: Topological polar surface area. + molecular_weight: Molecular weight. + variant_name_tag: Variant name tag. Defaults to empty string. + + Returns: + Dictionary containing the created ligand data. + """ + # Build the set object with all ligand properties + set_dict: dict[str, Any] = { + "project_id": project_id, + "subtable_name": "ligands", + "canonical_smiles": canonical_smiles, + "inchi_key": inchi_key, + "inchi": inchi, + "smiles": smiles, + "name": name, + "formal_charge": formal_charge, + "variant_name_tag": variant_name_tag, + } + + # Add optional fields only if provided + if hbond_donor_count is not None: + set_dict["hbond_donor_count"] = hbond_donor_count + if hbond_acceptor_count is not None: + set_dict["hbond_acceptor_count"] = hbond_acceptor_count + if rotatable_bond_count is not None: + set_dict["rotatable_bond_count"] = rotatable_bond_count + if tpsa is not None: + set_dict["tpsa"] = tpsa + if molecular_weight is not None: + set_dict["molecular_weight"] = molecular_weight + + body: dict[str, Any] = { + "set": set_dict, + } + + return self._c.post_json( + f"/data-platform/{self._c.org_key}/ligands", + body=body, + ) + + def list_projects(self) -> dict: + """List projects. + + Returns: + Dictionary containing the list of projects. 
+ """ + return self._c.post_json( + f"/data-platform/{self._c.org_key}/projects/search", + body={}, + ) diff --git a/tests/mock_server/server.py b/tests/mock_server/server.py index a382263d..5b4f7f0c 100644 --- a/tests/mock_server/server.py +++ b/tests/mock_server/server.py @@ -967,7 +967,7 @@ async def search_ligands_with_results( org_key: str, request: Request ) -> dict[str, Any]: """Search ligands joined with tool results.""" - body = await request.json() + await request.json() # Consume request body # Return a mock response with empty data list return { "data": [], @@ -979,13 +979,54 @@ async def search_entity( org_key: str, entity: str, request: Request ) -> dict[str, Any]: """Search an entity.""" - body = await request.json() + await request.json() # Consume request body # Return a mock response with empty data list return { "data": [], "count": 0, } + @self.app.post("/data-platform/{org_key}/projects/search") + async def list_projects(org_key: str, request: Request) -> dict[str, Any]: + """List projects.""" + await request.json() # Consume request body + # Return a mock response with empty projects list + return { + "data": [], + "count": 0, + } + + @self.app.post("/data-platform/{org_key}/ligands") + async def create_ligand(org_key: str, request: Request) -> dict[str, Any]: + """Create a new ligand.""" + body = await request.json() + set_data = body.get("set", {}) + returning = body.get("returning", []) + + # Generate mock response with canonical_id and version + now = datetime.now(timezone.utc) + canonical_id = str(uuid.uuid4()) + response_data: dict[str, Any] = { + "canonical_id": canonical_id, + "version": 1, + "valid_from": now.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z", + "valid_to": None, + "modified_by": "test-user", + "deleted": False, + "structure_key": str(uuid.uuid4()), + } + + # Include all fields from set_data + response_data.update(set_data) + + # Filter to only return requested fields if specified + if returning: + response_data = { + k: v 
for k, v in response_data.items() if k in returning + } + + return response_data + @self.app.get("/data-platform/{org_key}/meta/models") def list_models(org_key: str) -> dict[str, Any]: """List public models.""" diff --git a/tests/test_data.py b/tests/test_data.py index ac60a819..bbcc95ee 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -115,3 +115,45 @@ def test_list_models_lv1(): assert "tableName" in model, "Expected 'tableName' key in model" assert "visibility" in model, "Expected 'visibility' key in model" assert model["visibility"] == "public", "Expected visibility to be 'public'" + + +def test_create_ligand_lv1(): + """Test creating a ligand.""" + client = DeepOriginClient() + response = client.data.create_ligand( + project_id="\\x0011223344556677", + canonical_smiles="CCOc1ccc2nc(S(=O)(=O)N3CCN(CC3)C)c(N)c2c1", + inchi_key="BSYNRYMUTXBXSQ-UHFFFAOYSA-N", + inchi="InChI=1S/C20H24N4O4S/.../h1-4,6-9H,5,10-14H2,(H,22,23)", + smiles="CCOc1ccc2nc(S(=O)(=O)N3CCN(CC3)C)c(N)c2c1", + name="Compound-12345", + formal_charge=0, + hbond_donor_count=1, + hbond_acceptor_count=6, + rotatable_bond_count=5, + tpsa=85.12, + molecular_weight=447.5, + variant_name_tag="", + ) + + assert isinstance(response, dict), "Expected a dictionary response" + assert "canonical_id" in response, "Expected 'canonical_id' key in response" + assert "version" in response, "Expected 'version' key in response" + assert response["version"] == 1, "Expected version to be 1" + assert "name" in response, "Expected 'name' key in response" + assert response["name"] == "Compound-12345", "Expected name to match" + assert "canonical_smiles" in response, "Expected 'canonical_smiles' key in response" + assert ( + response["canonical_smiles"] == "CCOc1ccc2nc(S(=O)(=O)N3CCN(CC3)C)c(N)c2c1" + ), "Expected canonical_smiles to match" + + +def test_list_projects_lv1(): + """Test listing projects.""" + client = DeepOriginClient() + response = client.data.list_projects() + + assert isinstance(response, 
dict), "Expected a dictionary response" + assert "data" in response, "Expected 'data' key in response" + assert isinstance(response["data"], list), "Expected 'data' to be a list" + assert "count" in response, "Expected 'count' key in response" From d56a62016158cadd25dff64dba810787e92e4799 Mon Sep 17 00:00:00 2001 From: Srinivas Gorur-Shandilya Date: Wed, 18 Feb 2026 19:59:02 -0500 Subject: [PATCH 03/12] feat: support for making a protein --- .../clean/docking-single-ligand.ipynb | 20 +++++ src/drug_discovery/structures/entity.py | 4 +- src/drug_discovery/structures/protein.py | 42 +++++++++ src/platform/data.py | 87 +++++++++++++++++++ tests/mock_server/server.py | 58 +++++++++++++ tests/test_data.py | 23 +++++ 6 files changed, 233 insertions(+), 1 deletion(-) diff --git a/docs/notebooks/clean/docking-single-ligand.ipynb b/docs/notebooks/clean/docking-single-ligand.ipynb index 1ed9748b..babb5c8a 100644 --- a/docs/notebooks/clean/docking-single-ligand.ipynb +++ b/docs/notebooks/clean/docking-single-ligand.ipynb @@ -94,6 +94,16 @@ "sim" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "2d8bfa9a-c142-4deb-ac3d-d2a4f581df55", + "metadata": {}, + "outputs": [], + "source": [ + "protein._remote_path" + ] + }, { "cell_type": "markdown", "id": "e2aa58aa", @@ -115,6 +125,16 @@ "ligands" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "95f716a7-4a4c-4c23-9d61-130fc3e9a72f", + "metadata": {}, + "outputs": [], + "source": [ + "ligands.to_smiles()" + ] + }, { "cell_type": "markdown", "id": "94e19bec-6bee-4dbf-9c47-1490c41fdbd0", diff --git a/src/drug_discovery/structures/entity.py b/src/drug_discovery/structures/entity.py index f1378dfd..56c416ea 100644 --- a/src/drug_discovery/structures/entity.py +++ b/src/drug_discovery/structures/entity.py @@ -5,7 +5,7 @@ """ from abc import ABC, abstractmethod -from dataclasses import dataclass +from dataclasses import dataclass, field from pathlib import Path from typing import Optional @@ -20,6 +20,8 
@@ class Entity(ABC): This class manages the remote path and provides an upload method to ensure that the entity's file is uploaded to the remote storage if it does not already exist there. It uses the DeepOrigin FilesClient for remote file operations. """ + id: str | None = field(default=None, kw_only=True) + @abstractmethod def to_hash(self) -> str: """computes a hash of the entity""" diff --git a/src/drug_discovery/structures/protein.py b/src/drug_discovery/structures/protein.py index 56898000..227a945c 100644 --- a/src/drug_discovery/structures/protein.py +++ b/src/drug_discovery/structures/protein.py @@ -1308,11 +1308,53 @@ def __str__(self): info_str += f"Info: {self.info}\n" return f"Protein:\n {info_str}" + @beartype + def sync(self, client: Optional[DeepOriginClient] = None) -> dict: + """Sync the protein to the data platform. + + This method uploads the protein file to remote storage and creates a protein + record in the data platform. + + Args: + client: DeepOriginClient instance. If None, uses DeepOriginClient.get(). + + Returns: + Dictionary containing the created protein data from the data platform. 
+ """ + if client is None: + client = DeepOriginClient.get() + + # Upload the protein file first + self.upload(client=client) + + # Use the remote path as the file_path + file_path = self._remote_path + + # Prepare parameters for create_protein + kwargs: dict[str, Any] = { + "file_path": file_path, + } + + # Pass pdb_id if available + if self.pdb_id is not None: + kwargs["pdb_id"] = self.pdb_id + + kwargs["protein_length"] = self.length + kwargs["protein_name"] = self.name + + # Call create_protein through the client + return client.data.create_protein(**kwargs) + def update_coordinates(self, coords: np.ndarray): """update coordinates of the protein structure""" self.structure.coord = coords + @property + def length(self) -> int: + """get the length of the protein structure""" + return sum([len(seq) for seq in self.sequence]) + def validate_pdb_file(file_path: str | Path) -> None: """validate a PDB file by checking if it can be parsed by RDKit diff --git a/src/platform/data.py b/src/platform/data.py index 139a736e..d9a9574e 100644 --- a/src/platform/data.py +++ b/src/platform/data.py @@ -230,6 +230,7 @@ def search_proteins( *, cursor: str | None = None, pdb_id: str | None = None, + file_path: str | None = None, min_molecular_weight: float | int | None = None, max_molecular_weight: float | int | None = None, sequence: str | None = None, @@ -245,6 +246,7 @@ def search_proteins( Args: cursor: Cursor for pagination. pdb_id: Filter by PDB ID. + file_path: Filter by file path. min_molecular_weight: Minimum molecular weight filter (inclusive). max_molecular_weight: Maximum molecular weight filter (inclusive). sequence: Filter by FASTA sequence (exact match). 
@@ -263,6 +265,8 @@ def search_proteins( filter_dict = {"deleted": False} if pdb_id is not None: filter_dict["pdb_id"] = pdb_id + if file_path is not None: + filter_dict["file_path"] = file_path # Build molecular weight filters props = [] @@ -375,6 +379,89 @@ def create_ligand( body=body, ) + def create_protein( + self, + *, + file_path: str, + gene_symbol: str | None = None, + pdb_id: str | None = None, + fasta_sequence: str | None = None, + protein_name: str | None = None, + protein_length: int | None = None, + project_id: str | None = None, + ) -> dict: + """Create a new protein. + + Args: + file_path: Path to the protein file (required). + gene_symbol: Gene symbol. + pdb_id: PDB ID. + fasta_sequence: FASTA sequence. + protein_name: Protein name. + protein_length: Protein length. + project_id: Project ID for the protein. + + Returns: + Dictionary containing the created protein data. + """ + # Build the set object with all protein properties + set_dict: dict[str, Any] = { + "file_path": file_path, + } + + # Add optional fields only if provided + if project_id is not None: + set_dict["project_id"] = project_id + if gene_symbol is not None: + set_dict["gene_symbol"] = gene_symbol + if pdb_id is not None: + set_dict["pdb_id"] = pdb_id + if fasta_sequence is not None: + set_dict["fasta_sequence"] = fasta_sequence + if protein_name is not None: + set_dict["protein_name"] = protein_name + if protein_length is not None: + set_dict["protein_length"] = protein_length + + body: dict[str, Any] = { + "set": set_dict, + "returning": [ + "canonical_id", + "version", + "valid_from", + "valid_to", + "modified_by", + "deleted", + "project_id", + "subtable_name", + "uniprot_accession", + "file_path", + "gene_symbol", + "pdb_id", + "refseq_protein_id", + "ensembl_protein_id", + "alpha_fold_id", + "fasta_sequence", + "protein_name", + "kegg_gene_id", + "chembl_target_id", + "binding_db_target_id", + "drugbank_target_id", + "pfam_id", + "interpro_id", + "ec_number", + 
"ncbi_taxonomy_id", + "protein_family", + "ligandability_score", + "protein_length", + ], + } + + return self._c.post_json( + f"/data-platform/{self._c.org_key}/proteins", + body=body, + ) + def list_projects(self) -> dict: """List projects. diff --git a/tests/mock_server/server.py b/tests/mock_server/server.py index 5b4f7f0c..deeafb36 100644 --- a/tests/mock_server/server.py +++ b/tests/mock_server/server.py @@ -1027,6 +1027,64 @@ async def create_ligand(org_key: str, request: Request) -> dict[str, Any]: return response_data + @self.app.post("/data-platform/{org_key}/proteins") + async def create_protein(org_key: str, request: Request) -> dict[str, Any]: + """Create a new protein.""" + body = await request.json() + set_data = body.get("set", {}) + returning = body.get("returning", []) + + # Generate mock response matching the API format + now = datetime.now(timezone.utc) + protein_id = "08AD337N5YV4Y" # Use a consistent ID for testing + modified_by = "6b96d8f8-0f55-474c-a86c-e09651ba4b20" + + # Build response data with all fields from the example + response_data: dict[str, Any] = { + "id": protein_id, + "version": 1, + "valid_from": now.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z", + "valid_to": None, + "modified_by": modified_by, + "deleted": False, + "project_id": None, + "subtable_name": "proteins", + "uniprot_accession": None, + "file_path": set_data.get("file_path", ""), + "gene_symbol": None, + "pdb_id": None, + "refseq_protein_id": None, + "ensembl_protein_id": None, + "alpha_fold_id": None, + "fasta_sequence": None, + "protein_name": None, + "kegg_gene_id": None, + "chembl_target_id": None, + "binding_db_target_id": None, + "drugbank_target_id": None, + "pfam_id": None, + "interpro_id": None, + "ec_number": None, + "ncbi_taxonomy_id": None, + "protein_family": None, + "ligandability_score": None, + "protein_length": None, + } + + # Override with any fields provided in set_data + response_data.update(set_data) + + # Filter to only return requested fields if 
specified + if returning: + response_data = { + k: v for k, v in response_data.items() if k in returning + } + + return { + "data": response_data, + "meta": {"inserted": 1}, + } + @self.app.get("/data-platform/{org_key}/meta/models") def list_models(org_key: str) -> dict[str, Any]: """List public models.""" diff --git a/tests/test_data.py b/tests/test_data.py index bbcc95ee..e4417d89 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -148,6 +148,29 @@ def test_create_ligand_lv1(): ), "Expected canonical_smiles to match" +def test_create_protein_lv1(): + """Test creating a protein.""" + client = DeepOriginClient() + response = client.data.create_protein( + file_path="entities/proteins/db4aa32e2e8ffa976a60004a8361b86427a2e5653a6623bb60b7913445902549.pdb", + ) + + assert isinstance(response, dict), "Expected a dictionary response" + assert "data" in response, "Expected 'data' key in response" + assert isinstance(response["data"], dict), "Expected 'data' to be a dictionary" + assert "id" in response["data"], "Expected 'id' key in response data" + assert "version" in response["data"], "Expected 'version' key in response data" + assert response["data"]["version"] == 1, "Expected version to be 1" + assert "file_path" in response["data"], "Expected 'file_path' key in response data" + assert ( + response["data"]["file_path"] + == "entities/proteins/db4aa32e2e8ffa976a60004a8361b86427a2e5653a6623bb60b7913445902549.pdb" + ), "Expected file_path to match" + assert "meta" in response, "Expected 'meta' key in response" + assert "inserted" in response["meta"], "Expected 'inserted' key in meta" + assert response["meta"]["inserted"] == 1, "Expected inserted to be 1" + + def test_list_projects_lv1(): """Test listing projects.""" client = DeepOriginClient() From 6f2b1dbdb6683a44dcf8f75aab0d0770af63bcb1 Mon Sep 17 00:00:00 2001 From: Srinivas Gorur-Shandilya Date: Wed, 18 Feb 2026 20:03:58 -0500 Subject: [PATCH 04/12] feat: ability to sync a protein --- 
src/drug_discovery/structures/protein.py | 26 ++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/src/drug_discovery/structures/protein.py b/src/drug_discovery/structures/protein.py index 227a945c..a6d2c3c7 100644 --- a/src/drug_discovery/structures/protein.py +++ b/src/drug_discovery/structures/protein.py @@ -1309,17 +1309,18 @@ def __str__(self): return f"Protein:\n {info_str}" @beartype - def sync(self, client: Optional[DeepOriginClient] = None) -> dict: + def sync(self, client: Optional[DeepOriginClient] = None) -> None: """Sync the protein to the data platform. This method uploads the protein file to remote storage and creates a protein - record in the data platform. + record in the data platform. If a protein with the same file_path already exists, + its ID is stored in self.id and no new record is created. Args: client: DeepOriginClient instance. If None, uses DeepOriginClient.get(). Returns: - Dictionary containing the created protein data from the data platform. + None. The ID of the created or existing protein, when present in the response, is stored in self.id. 
""" if client is None: client = DeepOriginClient.get() @@ -1330,6 +1331,19 @@ def sync(self, client: Optional[DeepOriginClient] = None) -> dict: # Use the remote path as the file_path file_path = self._remote_path + # Search for existing proteins with the same file_path + response = client.data.search_proteins(file_path=file_path) + data = response["data"] + + # If a protein with this file_path already exists, return the first one + if data: + existing_protein = data[0] + # Update self.id with the existing protein's ID + if "id" in existing_protein: + self.id = existing_protein["id"] + return + + # No existing protein found, create a new one # Prepare parameters for create_protein kwargs: dict[str, Any] = { "file_path": file_path, @@ -1343,7 +1357,11 @@ def sync(self, client: Optional[DeepOriginClient] = None) -> dict: kwargs["protein_name"] = self.name # Call create_protein through the client - return client.data.create_protein(**kwargs) + result = client.data.create_protein(**kwargs) + + # Update self.id with the newly created protein's ID + if "data" in result and "id" in result["data"]: + self.id = result["data"]["id"] def update_coordinates(self, coords: np.ndarray): """update coordinates of the protein structure""" From 2a82f13e5ad1b8d67bc9da3d0d48edf54a04f5b5 Mon Sep 17 00:00:00 2001 From: Srinivas Gorur-Shandilya Date: Thu, 19 Feb 2026 13:52:53 -0500 Subject: [PATCH 05/12] feat: ability to register a ligand --- .vscode/settings.json | 6 + src/drug_discovery/structures/ligand.py | 175 ++++++++++++++++++++++++ src/platform/data.py | 60 ++++++-- 3 files changed, 229 insertions(+), 12 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 726711f3..5ffc2ff5 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -39,6 +39,8 @@ "emeq", "finalizer", "FMCS", + "hbond", + "herg", "HETATM", "inchi", "isin", @@ -49,6 +51,8 @@ "Konnektor", "kwargs", "ligandset", + "logd", + "logp", "marimo", "mbar", "molblock", @@ -78,7 +82,9 @@ 
"SASA", "softcore", "Substruct", + "subtable", "textea", + "tpsa", "venv" ] } \ No newline at end of file diff --git a/src/drug_discovery/structures/ligand.py b/src/drug_discovery/structures/ligand.py index 121b1394..aae4d273 100644 --- a/src/drug_discovery/structures/ligand.py +++ b/src/drug_discovery/structures/ligand.py @@ -692,6 +692,115 @@ def coordinates(self): def atom_types(self): return self.get_species() + @property + def formal_charge(self) -> int: + """Compute the formal charge of the ligand molecule. + + Returns: + int: The sum of formal charges of all atoms in the molecule. + """ + if self.mol is None: + raise DeepOriginException("Cannot compute formal charge: molecule is None") + return sum(atom.GetFormalCharge() for atom in self.mol.GetAtoms()) + + @property + def molecular_weight(self) -> float: + """Compute the exact molecular weight of the ligand molecule. + + Returns: + float: The exact molecular weight in atomic mass units. + """ + if self.mol is None: + raise DeepOriginException( + "Cannot compute molecular weight: molecule is None" + ) + return rdMolDescriptors.CalcExactMolWt(self.mol) + + @property + def hbond_donor_count(self) -> int: + """Compute the number of hydrogen bond donors in the ligand molecule. + + Returns: + int: The number of hydrogen bond donors. + """ + if self.mol is None: + raise DeepOriginException( + "Cannot compute H-bond donor count: molecule is None" + ) + return rdMolDescriptors.CalcNumHBD(self.mol) + + @property + def hbond_acceptor_count(self) -> int: + """Compute the number of hydrogen bond acceptors in the ligand molecule. + + Returns: + int: The number of hydrogen bond acceptors. + """ + if self.mol is None: + raise DeepOriginException( + "Cannot compute H-bond acceptor count: molecule is None" + ) + return rdMolDescriptors.CalcNumHBA(self.mol) + + @property + def rotatable_bond_count(self) -> int: + """Compute the number of rotatable bonds in the ligand molecule. 
+ + Returns: + int: The number of rotatable bonds. + """ + if self.mol is None: + raise DeepOriginException( + "Cannot compute rotatable bond count: molecule is None" + ) + return rdMolDescriptors.CalcNumRotatableBonds(self.mol) + + @property + def tpsa(self) -> float: + """Compute the Topological Polar Surface Area (TPSA) of the ligand molecule. + + Returns: + float: The TPSA value in square Angstroms. + """ + if self.mol is None: + raise DeepOriginException("Cannot compute TPSA: molecule is None") + return rdMolDescriptors.CalcTPSA(self.mol) + + @property + def canonical_smiles(self) -> str: + """ + Canonical (RDKit) SMILES for this ligand. + + Notes: + - Canonicalization is RDKit-specific. + - Returns implicit-H SMILES by default (explicit Hs removed). + - Preserves stereochemistry if present. + """ + mol = None + + if self.mol is not None: + mol = self.mol + elif self.smiles is not None: + mol = Chem.MolFromSmiles(self.smiles) + if mol is None: + raise DeepOriginException(f"Invalid SMILES: {self.smiles!r}") + else: + raise DeepOriginException( + "Cannot compute canonical SMILES: missing mol and smiles" + ) + + # Remove explicit Hs so we don't emit `[H]...` everywhere + mol = Chem.RemoveHs(mol) + + # If your mol may be unsanitized, you can ensure sanitization: + # Chem.SanitizeMol(mol) + + return Chem.MolToSmiles( + mol, + canonical=True, + isomericSmiles=True, # keep stereochem + ) + def set_property(self, prop_name: str, prop_value): """ Set a property for the ligand molecule. @@ -881,6 +990,72 @@ def to_hash(self) -> str: return hash_hex + @beartype + def sync(self, client: Optional[DeepOriginClient] = None) -> None: + """Sync the ligand to the data platform. + + This method uploads the ligand file to remote storage (if available) and creates a ligand + record in the data platform. If a ligand with the same canonical_smiles already exists, + its ID is stored in self.id and no new record is created. + + Args: + client: DeepOriginClient instance. 
If None, uses DeepOriginClient.get(). + + Note: + If the ligand was created from a SMILES string without an SDF file, only the SMILES + will be used for syncing (no file upload will occur). + """ + if client is None: + client = DeepOriginClient.get() + + # If ligand has a file_path, upload it to remote storage + # (Note: ligands in the data platform are identified by canonical_smiles, not file_path) + if self.file_path is not None: + # Upload the ligand file first + self.upload(client=client) + + # Search for existing ligands by canonical_smiles + response = client.data.search_ligands(canonical_smiles=self.canonical_smiles) + data = response["data"] + + # If a ligand with this canonical_smiles already exists, update self.id and return + if data: + existing_ligand = data[0] + if "id" in existing_ligand: + self.id = existing_ligand["id"] + return + + # No existing ligand found, create a new one + # Prepare parameters for create_ligand + # Note: canonical_smiles is read-only and computed by the platform + kwargs: dict[str, Any] = { + "smiles": self.smiles if self.smiles is not None else self.canonical_smiles, + } + + # Add optional fields if available + if self.name is not None: + kwargs["name"] = self.name + + # Add computed molecular properties if mol is available + if self.mol is not None: + try: + kwargs["formal_charge"] = self.formal_charge + kwargs["molecular_weight"] = self.molecular_weight + kwargs["hbond_donor_count"] = self.hbond_donor_count + kwargs["hbond_acceptor_count"] = self.hbond_acceptor_count + kwargs["rotatable_bond_count"] = self.rotatable_bond_count + kwargs["tpsa"] = self.tpsa + except Exception: + # If property computation fails, continue without those properties + pass + + # Call create_ligand through the client + result = client.data.create_ligand(**kwargs) + + # Update self.id with the newly created ligand's ID + if "data" in result and "id" in result["data"]: + self.id = result["data"]["id"] + @beartype def to_pdb(self, output_path: 
Optional[str] = None) -> str | Path: """Write the ligand to a PDB file.""" diff --git a/src/platform/data.py b/src/platform/data.py index d9a9574e..5cbbd3f2 100644 --- a/src/platform/data.py +++ b/src/platform/data.py @@ -160,6 +160,8 @@ def search_ligands( *, cursor: str | None = None, filter: dict[str, Any] | None = None, + smiles: str | None = None, + canonical_smiles: str | None = None, min_molecular_weight: float | int | None = None, max_molecular_weight: float | int | None = None, limit: int | None = None, @@ -174,6 +176,8 @@ def search_ligands( Args: cursor: Cursor for pagination. filter: Additional filter criteria as a dictionary. + smiles: Filter by SMILES string. + canonical_smiles: Filter by canonical SMILES string. min_molecular_weight: Minimum molecular weight filter (inclusive). max_molecular_weight: Maximum molecular weight filter (inclusive). limit: Maximum number of results to return. Defaults to 100. @@ -191,6 +195,14 @@ def search_ligands( filter_dict = filter.copy() if filter is not None else {} filter_dict.setdefault("deleted", False) + # Add smiles filter if provided + if smiles is not None: + filter_dict["smiles"] = smiles + + # Add canonical_smiles filter if provided + if canonical_smiles is not None: + filter_dict["canonical_smiles"] = canonical_smiles + # Build molecular weight filters props = [] if min_molecular_weight is not None: @@ -311,12 +323,11 @@ def search_proteins( def create_ligand( self, *, - project_id: str, - canonical_smiles: str, - inchi_key: str, - inchi: str, smiles: str, - name: str, + project_id: str | None = None, + inchi_key: str | None = None, + inchi: str | None = None, + name: str | None = None, formal_charge: int = 0, hbond_donor_count: int | None = None, hbond_acceptor_count: int | None = None, @@ -328,11 +339,10 @@ def create_ligand( """Create a new ligand. Args: + smiles: SMILES string (required). project_id: Project ID for the ligand. - canonical_smiles: Canonical SMILES string. inchi_key: InChI key. 
inchi: InChI string. - smiles: SMILES string. name: Name of the ligand. formal_charge: Formal charge. Defaults to 0. hbond_donor_count: Number of hydrogen bond donors. @@ -347,18 +357,21 @@ def create_ligand( """ # Build the set object with all ligand properties set_dict: dict[str, Any] = { - "project_id": project_id, "subtable_name": "ligands", - "canonical_smiles": canonical_smiles, - "inchi_key": inchi_key, - "inchi": inchi, "smiles": smiles, - "name": name, "formal_charge": formal_charge, "variant_name_tag": variant_name_tag, } # Add optional fields only if provided + if project_id is not None: + set_dict["project_id"] = project_id + if inchi_key is not None: + set_dict["inchi_key"] = inchi_key + if inchi is not None: + set_dict["inchi"] = inchi + if name is not None: + set_dict["name"] = name if hbond_donor_count is not None: set_dict["hbond_donor_count"] = hbond_donor_count if hbond_acceptor_count is not None: @@ -372,6 +385,29 @@ def create_ligand( body: dict[str, Any] = { "set": set_dict, + "returning": [ + "canonical_id", + "version", + "valid_from", + "valid_to", + "modified_by", + "deleted", + "project_id", + "subtable_name", + "canonical_smiles", + "smiles", + "inchi_key", + "inchi", + "name", + "formal_charge", + "hbond_donor_count", + "hbond_acceptor_count", + "rotatable_bond_count", + "tpsa", + "molecular_weight", + "log_p", + "structure_key", + ], } return self._c.post_json( From 23516bd1903759ab82c77bc7050e9541dd8bbb0e Mon Sep 17 00:00:00 2001 From: Srinivas Gorur-Shandilya Date: Thu, 19 Feb 2026 14:02:35 -0500 Subject: [PATCH 06/12] feat: ability to create ligands and proteins --- .vscode/settings.json | 6 ++++++ src/drug_discovery/structures/ligand.py | 4 ++-- src/platform/data.py | 12 ++--------- tests/mock_server/server.py | 22 ++++++++++++------- tests/test_data.py | 28 ++++++++++++------------- 5 files changed, 38 insertions(+), 34 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 5ffc2ff5..272888ac 100644 --- 
a/.vscode/settings.json +++ b/.vscode/settings.json @@ -37,12 +37,15 @@ "drugability", "dtos", "emeq", + "ensembl", + "fasta", "finalizer", "FMCS", "hbond", "herg", "HETATM", "inchi", + "interpro", "isin", "isoparse", "kabsch", @@ -50,6 +53,7 @@ "Kekulize", "Konnektor", "kwargs", + "ligandability", "ligandset", "logd", "logp", @@ -74,6 +78,7 @@ "rcsbapi", "rdchem", "rdkit", + "refseq", "replex", "resnames", "retryable", @@ -85,6 +90,7 @@ "subtable", "textea", "tpsa", + "uniprot", "venv" ] } \ No newline at end of file diff --git a/src/drug_discovery/structures/ligand.py b/src/drug_discovery/structures/ligand.py index aae4d273..b5c16f12 100644 --- a/src/drug_discovery/structures/ligand.py +++ b/src/drug_discovery/structures/ligand.py @@ -792,8 +792,8 @@ def canonical_smiles(self) -> str: # Remove explicit Hs so we don't emit `[H]...` everywhere mol = Chem.RemoveHs(mol) - # If your mol may be unsanitized, you can ensure sanitization: - # Chem.SanitizeMol(mol) + # ensure sanitization: + Chem.SanitizeMol(mol) return Chem.MolToSmiles( mol, diff --git a/src/platform/data.py b/src/platform/data.py index 5cbbd3f2..17d56bd2 100644 --- a/src/platform/data.py +++ b/src/platform/data.py @@ -325,8 +325,6 @@ def create_ligand( *, smiles: str, project_id: str | None = None, - inchi_key: str | None = None, - inchi: str | None = None, name: str | None = None, formal_charge: int = 0, hbond_donor_count: int | None = None, @@ -341,8 +339,6 @@ def create_ligand( Args: smiles: SMILES string (required). project_id: Project ID for the ligand. - inchi_key: InChI key. - inchi: InChI string. name: Name of the ligand. formal_charge: Formal charge. Defaults to 0. hbond_donor_count: Number of hydrogen bond donors. 
@@ -366,10 +362,6 @@ def create_ligand( # Add optional fields only if provided if project_id is not None: set_dict["project_id"] = project_id - if inchi_key is not None: - set_dict["inchi_key"] = inchi_key - if inchi is not None: - set_dict["inchi"] = inchi if name is not None: set_dict["name"] = name if hbond_donor_count is not None: @@ -386,7 +378,7 @@ def create_ligand( body: dict[str, Any] = { "set": set_dict, "returning": [ - "canonical_id", + "id", "version", "valid_from", "valid_to", @@ -462,7 +454,7 @@ def create_protein( body: dict[str, Any] = { "set": set_dict, "returning": [ - "canonical_id", + "id", "version", "valid_from", "valid_to", diff --git a/tests/mock_server/server.py b/tests/mock_server/server.py index deeafb36..92fea86c 100644 --- a/tests/mock_server/server.py +++ b/tests/mock_server/server.py @@ -1003,17 +1003,25 @@ async def create_ligand(org_key: str, request: Request) -> dict[str, Any]: set_data = body.get("set", {}) returning = body.get("returning", []) - # Generate mock response with canonical_id and version + # Generate mock response matching the real API format now = datetime.now(timezone.utc) - canonical_id = str(uuid.uuid4()) + ligand_id = "08" + str(uuid.uuid4()).replace("-", "").upper()[:11] + smiles = set_data.get("smiles", "") response_data: dict[str, Any] = { - "canonical_id": canonical_id, + "id": ligand_id, "version": 1, "valid_from": now.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z", "valid_to": None, "modified_by": "test-user", "deleted": False, - "structure_key": str(uuid.uuid4()), + "project_id": None, + "subtable_name": "ligands", + "canonical_smiles": smiles, + "smiles": smiles, + "inchi_key": None, + "inchi": None, + "log_p": None, + "structure_key": None, } # Include all fields from set_data @@ -1025,7 +1033,7 @@ async def create_ligand(org_key: str, request: Request) -> dict[str, Any]: k: v for k, v in response_data.items() if k in returning } - return response_data + return {"data": response_data, "meta": {"inserted": 
1}} @self.app.post("/data-platform/{org_key}/proteins") async def create_protein(org_key: str, request: Request) -> dict[str, Any]: @@ -1034,12 +1042,12 @@ async def create_protein(org_key: str, request: Request) -> dict[str, Any]: set_data = body.get("set", {}) returning = body.get("returning", []) - # Generate mock response matching the API format + # Generate mock response matching the real API format now = datetime.now(timezone.utc) protein_id = "08AD337N5YV4Y" # Use a consistent ID for testing modified_by = "6b96d8f8-0f55-474c-a86c-e09651ba4b20" - # Build response data with all fields from the example + # Build response data with all fields matching the real API response_data: dict[str, Any] = { "id": protein_id, "version": 1, diff --git a/tests/test_data.py b/tests/test_data.py index e4417d89..402e2d91 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -120,12 +120,9 @@ def test_list_models_lv1(): def test_create_ligand_lv1(): """Test creating a ligand.""" client = DeepOriginClient() + smiles = "Fc1c(-c2cccc3ccccc23)ncc2c(N3C[C@H]4CC[C@@H](C3)N4)nc(OCC34CCCN3CCC4)nc12" response = client.data.create_ligand( - project_id="\\x0011223344556677", - canonical_smiles="CCOc1ccc2nc(S(=O)(=O)N3CCN(CC3)C)c(N)c2c1", - inchi_key="BSYNRYMUTXBXSQ-UHFFFAOYSA-N", - inchi="InChI=1S/C20H24N4O4S/.../h1-4,6-9H,5,10-14H2,(H,22,23)", - smiles="CCOc1ccc2nc(S(=O)(=O)N3CCN(CC3)C)c(N)c2c1", + smiles=smiles, name="Compound-12345", formal_charge=0, hbond_donor_count=1, @@ -137,15 +134,17 @@ def test_create_ligand_lv1(): ) assert isinstance(response, dict), "Expected a dictionary response" - assert "canonical_id" in response, "Expected 'canonical_id' key in response" - assert "version" in response, "Expected 'version' key in response" - assert response["version"] == 1, "Expected version to be 1" - assert "name" in response, "Expected 'name' key in response" - assert response["name"] == "Compound-12345", "Expected name to match" - assert "canonical_smiles" in response, "Expected 
'canonical_smiles' key in response" - assert ( - response["canonical_smiles"] == "CCOc1ccc2nc(S(=O)(=O)N3CCN(CC3)C)c(N)c2c1" - ), "Expected canonical_smiles to match" + assert "data" in response, "Expected 'data' key in response" + data = response["data"] + assert isinstance(data, dict), "Expected 'data' to be a dictionary" + assert "id" in data, "Expected 'id' key in data" + assert "version" in data, "Expected 'version' key in data" + assert data["version"] == 1, "Expected version to be 1" + assert "name" in data, "Expected 'name' key in data" + assert data["name"] == "Compound-12345", "Expected name to match" + assert "canonical_smiles" in data, "Expected 'canonical_smiles' key in data" + assert "meta" in response, "Expected 'meta' key in response" + assert response["meta"]["inserted"] == 1, "Expected inserted to be 1" def test_create_protein_lv1(): @@ -179,4 +178,3 @@ def test_list_projects_lv1(): assert isinstance(response, dict), "Expected a dictionary response" assert "data" in response, "Expected 'data' key in response" assert isinstance(response["data"], list), "Expected 'data' to be a list" - assert "count" in response, "Expected 'count' key in response" From 18938f83fd21ad42b395b827e1588cd594565776 Mon Sep 17 00:00:00 2001 From: Srinivas Gorur-Shandilya Date: Thu, 19 Feb 2026 14:35:47 -0500 Subject: [PATCH 07/12] feat: ability to create proteins from IDs --- src/drug_discovery/structures/ligand.py | 7 +++ src/drug_discovery/structures/protein.py | 50 ++++++++++++++++ src/platform/data.py | 60 +++++++++++++++++-- tests/fixtures/ligand_08B05B1GDYWJR.json | 70 +++++++++++++++++++++++ tests/fixtures/protein_08AD337N5YV4Y.json | 46 +++++++++++++++ tests/mock_server/server.py | 26 +++++++++ tests/test_data.py | 34 +++++++++++ tests/test_protein.py | 13 +++++ 8 files changed, 300 insertions(+), 6 deletions(-) create mode 100644 tests/fixtures/ligand_08B05B1GDYWJR.json create mode 100644 tests/fixtures/protein_08AD337N5YV4Y.json diff --git 
a/src/drug_discovery/structures/ligand.py b/src/drug_discovery/structures/ligand.py index b5c16f12..59be5200 100644 --- a/src/drug_discovery/structures/ligand.py +++ b/src/drug_discovery/structures/ligand.py @@ -1010,9 +1010,12 @@ def sync(self, client: Optional[DeepOriginClient] = None) -> None: # If ligand has a file_path, upload it to remote storage # (Note: ligands in the data platform are identified by canonical_smiles, not file_path) + mol_file: str | None = None if self.file_path is not None: # Upload the ligand file first self.upload(client=client) + # Use the remote path as the mol_file + mol_file = self._remote_path # Search for existing ligands by canonical_smiles response = client.data.search_ligands(canonical_smiles=self.canonical_smiles) @@ -1032,6 +1035,10 @@ def sync(self, client: Optional[DeepOriginClient] = None) -> None: "smiles": self.smiles if self.smiles is not None else self.canonical_smiles, } + # Add mol_file if available + if mol_file is not None: + kwargs["mol_file"] = mol_file + # Add optional fields if available if self.name is not None: kwargs["name"] = self.name diff --git a/src/drug_discovery/structures/protein.py b/src/drug_discovery/structures/protein.py index a6d2c3c7..5844c2ff 100644 --- a/src/drug_discovery/structures/protein.py +++ b/src/drug_discovery/structures/protein.py @@ -67,6 +67,56 @@ def from_name(cls, name: str) -> Self: return cls.from_pdb_id(pdb_id) + @classmethod + def from_id(cls, id: str, *, client: Optional[DeepOriginClient] = None) -> Self: + """ + Create a Protein instance from a Deep Origin Data Platform ID. + + Args: + id: The Deep Origin Data Platform ID of the protein. + client: Optional DeepOriginClient instance. If not provided, uses the default client. + + Returns: + Protein: A new Protein instance. + + Raises: + ValueError: If the protein data does not contain a file_path. + RuntimeError: If the file cannot be downloaded or loaded. 
+ """ + if client is None: + client = DeepOriginClient.get() + + data = client.data.get_protein(id=id) + + # Check if file_path exists + file_path = data.get("file_path") + if not file_path: + raise ValueError( + f"Protein {id} does not have a file_path. Cannot create Protein instance without structure file." + ) + + # Download the file + local_file_path = client.files.download_file(remote_path=file_path) + + # Create Protein instance from the downloaded file + protein = cls.from_file(file_path=local_file_path) + + # Set the ID from the data + protein.id = data.get("id") + + # Update fields from the data + if data.get("protein_name"): + protein.name = data["protein_name"] + elif data.get("pdb_id"): + protein.name = data["pdb_id"] + elif data.get("gene_symbol"): + protein.name = data["gene_symbol"] + + if data.get("pdb_id"): + protein.pdb_id = data["pdb_id"] + + return protein + @classmethod def from_pdb_id(cls, pdb_id: str, struct_ind: int = 0) -> Self: """ diff --git a/src/platform/data.py b/src/platform/data.py index 17d56bd2..242dd6c0 100644 --- a/src/platform/data.py +++ b/src/platform/data.py @@ -71,13 +71,20 @@ def search_ligands_with_results( Returns: Dictionary containing the search results. 
""" + # Ensure deleted=False is always set in filter_dict + if filter_dict is None: + filter_dict = {"deleted": False} + else: + filter_dict = filter_dict.copy() + filter_dict["deleted"] = False + body: dict[str, Any] = {} if cursor is not None: body["cursor"] = cursor if experiments is not None: body["experiments"] = experiments - if filter_dict is not None: - body["filter"] = filter_dict + body["filter"] = filter_dict + if limit is not None: body["limit"] = limit if offset is not None: @@ -159,7 +166,7 @@ def search_ligands( self, *, cursor: str | None = None, - filter: dict[str, Any] | None = None, + filter_dict: dict[str, Any] | None = None, smiles: str | None = None, canonical_smiles: str | None = None, min_molecular_weight: float | int | None = None, @@ -175,7 +182,7 @@ def search_ligands( Args: cursor: Cursor for pagination. - filter: Additional filter criteria as a dictionary. + filter_dict: Additional filter criteria as a dictionary. smiles: Filter by SMILES string. canonical_smiles: Filter by canonical SMILES string. min_molecular_weight: Minimum molecular weight filter (inclusive). @@ -191,8 +198,8 @@ def search_ligands( Raises: ValueError: If ligands is not a valid table name (should not happen). """ - # Build filter dict, starting with provided filter or empty dict - filter_dict = filter.copy() if filter is not None else {} + # Build filter dict, starting with provided filter_dict or empty dict + filter_dict = filter_dict.copy() if filter_dict is not None else {} filter_dict.setdefault("deleted", False) # Add smiles filter if provided @@ -237,6 +244,42 @@ def search_ligands( sort=sort, ) + def get_entity(self, *, entity: str, entity_id: str) -> dict: + """Get an entity by ID. + + Args: + entity: The entity type (e.g., "ligands", "proteins"). + entity_id: The ID of the entity to retrieve. + + Returns: + Dictionary containing the entity data. 
+ """ + return self._c.get_json( + f"/data-platform/{self._c.org_key}/{entity}/{entity_id}" + ) + + def get_ligand(self, id: str) -> dict: + """Get a ligand by ID. + + Args: + id: The ID of the ligand to retrieve. + + Returns: + Dictionary containing the ligand data. + """ + return self.get_entity(entity="ligands", entity_id=id) + + def get_protein(self, id: str) -> dict: + """Get a protein by ID. + + Args: + id: The ID of the protein to retrieve. + + Returns: + Dictionary containing the protein data. + """ + return self.get_entity(entity="proteins", entity_id=id) + def search_proteins( self, *, @@ -326,6 +369,7 @@ def create_ligand( smiles: str, project_id: str | None = None, name: str | None = None, + mol_file: str | None = None, formal_charge: int = 0, hbond_donor_count: int | None = None, hbond_acceptor_count: int | None = None, @@ -340,6 +384,7 @@ def create_ligand( smiles: SMILES string (required). project_id: Project ID for the ligand. name: Name of the ligand. + mol_file: Path to the molecule file (e.g., SDF file) in remote storage. formal_charge: Formal charge. Defaults to 0. hbond_donor_count: Number of hydrogen bond donors. hbond_acceptor_count: Number of hydrogen bond acceptors. 
@@ -364,6 +409,8 @@ def create_ligand( set_dict["project_id"] = project_id if name is not None: set_dict["name"] = name + if mol_file is not None: + set_dict["mol_file"] = mol_file if hbond_donor_count is not None: set_dict["hbond_donor_count"] = hbond_donor_count if hbond_acceptor_count is not None: @@ -384,6 +431,7 @@ def create_ligand( "valid_to", "modified_by", "deleted", + "mol_file", "project_id", "subtable_name", "canonical_smiles", diff --git a/tests/fixtures/ligand_08B05B1GDYWJR.json b/tests/fixtures/ligand_08B05B1GDYWJR.json new file mode 100644 index 00000000..a4b101cd --- /dev/null +++ b/tests/fixtures/ligand_08B05B1GDYWJR.json @@ -0,0 +1,70 @@ +{ + "id": "08B05B1GDYWJR", + "created_at": "2026-02-19T18:40:33.979Z", + "updated_at": "2026-02-19T18:40:33.979Z", + "version": 1, + "valid_from": "2026-02-19T18:40:33.979Z", + "valid_to": null, + "modified_by": "6b96d8f8-0f55-474c-a86c-e09651ba4b20", + "deleted": false, + "project_id": null, + "project_scope_key": "__unscoped__", + "mol_file": null, + "rdkit_mol": "C/C=C/Cn1cc(-c2cccc(C(=O)N(C)C)c2)c2cc[nH]c2c1=O", + "smiles": "C/C=C/Cn1cc(-c2cccc(C(=O)N(C)C)c2)c2cc[nH]c2c1=O", + "canonical_smiles": "C/C=C/Cn1cc(-c2cccc(C(=O)N(C)C)c2)c2cc[nH]c2c1=O", + "inchi_key": "RJEMCUZKQLRUIS-SNAWJCMRSA-N", + "inchi": "InChI=1S/C20H21N3O2/c1-4-5-11-23-13-17(16-9-10-21-18(16)20(23)25)14-7-6-8-15(12-14)19(24)22(2)3/h4-10,12-13,21H,11H2,1-3H3/b5-4+", + "subtable_name": "ligands", + "variant_name_tag": "", + "structure_key": "RJEMCUZKQLRUIS-SNAWJCMRSA-N", + "smirks": null, + "name": "cmpd 4 (Crotyl)", + "sa_score": null, + "qed_score": null, + "topological_fingerprint": "\\xaf3c9d0b3481ef0669925e6bdd530c7d9c79493f5a6e9693ce1c583fa9055f7bf9b5327ec30cfe64da3d60ee21e9e0c15ef6e1358c96974e63ffca31a9d8b5223a9f1e547162192515752b2b13948da71b15cfcc0c365182b1c88af7e731d3a91049d899971ef44ff157c152f77df9d2289e26c5d7821287439edb9d4c23e287", + "morgan_fingerprint": 
"\\x06800010224000000100010400002000000d000000004000a00041000100004000042000c000000408301200910a122000000000808000100300018004000005", + "formal_charge": 0, + "hbond_donor_count": 1, + "hbond_acceptor_count": 3, + "rotatable_bond_count": 7, + "tpsa": 58.1, + "molecular_weight": 335.16337691200056, + "aromatic_ring_count": null, + "log_p": null, + "external_id": null, + "cas_registry_number": null, + "chembl_id": null, + "pdb_ligand_id": null, + "drugbank_id": null, + "zinc_id": null, + "pubchem_cid": null, + "binding_db_id": null, + "rule_of5_violations": null, + "bemis_murcko_scaffold": null, + "canonical_tautomer": null, + "charge_state": null, + "pka_values": null, + "bioavailability_score": null, + "rotamer_state_count": null, + "topological_diameter": null, + "chebi_id": null, + "kegg_compound_id": null, + "uni_chem_id": null, + "chem_spider_id": null, + "iuphar_ligand_id": null, + "sure_chembl_id": null, + "hmdb_id": null, + "nsc_number": null, + "embl_compound_id": null, + "lincs_id": null, + "maccs_keys": null, + "atom_pair_fingerprint": null, + "selfies": null, + "polarizability": null, + "refractivity": null, + "conformer_count": null, + "electrostatic_potential_map": null, + "pains_flag": null, + "unii": null +} diff --git a/tests/fixtures/protein_08AD337N5YV4Y.json b/tests/fixtures/protein_08AD337N5YV4Y.json new file mode 100644 index 00000000..33f82d8e --- /dev/null +++ b/tests/fixtures/protein_08AD337N5YV4Y.json @@ -0,0 +1,46 @@ +{ + "id": "08AD337N5YV4Y", + "created_at": "2026-02-18T20:27:28.073Z", + "updated_at": "2026-02-18T20:27:28.073Z", + "version": 1, + "valid_from": "2026-02-18T20:27:28.073Z", + "valid_to": null, + "modified_by": "6b96d8f8-0f55-474c-a86c-e09651ba4b20", + "deleted": false, + "project_id": null, + "subtable_name": "proteins", + "uniprot_accession": null, + "file_path": "entities/proteins/db4aa32e2e8ffa976a60004a8361b86427a2e5653a6623bb60b7913445902549.pdb", + "gene_symbol": null, + "pdb_id": null, + "refseq_protein_id": null, + 
"ensembl_protein_id": null, + "alpha_fold_id": null, + "fasta_sequence": null, + "protein_name": null, + "external_id": null, + "kegg_gene_id": null, + "chembl_target_id": null, + "binding_db_target_id": null, + "drugbank_target_id": null, + "pfam_id": null, + "interpro_id": null, + "ec_number": null, + "ncbi_taxonomy_id": null, + "go_term_id": null, + "uniprot_entry_name": null, + "uni_parc": null, + "reactome_protein_id": null, + "hgnc_id": null, + "orth_db_id": null, + "ensembl_transcript_id": null, + "tcr_bcr_ids": null, + "protein_family": null, + "disordered_regions": null, + "ligandability_score": null, + "protein_length": null, + "half_life": null, + "molecular_weight": null, + "isoelectric_point": null, + "subcellular_location": null +} \ No newline at end of file diff --git a/tests/mock_server/server.py b/tests/mock_server/server.py index 92fea86c..fc96120c 100644 --- a/tests/mock_server/server.py +++ b/tests/mock_server/server.py @@ -1093,6 +1093,32 @@ async def create_protein(org_key: str, request: Request) -> dict[str, Any]: "meta": {"inserted": 1}, } + @self.app.get("/data-platform/{org_key}/ligands/{ligand_id}") + def get_ligand(org_key: str, ligand_id: str) -> dict[str, Any]: + """Get a ligand by ID.""" + # Load fixture for the specific ligand ID + try: + return self._load_fixture(f"ligand_{ligand_id}") + except FileNotFoundError: + from fastapi import HTTPException + + raise HTTPException( + status_code=404, detail=f"Ligand {ligand_id} not found" + ) from None + + @self.app.get("/data-platform/{org_key}/proteins/{protein_id}") + def get_protein(org_key: str, protein_id: str) -> dict[str, Any]: + """Get a protein by ID.""" + # Load fixture for the specific protein ID + try: + return self._load_fixture(f"protein_{protein_id}") + except FileNotFoundError: + from fastapi import HTTPException + + raise HTTPException( + status_code=404, detail=f"Protein {protein_id} not found" + ) from None + @self.app.get("/data-platform/{org_key}/meta/models") def 
list_models(org_key: str) -> dict[str, Any]: """List public models.""" diff --git a/tests/test_data.py b/tests/test_data.py index 402e2d91..b9eec2ad 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -178,3 +178,37 @@ def test_list_projects_lv1(): assert isinstance(response, dict), "Expected a dictionary response" assert "data" in response, "Expected 'data' key in response" assert isinstance(response["data"], list), "Expected 'data' to be a list" + + +def test_get_ligand_lv1(): + """Test getting a ligand by ID.""" + client = DeepOriginClient() + response = client.data.get_ligand(id="08B05B1GDYWJR") + + assert isinstance(response, dict), "Expected a dictionary response" + assert "id" in response, "Expected 'id' key in response" + assert response["id"] == "08B05B1GDYWJR", "Expected id to match" + assert "smiles" in response, "Expected 'smiles' key in response" + assert "name" in response, "Expected 'name' key in response" + assert response["name"] == "cmpd 4 (Crotyl)", "Expected name to match" + assert "molecular_weight" in response, "Expected 'molecular_weight' key in response" + assert abs(response["molecular_weight"] - 335.16337691200056) < 1e-10, ( + "Expected molecular_weight to match" + ) + + +def test_get_protein_lv1(): + """Test getting a protein by ID.""" + client = DeepOriginClient() + response = client.data.get_protein(id="08AD337N5YV4Y") + + assert isinstance(response, dict), "Expected a dictionary response" + assert "id" in response, "Expected 'id' key in response" + assert response["id"] == "08AD337N5YV4Y", "Expected id to match" + assert "file_path" in response, "Expected 'file_path' key in response" + assert ( + response["file_path"] + == "entities/proteins/db4aa32e2e8ffa976a60004a8361b86427a2e5653a6623bb60b7913445902549.pdb" + ), "Expected file_path to match" + assert "subtable_name" in response, "Expected 'subtable_name' key in response" + assert response["subtable_name"] == "proteins", "Expected subtable_name to match" diff --git 
a/tests/test_protein.py b/tests/test_protein.py index a1db7ba8..2df2cd37 100644 --- a/tests/test_protein.py +++ b/tests/test_protein.py @@ -7,6 +7,7 @@ from deeporigin.drug_discovery import BRD_DATA_DIR, Protein from deeporigin.exceptions import DeepOriginException +from deeporigin.platform.client import DeepOriginClient def test_load_protein_from_cif_structure_factor(): @@ -401,3 +402,15 @@ def test_load_structure_from_block_invalid_type(): """Test that load_structure_from_block raises ValueError for unsupported types.""" with pytest.raises(ValueError, match=r".*Unsupported block type.*"): Protein.load_structure_from_block("test content", "xyz") + + +def test_from_id_lv1(): + """Test creating a protein from a Deep Origin Data Platform ID.""" + client = DeepOriginClient() + protein = Protein.from_id("08AD337N5YV4Y", client=client) + + assert protein.id == "08AD337N5YV4Y" + assert protein.file_path is not None + assert protein.file_path.exists() + assert len(protein.structure) > 0 + assert protein.block_content is not None From b79fc2f740d7aa15eaf6eea2fbf65c641454a1f5 Mon Sep 17 00:00:00 2001 From: Srinivas Gorur-Shandilya Date: Thu, 19 Feb 2026 14:37:19 -0500 Subject: [PATCH 08/12] feat: docs --- docs/dd/how-to/proteins.md | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/docs/dd/how-to/proteins.md b/docs/dd/how-to/proteins.md index 04966243..ce757475 100644 --- a/docs/dd/how-to/proteins.md +++ b/docs/dd/how-to/proteins.md @@ -37,6 +37,37 @@ from deeporigin.drug_discovery import Protein protein = Protein.from_name("insulin") ``` +### From a Deep Origin Data Platform ID + +You can create a Protein instance directly from a Deep Origin Data Platform ID. 
This method fetches the protein data from the platform, downloads the structure file, and creates a Protein instance with metadata from the platform: + +```python +from deeporigin.drug_discovery import Protein + +protein = Protein.from_id("08AD337N5YV4Y") +``` + +The method automatically: +- Downloads the structure file from the Deep Origin Data Platform +- Sets the protein's ID, name, and PDB ID (if available) from the platform metadata +- Creates a Protein instance from the downloaded file + +You can optionally provide a custom `DeepOriginClient` instance: + +```python +from deeporigin.drug_discovery import Protein +from deeporigin.platform.client import DeepOriginClient + +client = DeepOriginClient() +protein = Protein.from_id("08AD337N5YV4Y", client=client) +``` + +!!! warning "Requires file_path" + The protein data in the platform must contain a `file_path` field. If the protein data does not have a `file_path`, a `ValueError` will be raised. + +!!! note "Automatic metadata" + The method automatically populates the protein's `name` field from the platform data, preferring `protein_name`, then `pdb_id`, then `gene_symbol` (in that order).
+ ## Inspecting the Protein From c72dfb28d061526764398c15791cc0b9ebb6327c Mon Sep 17 00:00:00 2001 From: Srinivas Gorur-Shandilya Date: Thu, 19 Feb 2026 20:01:52 -0500 Subject: [PATCH 09/12] feat: cleaned up some tests --- tests/mock_server/server.py | 16 ++++++++-- tests/test_data.py | 61 ++++++++++++++++++------------------- 2 files changed, 43 insertions(+), 34 deletions(-) diff --git a/tests/mock_server/server.py b/tests/mock_server/server.py index fc96120c..6763e07f 100644 --- a/tests/mock_server/server.py +++ b/tests/mock_server/server.py @@ -47,6 +47,8 @@ def __init__(self, port: int = 0, docking_speed: float = 0.5): # In-memory storage for executions self._executions: dict[str, dict[str, Any]] = {} self._execution_start_times: dict[str, datetime] = {} + self._ligands: dict[str, dict[str, Any]] = {} + self._proteins: dict[str, dict[str, Any]] = {} # Tool-specific mock execution durations (in seconds) self._mock_execution_durations: dict[str, float] = { "deeporigin.abfe-end-to-end": 30.0, # seconds @@ -1027,6 +1029,9 @@ async def create_ligand(org_key: str, request: Request) -> dict[str, Any]: # Include all fields from set_data response_data.update(set_data) + # Store full record in memory before filtering + self._ligands[ligand_id] = response_data.copy() + # Filter to only return requested fields if specified if returning: response_data = { @@ -1044,7 +1049,7 @@ async def create_protein(org_key: str, request: Request) -> dict[str, Any]: # Generate mock response matching the real API format now = datetime.now(timezone.utc) - protein_id = "08AD337N5YV4Y" # Use a consistent ID for testing + protein_id = "08" + str(uuid.uuid4()).replace("-", "").upper()[:11] modified_by = "6b96d8f8-0f55-474c-a86c-e09651ba4b20" # Build response data with all fields matching the real API @@ -1082,6 +1087,9 @@ async def create_protein(org_key: str, request: Request) -> dict[str, Any]: # Override with any fields provided in set_data response_data.update(set_data) + # Store full 
record in memory before filtering + self._proteins[protein_id] = response_data.copy() + # Filter to only return requested fields if specified if returning: response_data = { @@ -1096,7 +1104,8 @@ async def create_protein(org_key: str, request: Request) -> dict[str, Any]: @self.app.get("/data-platform/{org_key}/ligands/{ligand_id}") def get_ligand(org_key: str, ligand_id: str) -> dict[str, Any]: """Get a ligand by ID.""" - # Load fixture for the specific ligand ID + if ligand_id in self._ligands: + return self._ligands[ligand_id] try: return self._load_fixture(f"ligand_{ligand_id}") except FileNotFoundError: @@ -1109,7 +1118,8 @@ def get_ligand(org_key: str, ligand_id: str) -> dict[str, Any]: @self.app.get("/data-platform/{org_key}/proteins/{protein_id}") def get_protein(org_key: str, protein_id: str) -> dict[str, Any]: """Get a protein by ID.""" - # Load fixture for the specific protein ID + if protein_id in self._proteins: + return self._proteins[protein_id] try: return self._load_fixture(f"protein_{protein_id}") except FileNotFoundError: diff --git a/tests/test_data.py b/tests/test_data.py index b9eec2ad..5d6410bd 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -1,5 +1,7 @@ """Tests for the Data Platform API wrapper.""" +import uuid + import pytest from deeporigin.platform.client import DeepOriginClient @@ -15,19 +17,6 @@ def test_data_platform_health_lv1(): assert response["status"] == "ok", "Expected status to be 'ok'" -def test_search_ligands_with_results_lv1(): - """Test searching ligands with results.""" - client = DeepOriginClient() - response = client.data.search_ligands_with_results( - limit=10, - experiments=[{"toolId": "test-tool"}], - ) - - assert isinstance(response, dict), "Expected a dictionary response" - assert "data" in response, "Expected 'data' key in response" - assert isinstance(response["data"], list), "Expected 'data' to be a list" - - def test_search_entity_lv1(): """Test searching an entity.""" client = DeepOriginClient() @@ 
-55,11 +44,12 @@ def test_search_ligands_lv1(): assert isinstance(response["data"], list), "Expected 'data' to be a list" -def test_search_ligands_molecular_weight(): +def test_search_ligands_molecular_weight_lv1(): """Test searching ligands with molecular weight filters.""" client = DeepOriginClient() response = client.data.search_ligands( - min_molecular_weight=250, max_molecular_weight=550 + min_molecular_weight=250, + max_molecular_weight=550, ) assert isinstance(response, dict), "Expected a dictionary response" @@ -77,11 +67,12 @@ def test_search_proteins_lv1(): assert isinstance(response["data"], list), "Expected 'data' to be a list" -def test_search_proteins_molecular_weight(): +def test_search_proteins_molecular_weight_lv1(): """Test searching proteins with molecular weight filters.""" client = DeepOriginClient() response = client.data.search_proteins( - min_molecular_weight=250, max_molecular_weight=550 + min_molecular_weight=250, + max_molecular_weight=550, ) assert isinstance(response, dict), "Expected a dictionary response" @@ -89,7 +80,7 @@ def test_search_proteins_molecular_weight(): assert isinstance(response["data"], list), "Expected 'data' to be a list" -def test_search_proteins_sequence(): +def test_search_proteins_sequence_lv1(): """Test searching proteins with sequence filter.""" client = DeepOriginClient() response = client.data.search_proteins( @@ -121,6 +112,7 @@ def test_create_ligand_lv1(): """Test creating a ligand.""" client = DeepOriginClient() smiles = "Fc1c(-c2cccc3ccccc23)ncc2c(N3C[C@H]4CC[C@@H](C3)N4)nc(OCC34CCCN3CCC4)nc12" + unique_tag = str(uuid.uuid4()) response = client.data.create_ligand( smiles=smiles, name="Compound-12345", @@ -130,7 +122,7 @@ def test_create_ligand_lv1(): rotatable_bond_count=5, tpsa=85.12, molecular_weight=447.5, - variant_name_tag="", + variant_name_tag=unique_tag, ) assert isinstance(response, dict), "Expected a dictionary response" @@ -183,32 +175,39 @@ def test_list_projects_lv1(): def 
test_get_ligand_lv1(): """Test getting a ligand by ID.""" client = DeepOriginClient() - response = client.data.get_ligand(id="08B05B1GDYWJR") + smiles = "Fc1c(-c2cccc3ccccc23)ncc2c(N3C[C@H]4CC[C@@H](C3)N4)nc(OCC34CCCN3CCC4)nc12" + created = client.data.create_ligand( + smiles=smiles, + name="GetLigandTest", + molecular_weight=447.5, + variant_name_tag=str(uuid.uuid4()), + ) + ligand_id = created["data"]["id"] + + response = client.data.get_ligand(id=ligand_id) assert isinstance(response, dict), "Expected a dictionary response" assert "id" in response, "Expected 'id' key in response" - assert response["id"] == "08B05B1GDYWJR", "Expected id to match" + assert response["id"] == ligand_id, "Expected id to match" assert "smiles" in response, "Expected 'smiles' key in response" assert "name" in response, "Expected 'name' key in response" - assert response["name"] == "cmpd 4 (Crotyl)", "Expected name to match" + assert response["name"] == "GetLigandTest", "Expected name to match" assert "molecular_weight" in response, "Expected 'molecular_weight' key in response" - assert abs(response["molecular_weight"] - 335.16337691200056) < 1e-10, ( - "Expected molecular_weight to match" - ) def test_get_protein_lv1(): """Test getting a protein by ID.""" client = DeepOriginClient() - response = client.data.get_protein(id="08AD337N5YV4Y") + file_path = "entities/proteins/db4aa32e2e8ffa976a60004a8361b86427a2e5653a6623bb60b7913445902549.pdb" + created = client.data.create_protein(file_path=file_path) + protein_id = created["data"]["id"] + + response = client.data.get_protein(id=protein_id) assert isinstance(response, dict), "Expected a dictionary response" assert "id" in response, "Expected 'id' key in response" - assert response["id"] == "08AD337N5YV4Y", "Expected id to match" + assert response["id"] == protein_id, "Expected id to match" assert "file_path" in response, "Expected 'file_path' key in response" - assert ( - response["file_path"] - == 
"entities/proteins/db4aa32e2e8ffa976a60004a8361b86427a2e5653a6623bb60b7913445902549.pdb" - ), "Expected file_path to match" + assert response["file_path"] == file_path, "Expected file_path to match" assert "subtable_name" in response, "Expected 'subtable_name' key in response" assert response["subtable_name"] == "proteins", "Expected subtable_name to match" From de60d9b392fb7769963f99f2b44bcdad034f6b69 Mon Sep 17 00:00:00 2001 From: Srinivas Gorur-Shandilya Date: Thu, 19 Feb 2026 20:04:53 -0500 Subject: [PATCH 10/12] fix: no fail fast for lv1 tests --- .github/workflows/level_1.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/level_1.yml b/.github/workflows/level_1.yml index 2ad0d935..4fc3ba8c 100644 --- a/.github/workflows/level_1.yml +++ b/.github/workflows/level_1.yml @@ -17,7 +17,7 @@ jobs: runs-on: ubuntu-latest environment: ${{ matrix.env }} strategy: - fail-fast: true + fail-fast: false matrix: python-version: ["3.13"] env: ${{ fromJSON( ((github.event_name == 'workflow_call' || github.event_name == 'release') && inputs.env != '') && format('["{0}"]', inputs.env) || '["dev","staging","prod"]' ) }} From 0a530111e57980fcdefb7839fa9526c8a689d2e6 Mon Sep 17 00:00:00 2001 From: Srinivas Gorur-Shandilya Date: Thu, 19 Feb 2026 20:52:41 -0500 Subject: [PATCH 11/12] Update src/drug_discovery/structures/protein.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/drug_discovery/structures/protein.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/drug_discovery/structures/protein.py b/src/drug_discovery/structures/protein.py index 5844c2ff..9245789c 100644 --- a/src/drug_discovery/structures/protein.py +++ b/src/drug_discovery/structures/protein.py @@ -1403,7 +1403,9 @@ def sync(self, client: Optional[DeepOriginClient] = None) -> None: if self.pdb_id is not None: kwargs["pdb_id"] = self.pdb_id - kwargs["protein_length"] = self.length + # Only compute and include protein_length 
when a local file_path is available + if getattr(self, "file_path", None) is not None: + kwargs["protein_length"] = self.length kwargs["protein_name"] = self.name # Call create_protein through the client From 74f2603725a3c081fbd225c2eca9be644df04445 Mon Sep 17 00:00:00 2001 From: Srinivas Gorur-Shandilya Date: Thu, 19 Feb 2026 20:53:17 -0500 Subject: [PATCH 12/12] Update src/drug_discovery/structures/protein.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/drug_discovery/structures/protein.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/drug_discovery/structures/protein.py b/src/drug_discovery/structures/protein.py index 9245789c..ce44d660 100644 --- a/src/drug_discovery/structures/protein.py +++ b/src/drug_discovery/structures/protein.py @@ -1364,13 +1364,15 @@ def sync(self, client: Optional[DeepOriginClient] = None) -> None: This method uploads the protein file to remote storage and creates a protein record in the data platform. If a protein with the same file_path already exists, - it returns the existing protein data instead of creating a new one. + it updates the current instance with the existing protein's ID instead of + creating a new one. Args: client: DeepOriginClient instance. If None, uses DeepOriginClient.get(). Returns: - Dictionary containing the created or existing protein data from the data platform. + None. As a side effect, uploads the protein (if necessary) and updates + ``self.id`` with the ID of the existing or newly created protein record. """ if client is None: client = DeepOriginClient.get()