diff --git a/pyproject.toml b/pyproject.toml
index 34af193..24c44d3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -156,7 +156,7 @@ convention = "google"
 requires = ["uv_build>=0.9.9,<0.10.0"]
 build-backend = "uv_build"
 
-[tool.pytest]
+[tool.pytest.ini_options]
 pythonpath = ["src"]
 log_cli = true
 log_cli_level = "INFO"
diff --git a/src/cdm_data_loader_utils/parsers/annotation_parse.py b/src/cdm_data_loader_utils/parsers/annotation_parse.py
new file mode 100644
index 0000000..2a690c8
--- /dev/null
+++ b/src/cdm_data_loader_utils/parsers/annotation_parse.py
@@ -0,0 +1,374 @@
+"""RefSeq annotation parser for transforming NCBI Datasets API JSON into CDM-formatted Delta Lake tables.
+
+Usage:
+    python src/cdm_data_loader_utils/parsers/annotation_parse.py \
+        --accession GCF_000869125.1 \
+        --output-path output/refseq/GCF_000869125.1 \
+        --query
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+
+import requests
+from pyspark.sql import SparkSession
+from pyspark.sql.types import StructType
+from delta import configure_spark_with_delta_pip
+
+from cdm_data_loader_utils.parsers.kbase_cdm_pyspark import schema as cdm_schemas
+
+
+# ---------------------------------------------------------------------
+# Accession-based annotation fetch
+# ---------------------------------------------------------------------
+def fetch_annotation_json(accession: str) -> dict:
+    """Fetch the annotation report JSON for an assembly accession from the NCBI Datasets API."""
+    url = f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/{accession}/annotation_report"
+    resp = requests.get(url, headers={"Accept": "application/json"}, timeout=60)
+    resp.raise_for_status()
+    return resp.json()
+
+
+# ---------------------------------------------------------------------
+# SPARK SESSION
+# ---------------------------------------------------------------------
+def build_spark_session(app_name: str = "RefSeqAnnotationToCDM") -> SparkSession:
+    """Configure and return a Spark session with Delta support."""
+    builder = (
+        SparkSession.builder.appName(app_name)
+        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
+        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
+    )
+    return configure_spark_with_delta_pip(builder).getOrCreate()
+
+
+# ---------------------------------------------------------------------
+# CDM TABLE SCHEMAS
+# ---------------------------------------------------------------------
+# Using centralized schemas
+IDENTIFIER_SCHEMA = cdm_schemas["Identifier"]
+NAME_SCHEMA = cdm_schemas["Name"]
+FEATURE_SCHEMA = cdm_schemas["Feature"]
+CONTIG_COLLECTION_X_FEATURE_SCHEMA = cdm_schemas["ContigCollection_x_Feature"]
+CONTIG_COLLECTION_X_PROTEIN_SCHEMA = cdm_schemas["ContigCollection_x_Protein"]
+FEATURE_X_PROTEIN_SCHEMA = cdm_schemas["Feature_x_Protein"]
+CONTIG_SCHEMA = cdm_schemas["Contig"]
+CONTIG_X_CONTIG_COLLECTION_SCHEMA = cdm_schemas["Contig_x_ContigCollection"]
+
+
+# ---------------------------------------------------------------------
+# CDM PREFIX NORMALIZATION
+# ---------------------------------------------------------------------
+def apply_prefix(identifier: str) -> str:
+    """Normalize identifiers to CDM-prefixed formats."""
+    if identifier.startswith(("YP_", "XP_", "WP_", "NP_", "NC_")):
+        return f"refseq:{identifier}"
+    if identifier.startswith("GCF_"):
+        return f"insdc.gcf:{identifier}"
+    # NCBI gene identifiers: the test suite expects
+    # apply_prefix("GeneID:123") == "ncbigene:123".
+    if identifier.startswith("GeneID:"):
+        return f"ncbigene:{identifier.removeprefix('GeneID:')}"
+    return identifier
+
+
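+# Example (hypothetical accessions), to make the prefixing rules concrete:
+#   apply_prefix("WP_011053539.1")  -> "refseq:WP_011053539.1"
+#   apply_prefix("GCF_000869125.1") -> "insdc.gcf:GCF_000869125.1"
+#   apply_prefix("GeneID:1234")     -> "ncbigene:1234"
+#   apply_prefix("anything-else")   -> "anything-else"  (returned unchanged)
+
+
+# ---------------------------------------------------------------------
+# 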
Safe integer conversion +# --------------------------------------------------------------------- +def to_int(val: str) -> int | None: + try: + return int(val) + except Exception: + return None + + +# --------------------------------------------------------------------- +# IDENTIFIERS +# --------------------------------------------------------------------- +def load_identifiers(data: dict) -> list[tuple[str, str, str, str, str | None]]: + """Extract Identifier table records.""" + out = [] + for report in data.get("reports", []): + ann = report.get("annotation", {}) + gene_id = ann.get("gene_id") + if not gene_id: + continue + entity_id = f"ncbigene:{gene_id}" + out.append((entity_id, gene_id, ann.get("name"), "RefSeq", ann.get("relationship"))) + return out + + +# --------------------------------------------------------------------- +# NAME EXTRACTION +# --------------------------------------------------------------------- +def load_names(data: dict) -> list[tuple[str, str, str, str]]: + """Extract Name table records.""" + out = [] + for report in data.get("reports", []): + ann = report.get("annotation", {}) + gene_id = ann.get("gene_id") + if not gene_id: + continue + entity_id = f"ncbigene:{gene_id}" + for label, desc in [ + ("symbol", "RefSeq gene symbol"), + ("name", "RefSeq gene name"), + ("locus_tag", "RefSeq locus tag"), + ]: + val = ann.get(label) + if val: + out.append((entity_id, val, desc, "RefSeq")) + return out + + +# --------------------------------------------------------------------- +# FEATURE LOCATIONS +# --------------------------------------------------------------------- +def load_feature_records(data: dict) -> list[tuple]: + """Extract Feature table records.""" + features = [] + for report in data.get("reports", []): + ann = report.get("annotation", {}) + gene_id = ann.get("gene_id") + if not gene_id: + continue + feature_id = f"ncbigene:{gene_id}" + for region in ann.get("genomic_regions", []): + for r in region.get("gene_range", {}).get("range", []): + strand = { + "plus": "positive", + "minus": "negative", + "unstranded": "unstranded", + }.get(r.get("orientation"), "unknown") + features.append(( + feature_id, + None, + None, + None, + to_int(r.get("end")), + None, + to_int(r.get("begin")), + strand, + "RefSeq", + None, + "gene", + )) + return features + + +# --------------------------------------------------------------------- +# PARSE CONTIG_COLLECTION <-> FEATURE +# --------------------------------------------------------------------- +def load_contig_collection_x_feature(data: dict) -> list[tuple[str, str]]: + """Parse ContigCollection ↔ Feature links.""" + links = [] + + for report in data.get("reports", []): + ann = report.get("annotation", {}) + gene_id = ann.get("gene_id") + regions = ann.get("genomic_regions", []) + + if not gene_id or not regions: + continue + + acc = regions[0].get("gene_range", {}).get("accession_version") + if acc: + links.append((apply_prefix(acc), f"ncbigene:{gene_id}")) + + return links + + +# --------------------------------------------------------------------- +# PARSE CONTIG_COLLECTION <-> PROTEIN +# --------------------------------------------------------------------- +def load_contig_collection_x_protein(data: dict) -> list[tuple[str, str]]: + links = [] + + for report in data.get("reports", []): + ann = report.get("annotation", {}) + proteins = ann.get("proteins", []) + annotations = ann.get("annotations", []) + + if not proteins or not annotations: + continue + + assembly = annotations[0].get("assembly_accession") + + if not 
assembly: + continue + + contig_id = apply_prefix(assembly) + + for p in proteins: + pid = p.get("accession_version") + if pid: + protein_id = apply_prefix(pid) + links.append((contig_id, protein_id)) + + return links + + +# --------------------------------------------------------------------- +# PARSE FEATURE <-> PROTEIN +# --------------------------------------------------------------------- +def load_feature_x_protein(data: dict) -> list[tuple[str, str]]: + links = [] + + for report in data.get("reports", []): + ann = report.get("annotation", {}) + gene_id = ann.get("gene_id") + proteins = ann.get("proteins", []) + + if not gene_id or not proteins: + continue + + feature_id = f"ncbigene:{gene_id}" + + for p in proteins: + pid = p.get("accession_version") + if pid: + protein_id = apply_prefix(pid) + links.append((feature_id, protein_id)) + + return links + + +# --------------------------------------------------------------------- +# PARSE CONTIGS +# --------------------------------------------------------------------- +def load_contigs(data: dict) -> list[tuple[str, str | None, float | None, int | None]]: + """Parse Contig table.""" + contigs = {} + + for report in data.get("reports", []): + for region in report.get("annotation", {}).get("genomic_regions", []): + acc = region.get("gene_range", {}).get("accession_version") + if acc: + contig_id = apply_prefix(acc) + contigs.setdefault(contig_id, {"hash": None, "gc_content": None, "length": None}) + + return [(cid, meta["hash"], meta["gc_content"], meta["length"]) for cid, meta in contigs.items()] + + +# --------------------------------------------------------------------- +# PARSE CONTIG <-> CONTIG_COLLECTION +# --------------------------------------------------------------------- +def load_contig_x_contig_collection(data: dict) -> list[tuple[str, str]]: + links = [] + + for report in data.get("reports", []): + ann = report.get("annotation", {}) + regions = ann.get("genomic_regions", []) + annotations = ann.get("annotations", []) + + if not regions or not annotations: + continue + + contig = regions[0].get("gene_range", {}).get("accession_version") + assembly = annotations[0].get("assembly_accession") + + if contig and assembly: + contig_id = f"refseq:{contig}" + collection_id = apply_prefix(assembly) + links.append((contig_id, collection_id)) + + return links + + +# --------------------------------------------------------------------- +# DELTA TABLE +# --------------------------------------------------------------------- +def write_to_delta( + spark: SparkSession, + records: list[tuple], + output_path: str, + schema: StructType, +) -> None: + """Write records to Delta table.""" + if not records: + return + + df = spark.createDataFrame(records, schema=schema) + df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").save(output_path) + + +# --------------------------------------------------------------------- +# SQL PREVIEW +# --------------------------------------------------------------------- +def run_sql_query(spark: SparkSession, delta_path: str) -> None: + """Run SQL queries to preview Delta tables.""" + for name in [ + "cdm_identifiers", + "cdm_names", + "cdm_features", + "cdm_contig_collection_x_feature", + "cdm_contig_collection_x_protein", + "cdm_feature_x_protein", + "cdm_contigs", + "cdm_contig_x_contig_collection", + ]: + print(f"\n[SQL] {name}:") + path = str(Path(delta_path) / name) + spark.read.format("delta").load(path).createOrReplaceTempView(name) + spark.sql(f"SELECT * FROM {name} LIMIT 
20").show(truncate=False) + + +# --------------------------------------------------------------------- +# CLI ENTRY +# --------------------------------------------------------------------- +def main() -> None: + """Entry point for RefSeq Annotation parser.""" + parser = argparse.ArgumentParser(description="RefSeq Annotation Parser to CDM") + parser.add_argument("--accession", required=True) + parser.add_argument("--output-path", required=True) + parser.add_argument("--query", action="store_true") + args = parser.parse_args() + + base_output = Path(args.output_path) + base_output.mkdir(parents=True, exist_ok=True) + + data = fetch_annotation_json(args.accession) + input_path = Path(f"/tmp/{args.accession}.json") + input_path.write_text(json.dumps(data, indent=2)) + + spark = build_spark_session() + + write_to_delta(spark, load_identifiers(data), str(base_output / "cdm_identifiers"), IDENTIFIER_SCHEMA) + write_to_delta(spark, load_names(data), str(base_output / "cdm_names"), NAME_SCHEMA) + write_to_delta(spark, load_feature_records(data), str(base_output / "cdm_features"), FEATURE_SCHEMA) + write_to_delta( + spark, + load_contig_collection_x_feature(data), + str(base_output / "cdm_contig_collection_x_feature"), + CONTIG_COLLECTION_X_FEATURE_SCHEMA, + ) + write_to_delta( + spark, + load_contig_collection_x_protein(data), + str(base_output / "cdm_contig_collection_x_protein"), + CONTIG_COLLECTION_X_PROTEIN_SCHEMA, + ) + write_to_delta( + spark, + load_feature_x_protein(data), + str(base_output / "cdm_feature_x_protein"), + FEATURE_X_PROTEIN_SCHEMA, + ) + write_to_delta(spark, load_contigs(data), str(base_output / "cdm_contigs"), CONTIG_SCHEMA) + write_to_delta( + spark, + load_contig_x_contig_collection(data), + str(base_output / "cdm_contig_x_contig_collection"), + CONTIG_X_CONTIG_COLLECTION_SCHEMA, + ) + + if args.query: + run_sql_query(spark, str(base_output)) + + spark.stop() + + +if __name__ == "__main__": + main() diff --git a/src/cdm_data_loader_utils/parsers/kbase_cdm_pyspark.py b/src/cdm_data_loader_utils/parsers/kbase_cdm_pyspark.py new file mode 100644 index 0000000..19be5e8 --- /dev/null +++ b/src/cdm_data_loader_utils/parsers/kbase_cdm_pyspark.py @@ -0,0 +1,610 @@ +"""Automated conversion of cdm_schema to PySpark.""" + +from pyspark.sql.types import BooleanType, DateType, FloatType, IntegerType, StringType, StructField, StructType + +schema = { + "Association": StructType( + [ + StructField("association_id", StringType(), nullable=False), + StructField("subject", StringType(), nullable=False), + StructField("object", StringType(), nullable=False), + StructField("predicate", StringType(), nullable=False), + StructField("negated", BooleanType(), nullable=True), + StructField("evidence_type", StringType(), nullable=True), + StructField("primary_knowledge_source", StringType(), nullable=True), + StructField("aggregator_knowledge_source", StringType(), nullable=True), + StructField("annotation_date", DateType(), nullable=True), + StructField("comments", StringType(), nullable=True), + ] + ), + "Association_x_SupportingObject": StructType( + [ + StructField("association_id", StringType(), nullable=False), + StructField("entity_id", StringType(), nullable=False), + ] + ), + "Cluster": StructType( + [ + StructField("cluster_id", StringType(), nullable=False), + StructField("description", StringType(), nullable=True), + StructField("name", StringType(), nullable=True), + StructField("entity_type", StringType(), nullable=False), + StructField("protocol_id", StringType(), nullable=True), + ] 
+ ), + "ClusterMember": StructType( + [ + StructField("cluster_id", StringType(), nullable=False), + StructField("entity_id", StringType(), nullable=False), + StructField("is_representative", BooleanType(), nullable=True), + StructField("is_seed", BooleanType(), nullable=True), + StructField("score", FloatType(), nullable=True), + ] + ), + "Contig": StructType( + [ + StructField("contig_id", StringType(), nullable=False), + StructField("hash", StringType(), nullable=True), + StructField("gc_content", FloatType(), nullable=True), + StructField("length", IntegerType(), nullable=True), + ] + ), + "ContigCollection": StructType( + [ + StructField("contig_collection_id", StringType(), nullable=False), + StructField("hash", StringType(), nullable=True), + StructField("asm_score", FloatType(), nullable=True), + StructField("checkm_completeness", FloatType(), nullable=True), + StructField("checkm_contamination", FloatType(), nullable=True), + StructField("checkm_version", StringType(), nullable=True), + StructField("contig_bp", IntegerType(), nullable=True), + StructField("contig_collection_type", StringType(), nullable=True), + StructField("contig_l50", IntegerType(), nullable=True), + StructField("contig_l90", IntegerType(), nullable=True), + StructField("contig_n50", IntegerType(), nullable=True), + StructField("contig_n90", IntegerType(), nullable=True), + StructField("contig_logsum", FloatType(), nullable=True), + StructField("contig_max", IntegerType(), nullable=True), + StructField("contig_powersum", FloatType(), nullable=True), + StructField("gap_percent", FloatType(), nullable=True), + StructField("gc_average", FloatType(), nullable=True), + StructField("gc_std", FloatType(), nullable=True), + StructField("gtdb_taxon_id", StringType(), nullable=True), + StructField("n_chromosomes", IntegerType(), nullable=True), + StructField("n_contigs", IntegerType(), nullable=True), + StructField("n_scaffolds", IntegerType(), nullable=True), + StructField("ncbi_taxon_id", StringType(), nullable=True), + StructField("scaffold_l50", IntegerType(), nullable=True), + StructField("scaffold_l90", IntegerType(), nullable=True), + StructField("scaffold_n50", IntegerType(), nullable=True), + StructField("scaffold_n90", IntegerType(), nullable=True), + StructField("scaffold_bp", IntegerType(), nullable=True), + StructField("scaffold_logsum", FloatType(), nullable=True), + StructField("scaffold_maximum_length", IntegerType(), nullable=True), + StructField("scaffold_powersum", FloatType(), nullable=True), + StructField("scaffolds_n_over_50K", IntegerType(), nullable=True), + StructField("scaffolds_percent_over_50K", FloatType(), nullable=True), + StructField("scaffolds_total_length_over_50k", IntegerType(), nullable=True), + ] + ), + "ContigCollection_x_EncodedFeature": StructType( + [ + StructField("contig_collection_id", StringType(), nullable=False), + StructField("encoded_feature_id", StringType(), nullable=False), + ] + ), + "ContigCollection_x_Feature": StructType( + [ + StructField("contig_collection_id", StringType(), nullable=False), + StructField("feature_id", StringType(), nullable=False), + ] + ), + "ContigCollection_x_Protein": StructType( + [ + StructField("contig_collection_id", StringType(), nullable=False), + StructField("protein_id", StringType(), nullable=False), + ] + ), + "Contig_x_ContigCollection": StructType( + [ + StructField("contig_id", StringType(), nullable=False), + StructField("contig_collection_id", StringType(), nullable=False), + ] + ), + "Contig_x_EncodedFeature": StructType( + [ 
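+            # The *_x_* tables are link tables: most are pure two-column joins of
+            # required foreign keys; a few (e.g. Contributor_x_DataSource) add a role column.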
+ StructField("contig_id", StringType(), nullable=False), + StructField("encoded_feature_id", StringType(), nullable=False), + ] + ), + "Contig_x_Feature": StructType( + [ + StructField("contig_id", StringType(), nullable=False), + StructField("feature_id", StringType(), nullable=False), + ] + ), + "Contig_x_Protein": StructType( + [ + StructField("contig_id", StringType(), nullable=False), + StructField("protein_id", StringType(), nullable=False), + ] + ), + "Contributor": StructType( + [ + StructField("contributor_id", StringType(), nullable=False), + StructField("contributor_type", StringType(), nullable=True), + StructField("name", StringType(), nullable=True), + StructField("given_name", StringType(), nullable=True), + StructField("family_name", StringType(), nullable=True), + ] + ), + "ContributorAffiliation": StructType( + [ + StructField("contributor_id", StringType(), nullable=False), + StructField("affiliation_id", StringType(), nullable=True), + ] + ), + "Contributor_x_DataSource": StructType( + [ + StructField("contributor_id", StringType(), nullable=False), + StructField("data_source_id", StringType(), nullable=False), + StructField("contributor_role", StringType(), nullable=True), + ] + ), + "Contributor_x_Role_x_Project": StructType( + [ + StructField("contributor_id", StringType(), nullable=False), + StructField("project_id", StringType(), nullable=False), + StructField("contributor_role", StringType(), nullable=True), + ] + ), + "ControlledTermValue": StructType( + [ + StructField("value_cv_label", StringType(), nullable=False), + StructField("raw_value", StringType(), nullable=True), + StructField("type", StringType(), nullable=True), + StructField("attribute_cv_id", StringType(), nullable=True), + StructField("attribute_cv_label", StringType(), nullable=True), + StructField("attribute_string", StringType(), nullable=True), + StructField("entity_id", StringType(), nullable=False), + ] + ), + "ControlledVocabularyTermValue": StructType( + [ + StructField("value_cv_label", StringType(), nullable=True), + StructField("value_cv_id", StringType(), nullable=True), + StructField("raw_value", StringType(), nullable=True), + StructField("type", StringType(), nullable=True), + StructField("attribute_cv_id", StringType(), nullable=True), + StructField("attribute_cv_label", StringType(), nullable=True), + StructField("attribute_string", StringType(), nullable=True), + StructField("entity_id", StringType(), nullable=False), + ] + ), + "DataSource": StructType( + [ + StructField("data_source_id", StringType(), nullable=False), + StructField("name", StringType(), nullable=True), + ] + ), + "DataSourceNew": StructType( + [ + StructField("data_source_id", StringType(), nullable=False), + StructField("name", StringType(), nullable=True), + StructField("comments", StringType(), nullable=True), + StructField("date_accessed", DateType(), nullable=False), + StructField("date_published", DateType(), nullable=True), + StructField("date_updated", DateType(), nullable=True), + StructField("license", StringType(), nullable=True), + StructField("publisher", StringType(), nullable=True), + StructField("resource_type", StringType(), nullable=False), + StructField("url", StringType(), nullable=True), + StructField("version", StringType(), nullable=True), + ] + ), + "DataSource_x_Description": StructType( + [ + StructField("data_source_id", StringType(), nullable=False), + StructField("resource_description_id", StringType(), nullable=False), + ] + ), + "DataSource_x_FundingReference": StructType( + [ + 
StructField("data_source_id", StringType(), nullable=False), + StructField("funding_reference_id", StringType(), nullable=False), + ] + ), + "DataSource_x_License": StructType( + [ + StructField("data_source_id", StringType(), nullable=False), + StructField("license_id", StringType(), nullable=False), + ] + ), + "DataSource_x_Title": StructType( + [ + StructField("data_source_id", StringType(), nullable=False), + StructField("resource_title_id", StringType(), nullable=False), + ] + ), + "DateTimeValue": StructType( + [ + StructField("date_time", DateType(), nullable=False), + StructField("raw_value", StringType(), nullable=True), + StructField("type", StringType(), nullable=True), + StructField("attribute_cv_id", StringType(), nullable=True), + StructField("attribute_cv_label", StringType(), nullable=True), + StructField("attribute_string", StringType(), nullable=True), + StructField("entity_id", StringType(), nullable=False), + ] + ), + "EncodedFeature": StructType( + [ + StructField("encoded_feature_id", StringType(), nullable=False), + StructField("hash", StringType(), nullable=True), + StructField("has_stop_codon", BooleanType(), nullable=True), + StructField("type", StringType(), nullable=True), + ] + ), + "EncodedFeature_x_Feature": StructType( + [ + StructField("encoded_feature_id", StringType(), nullable=False), + StructField("feature_id", StringType(), nullable=False), + ] + ), + "EncodedFeature_x_Protein": StructType( + [ + StructField("encoded_feature_id", StringType(), nullable=False), + StructField("protein_id", StringType(), nullable=False), + ] + ), + "EntailedEdge": StructType( + [ + StructField("subject", StringType(), nullable=True), + StructField("predicate", StringType(), nullable=True), + StructField("object", StringType(), nullable=True), + ] + ), + "Entity": StructType( + [ + StructField("entity_id", StringType(), nullable=False), + StructField("entity_type", StringType(), nullable=False), + StructField("data_source_id", StringType(), nullable=True), + StructField("data_source_entity_id", StringType(), nullable=True), + StructField("data_source_created", DateType(), nullable=False), + StructField("data_source_updated", DateType(), nullable=True), + StructField("created", DateType(), nullable=False), + StructField("updated", DateType(), nullable=False), + ] + ), + "Event": StructType( + [ + StructField("event_id", StringType(), nullable=False), + StructField("created_at", DateType(), nullable=True), + StructField("description", StringType(), nullable=True), + StructField("name", StringType(), nullable=True), + StructField("location", StringType(), nullable=True), + ] + ), + "Experiment": StructType( + [ + StructField("experiment_id", StringType(), nullable=False), + StructField("protocol_id", StringType(), nullable=False), + StructField("name", StringType(), nullable=True), + StructField("description", StringType(), nullable=True), + StructField("created_at", DateType(), nullable=True), + ] + ), + "ExperimentCondition": StructType( + [ + StructField("experiment_condition_id", StringType(), nullable=False), + StructField("experiment_id", StringType(), nullable=False), + StructField("variable_id", StringType(), nullable=False), + StructField("value", StringType(), nullable=True), + ] + ), + "ExperimentConditionSet": StructType( + [ + StructField("experiment_condition_set_id", StringType(), nullable=False), + StructField("experiment_condition_id", StringType(), nullable=False), + ] + ), + "Feature": StructType( + [ + StructField("feature_id", StringType(), 
nullable=False), + StructField("hash", StringType(), nullable=True), + StructField("cds_phase", StringType(), nullable=True), + StructField("e_value", FloatType(), nullable=True), + StructField("end", IntegerType(), nullable=True), + StructField("p_value", FloatType(), nullable=True), + StructField("start", IntegerType(), nullable=True), + StructField("strand", StringType(), nullable=True), + StructField("source_database", StringType(), nullable=True), + StructField("protocol_id", StringType(), nullable=True), + StructField("type", StringType(), nullable=True), + ] + ), + "Feature_x_Protein": StructType( + [ + StructField("feature_id", StringType(), nullable=False), + StructField("protein_id", StringType(), nullable=False), + ] + ), + "FundingReference": StructType( + [ + StructField("funding_reference_id", StringType(), nullable=False), + StructField("funder", StringType(), nullable=True), + StructField("grant_id", StringType(), nullable=True), + StructField("grant_title", StringType(), nullable=True), + StructField("grant_url", StringType(), nullable=True), + ] + ), + "Geolocation": StructType( + [ + StructField("latitude", FloatType(), nullable=False), + StructField("longitude", FloatType(), nullable=False), + StructField("raw_value", StringType(), nullable=True), + StructField("type", StringType(), nullable=True), + StructField("attribute_cv_id", StringType(), nullable=True), + StructField("attribute_cv_label", StringType(), nullable=True), + StructField("attribute_string", StringType(), nullable=True), + StructField("entity_id", StringType(), nullable=False), + ] + ), + "GoldEnvironmentalContext": StructType( + [ + StructField("gold_environmental_context_id", StringType(), nullable=False), + StructField("ecosystem", StringType(), nullable=True), + StructField("ecosystem_category", StringType(), nullable=True), + StructField("ecosystem_subtype", StringType(), nullable=True), + StructField("ecosystem_type", StringType(), nullable=True), + StructField("specific_ecosystem", StringType(), nullable=True), + ] + ), + "Identifier": StructType( + [ + StructField("entity_id", StringType(), nullable=False), + StructField("identifier", StringType(), nullable=False), + StructField("description", StringType(), nullable=True), + StructField("source", StringType(), nullable=True), + StructField("relationship", StringType(), nullable=True), + ] + ), + "License": StructType( + [ + StructField("license_id", StringType(), nullable=False), + StructField("id", StringType(), nullable=True), + StructField("name", StringType(), nullable=True), + StructField("url", StringType(), nullable=True), + ] + ), + "Measurement": StructType( + [ + StructField("measurement_id", StringType(), nullable=False), + StructField("measurement_set_id", StringType(), nullable=False), + StructField("experiment_condition_set_id", StringType(), nullable=False), + StructField("value", StringType(), nullable=True), + ] + ), + "MeasurementSet": StructType( + [ + StructField("measurement_set_id", StringType(), nullable=False), + StructField("variable_id", StringType(), nullable=False), + StructField("quality", StringType(), nullable=True), + StructField("created_at", DateType(), nullable=True), + ] + ), + "MixsEnvironmentalContext": StructType( + [ + StructField("mixs_environmental_context_id", StringType(), nullable=False), + StructField("env_broad_scale", StringType(), nullable=True), + StructField("env_local_scale", StringType(), nullable=True), + StructField("env_medium", StringType(), nullable=True), + ] + ), + "Name": StructType( + 
[ + StructField("entity_id", StringType(), nullable=False), + StructField("name", StringType(), nullable=False), + StructField("description", StringType(), nullable=True), + StructField("source", StringType(), nullable=True), + ] + ), + "OrderedProtocolStep": StructType( + [ + StructField("protocol_id", StringType(), nullable=False), + StructField("protocol_step_id", StringType(), nullable=False), + StructField("step_index", IntegerType(), nullable=False), + ] + ), + "Parameter": StructType( + [ + StructField("parameter_id", StringType(), nullable=False), + StructField("name", StringType(), nullable=True), + StructField("description", StringType(), nullable=True), + StructField("value_type", StringType(), nullable=True), + StructField("required", BooleanType(), nullable=True), + StructField("cardinality", StringType(), nullable=True), + StructField("default", StringType(), nullable=True), + StructField("parameter_type", StringType(), nullable=True), + ] + ), + "Prefix": StructType( + [ + StructField("prefix", StringType(), nullable=True), + StructField("base", StringType(), nullable=True), + ] + ), + "Project": StructType( + [ + StructField("project_id", StringType(), nullable=False), + StructField("description", StringType(), nullable=True), + ] + ), + "Protein": StructType( + [ + StructField("protein_id", StringType(), nullable=False), + StructField("hash", StringType(), nullable=True), + StructField("description", StringType(), nullable=True), + StructField("evidence_for_existence", StringType(), nullable=True), + StructField("length", IntegerType(), nullable=True), + StructField("sequence", StringType(), nullable=True), + ] + ), + "Protocol": StructType( + [ + StructField("protocol_id", StringType(), nullable=False), + StructField("name", StringType(), nullable=True), + StructField("description", StringType(), nullable=True), + StructField("doi", StringType(), nullable=True), + StructField("url", StringType(), nullable=True), + StructField("version", StringType(), nullable=True), + ] + ), + "ProtocolExecution": StructType( + [ + StructField("protocol_execution_id", StringType(), nullable=False), + StructField("protocol_id", StringType(), nullable=False), + StructField("name", StringType(), nullable=True), + StructField("description", StringType(), nullable=True), + StructField("created_at", DateType(), nullable=True), + ] + ), + "ProtocolInput": StructType( + [ + StructField("parameter_id", StringType(), nullable=False), + StructField("protocol_input_id", StringType(), nullable=False), + StructField("protocol_execution_id", StringType(), nullable=False), + StructField("value", StringType(), nullable=False), + ] + ), + "ProtocolInputSet": StructType( + [ + StructField("protocol_input_id", StringType(), nullable=False), + StructField("protocol_input_set_id", StringType(), nullable=False), + ] + ), + "ProtocolOutput": StructType( + [ + StructField("protocol_output_id", StringType(), nullable=False), + StructField("protocol_input_set_id", StringType(), nullable=False), + StructField("value", StringType(), nullable=False), + ] + ), + "ProtocolStep": StructType( + [ + StructField("protocol_step_id", StringType(), nullable=False), + StructField("step", StringType(), nullable=True), + ] + ), + "ProtocolVariable": StructType( + [ + StructField("protocol_id", StringType(), nullable=False), + StructField("variable_id", StringType(), nullable=False), + ] + ), + "Publication": StructType( + [ + StructField("publication_id", StringType(), nullable=False), + ] + ), + "QuantityRangeValue": StructType( 
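+        # Shares the raw_value/type/attribute_* columns of the other *Value tables,
+        # plus range-specific bounds. A hypothetical row for "1-10 mg/L" on CDM:123:
+        #   (10.0, 1.0, None, None, "mg/L", "1-10 mg/L", None, None, None, None, "CDM:123")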
+ [ + StructField("maximum_numeric_value", FloatType(), nullable=False), + StructField("minimum_numeric_value", FloatType(), nullable=False), + StructField("unit_cv_id", StringType(), nullable=True), + StructField("unit_cv_label", StringType(), nullable=True), + StructField("unit_string", StringType(), nullable=True), + StructField("raw_value", StringType(), nullable=True), + StructField("type", StringType(), nullable=True), + StructField("attribute_cv_id", StringType(), nullable=True), + StructField("attribute_cv_label", StringType(), nullable=True), + StructField("attribute_string", StringType(), nullable=True), + StructField("entity_id", StringType(), nullable=False), + ] + ), + "QuantityValue": StructType( + [ + StructField("numeric_value", FloatType(), nullable=False), + StructField("unit_cv_id", StringType(), nullable=True), + StructField("unit_cv_label", StringType(), nullable=True), + StructField("unit_string", StringType(), nullable=True), + StructField("raw_value", StringType(), nullable=True), + StructField("type", StringType(), nullable=True), + StructField("attribute_cv_id", StringType(), nullable=True), + StructField("attribute_cv_label", StringType(), nullable=True), + StructField("attribute_string", StringType(), nullable=True), + StructField("entity_id", StringType(), nullable=False), + ] + ), + "ResourceDescription": StructType( + [ + StructField("resource_description_id", StringType(), nullable=False), + StructField("description_text", StringType(), nullable=False), + StructField("description_type", StringType(), nullable=True), + StructField("language", StringType(), nullable=True), + ] + ), + "ResourceTitle": StructType( + [ + StructField("resource_title_id", StringType(), nullable=False), + StructField("language", StringType(), nullable=True), + StructField("title", StringType(), nullable=False), + StructField("title_type", StringType(), nullable=True), + ] + ), + "Sample": StructType( + [ + StructField("sample_id", StringType(), nullable=False), + StructField("description", StringType(), nullable=True), + StructField("type", StringType(), nullable=True), + ] + ), + "Sequence": StructType( + [ + StructField("sequence_id", StringType(), nullable=False), + StructField("entity_id", StringType(), nullable=False), + StructField("type", StringType(), nullable=True), + StructField("length", IntegerType(), nullable=True), + StructField("checksum", StringType(), nullable=True), + ] + ), + "Statement": StructType( + [ + StructField("subject", StringType(), nullable=True), + StructField("predicate", StringType(), nullable=True), + StructField("object", StringType(), nullable=True), + StructField("value", StringType(), nullable=True), + StructField("datatype", StringType(), nullable=True), + StructField("language", StringType(), nullable=True), + ] + ), + "TextValue": StructType( + [ + StructField("text_value", StringType(), nullable=False), + StructField("language", StringType(), nullable=True), + StructField("raw_value", StringType(), nullable=True), + StructField("type", StringType(), nullable=True), + StructField("attribute_cv_id", StringType(), nullable=True), + StructField("attribute_cv_label", StringType(), nullable=True), + StructField("attribute_string", StringType(), nullable=True), + StructField("entity_id", StringType(), nullable=False), + ] + ), + "Variable": StructType( + [ + StructField("variable_id", StringType(), nullable=False), + StructField("name", StringType(), nullable=True), + StructField("description", StringType(), nullable=True), + 
StructField("name_cv_id", StringType(), nullable=True), + StructField("unit", StringType(), nullable=True), + StructField("value_type", StringType(), nullable=False), + ] + ), + "VariableValue": StructType( + [ + StructField("variable_value_id", StringType(), nullable=False), + StructField("variable_id", StringType(), nullable=False), + StructField("value_type", StringType(), nullable=True), + ] + ), +} diff --git a/src/cdm_data_loader_utils/parsers/uniref.py b/src/cdm_data_loader_utils/parsers/uniref.py index 6f24bb3..6e1cdf3 100644 --- a/src/cdm_data_loader_utils/parsers/uniref.py +++ b/src/cdm_data_loader_utils/parsers/uniref.py @@ -41,19 +41,18 @@ import os import uuid import xml.etree.ElementTree as ET -from datetime import datetime +from datetime import UTC, datetime +from pathlib import Path from urllib.error import URLError -from datetime import timezone from urllib.request import urlretrieve + import click from delta import configure_spark_with_delta_pip from pyspark.sql import SparkSession from pyspark.sql.types import StringType, StructField, StructType -from pathlib import Path from cdm_data_loader_utils.parsers.xml_utils import get_text, parse_properties - logger = logging.getLogger(__name__) @@ -102,7 +101,7 @@ def get_timestamps( if not uniref_id: raise ValueError("get_timestamps: uniref_id must be a non-empty string") - now_dt = now or datetime.now(timezone.utc) + now_dt = now or datetime.now(UTC) updated_time = now_dt.isoformat(timespec="seconds") created_time = existing_created.get(uniref_id) or updated_time @@ -187,7 +186,6 @@ def get_accession_and_seed(dbref: ET.Element | None, ns: dict[str, str]) -> tupl """ Extract UniProtKB accession and is_seed status from a dbReference element. """ - if dbref is None: return None, False diff --git a/tests/parsers/refseq_importer/__init__.py b/tests/parsers/refseq_importer/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/parsers/refseq_importer/test_spark_delta.py b/tests/parsers/refseq_importer/test_spark_delta.py index b5cd9d0..0f5751e 100644 --- a/tests/parsers/refseq_importer/test_spark_delta.py +++ b/tests/parsers/refseq_importer/test_spark_delta.py @@ -114,14 +114,12 @@ def test_write_delta_contig_collection_schema(spark) -> None: db = "cdmdb" spark.sql(f"CREATE DATABASE IF NOT EXISTS {db}") - schema = StructType( - [ - StructField("collection_id", StringType(), True), - StructField("contig_collection_type", StringType(), True), - StructField("ncbi_taxon_id", StringType(), True), - StructField("gtdb_taxon_id", StringType(), True), - ] - ) + schema = StructType([ + StructField("collection_id", StringType(), True), + StructField("contig_collection_type", StringType(), True), + StructField("ncbi_taxon_id", StringType(), True), + StructField("gtdb_taxon_id", StringType(), True), + ]) df = spark.createDataFrame( [("C1", "isolate", "NCBITaxon:123", None)], diff --git a/tests/parsers/refseq_importer/test_tables_finalize.py b/tests/parsers/refseq_importer/test_tables_finalize.py index c71911c..d9151fd 100644 --- a/tests/parsers/refseq_importer/test_tables_finalize.py +++ b/tests/parsers/refseq_importer/test_tables_finalize.py @@ -20,12 +20,10 @@ def spark(): # ------------------------------------------------------------------- @pytest.mark.requires_spark def test_list_of_dicts_to_spark(spark) -> None: - schema = StructType( - [ - StructField("a", StringType(), True), - StructField("b", StringType(), True), - ] - ) + schema = StructType([ + StructField("a", StringType(), True), + StructField("b", 
StringType(), True), + ]) rows = [{"a": "1", "b": "x"}, {"a": "2", "b": "y"}] df = list_of_dicts_to_spark(spark, rows, schema) @@ -40,15 +38,13 @@ def test_list_of_dicts_to_spark(spark) -> None: @pytest.mark.requires_spark def test_finalize_tables_basic(spark) -> None: # ---------- entity ---------- - e_schema = StructType( - [ - StructField("entity_id", StringType(), True), - StructField("entity_type", StringType(), True), - StructField("data_source", StringType(), True), - StructField("created", StringType(), True), - StructField("updated", StringType(), True), - ] - ) + e_schema = StructType([ + StructField("entity_id", StringType(), True), + StructField("entity_type", StringType(), True), + StructField("data_source", StringType(), True), + StructField("created", StringType(), True), + StructField("updated", StringType(), True), + ]) e1 = spark.createDataFrame( [Row(entity_id="E1", entity_type="genome", data_source="RefSeq", created="2020", updated="2021")], @@ -60,14 +56,12 @@ def test_finalize_tables_basic(spark) -> None: ) # ---------- contig_collection (schema REQUIRED due to None!) ---------- - coll_schema = StructType( - [ - StructField("collection_id", StringType(), True), - StructField("contig_collection_type", StringType(), True), - StructField("ncbi_taxon_id", StringType(), True), - StructField("gtdb_taxon_id", StringType(), True), - ] - ) + coll_schema = StructType([ + StructField("collection_id", StringType(), True), + StructField("contig_collection_type", StringType(), True), + StructField("ncbi_taxon_id", StringType(), True), + StructField("gtdb_taxon_id", StringType(), True), + ]) c1 = spark.createDataFrame( [ diff --git a/tests/parsers/test_annotation_parse.py b/tests/parsers/test_annotation_parse.py new file mode 100644 index 0000000..35c9ffe --- /dev/null +++ b/tests/parsers/test_annotation_parse.py @@ -0,0 +1,710 @@ +import json +from pathlib import Path +import pytest + +from cdm_data_loader_utils.parsers.annotation_parse import ( + load_contig_collection_x_feature, + load_contig_collection_x_protein, + load_contig_x_contig_collection, + load_contigs, + load_feature_records, + load_feature_x_protein, + load_identifiers, + load_names, + apply_prefix, + to_int, +) + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + ( + { + "reports": [ + { + "annotation": { + "gene_id": "1234", + "name": "hypothetical protein", + "relationship": "RefSeq gene symbol", + } + } + ] + }, + [ + ( + "ncbigene:1234", + "1234", + "hypothetical protein", + "RefSeq", + "RefSeq gene symbol", + ) + ], + ), + ( + {"reports": [{"annotation": {"gene_id": "5678", "name": "some protein"}}]}, + [("ncbigene:5678", "5678", "some protein", "RefSeq", None)], + ), + ( + { + "reports": [ + { + "annotation": { + "name": "no gene id here", + "relationship": "RefSeq locus tag", + } + } + ] + }, + [], + ), + ( + { + "reports": [ + { + "annotation": { + "gene_id": "1001", + "name": "abc", + "relationship": "RefSeq gene symbol", + } + }, + {"annotation": {"gene_id": "1002", "name": "xyz"}}, + ] + }, + [ + ("ncbigene:1001", "1001", "abc", "RefSeq", "RefSeq gene symbol"), + ("ncbigene:1002", "1002", "xyz", "RefSeq", None), + ], + ), + ], +) +def test_load_identifiers(input_data, expected_output): + result = load_identifiers(input_data) + assert result == expected_output + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + # Case 1: all name fields present + ( + { + "reports": [ + { + "annotation": { + "gene_id": "1234", + "symbol": "abc", + "name": "ABC protein", + "locus_tag": 
"LTG_1234", + } + } + ] + }, + [ + ("ncbigene:1234", "abc", "RefSeq gene symbol", "RefSeq"), + ("ncbigene:1234", "ABC protein", "RefSeq gene name", "RefSeq"), + ("ncbigene:1234", "LTG_1234", "RefSeq locus tag", "RefSeq"), + ], + ), + # Case 2: only gene_name present + ( + {"reports": [{"annotation": {"gene_id": "5678", "name": "Hypothetical protein"}}]}, + [ + ( + "ncbigene:5678", + "Hypothetical protein", + "RefSeq gene name", + "RefSeq", + ) + ], + ), + # Case 3: no gene_id + ( + {"reports": [{"annotation": {"name": "Unnamed", "symbol": "XYZ"}}]}, + [], + ), + # Case 4: only locus_tag present + ( + {"reports": [{"annotation": {"gene_id": "8888", "locus_tag": "LTG_8888"}}]}, + [("ncbigene:8888", "LTG_8888", "RefSeq locus tag", "RefSeq")], + ), + # Case 5: multiple reports + ( + { + "reports": [ + {"annotation": {"gene_id": "1001", "symbol": "DEF"}}, + {"annotation": {"gene_id": "1002", "name": "DEF protein"}}, + ] + }, + [ + ("ncbigene:1001", "DEF", "RefSeq gene symbol", "RefSeq"), + ("ncbigene:1002", "DEF protein", "RefSeq gene name", "RefSeq"), + ], + ), + ], +) +def test_load_names(input_data, expected_output): + result = load_names(input_data) + assert sorted(result) == sorted(expected_output) + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + # Case 1: basic valid input with plus strand + ( + { + "reports": [ + { + "annotation": { + "gene_id": "1234", + "genomic_regions": [ + { + "gene_range": { + "range": [ + { + "begin": "100", + "end": "200", + "orientation": "plus", + } + ] + } + } + ], + } + } + ] + }, + [ + ( + "ncbigene:1234", + None, + None, + None, + 200, + None, + 100, + "positive", + "RefSeq", + None, + "gene", + ) + ], + ), + # Case 2: multiple ranges, different strands + ( + { + "reports": [ + { + "annotation": { + "gene_id": "5678", + "genomic_regions": [ + { + "gene_range": { + "range": [ + { + "begin": "300", + "end": "500", + "orientation": "minus", + }, + { + "begin": "600", + "end": "800", + "orientation": "plus", + }, + ] + } + } + ], + } + } + ] + }, + [ + ( + "ncbigene:5678", + None, + None, + None, + 500, + None, + 300, + "negative", + "RefSeq", + None, + "gene", + ), + ( + "ncbigene:5678", + None, + None, + None, + 800, + None, + 600, + "positive", + "RefSeq", + None, + "gene", + ), + ], + ), + # Case 3: missing orientation + ( + { + "reports": [ + { + "annotation": { + "gene_id": "9999", + "genomic_regions": [{"gene_range": {"range": [{"begin": "1", "end": "2"}]}}], + } + } + ] + }, + [ + ( + "ncbigene:9999", + None, + None, + None, + 2, + None, + 1, + "unknown", + "RefSeq", + None, + "gene", + ) + ], + ), + # Case 4: no gene_id + ( + { + "reports": [ + { + "annotation": { + "genomic_regions": [ + { + "gene_range": { + "range": [ + { + "begin": "100", + "end": "200", + "orientation": "plus", + } + ] + } + } + ] + } + } + ] + }, + [], + ), + # Case 5: non-integer start/end + ( + { + "reports": [ + { + "annotation": { + "gene_id": "1111", + "genomic_regions": [ + { + "gene_range": { + "range": [ + { + "begin": "abc", + "end": "xyz", + "orientation": "plus", + } + ] + } + } + ], + } + } + ] + }, + [ + ( + "ncbigene:1111", + None, + None, + None, + None, + None, + None, + "positive", + "RefSeq", + None, + "gene", + ) + ], + ), + ], +) +def test_load_feature_records(input_data, expected_output): + result = load_feature_records(input_data) + assert sorted(result) == sorted(expected_output) + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + # Case 1: valid mapping + ( + { + "reports": [ + { + "annotation": { + "gene_id": 
"12345", + "genomic_regions": [{"gene_range": {"accession_version": "NC_000001.11"}}], + } + } + ] + }, + [("refseq:NC_000001.11", "ncbigene:12345")], + ), + # Case 2: no gene_id + ( + {"reports": [{"annotation": {"genomic_regions": [{"gene_range": {"accession_version": "NC_000002.11"}}]}}]}, + [], + ), + # Case 3: no genomic_regions + ( + {"reports": [{"annotation": {"gene_id": "67890"}}]}, + [], + ), + # Case 4: empty genomic_regions list + ( + {"reports": [{"annotation": {"gene_id": "99999", "genomic_regions": []}}]}, + [], + ), + # Case 5: missing accession_version + ( + { + "reports": [ + { + "annotation": { + "gene_id": "13579", + "genomic_regions": [{"gene_range": {}}], + } + } + ] + }, + [], + ), + ], +) +def test_load_contig_collection_x_feature(input_data, expected_output): + result = load_contig_collection_x_feature(input_data) + assert result == expected_output + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + # Case 1: Valid report with multiple proteins + ( + { + "reports": [ + { + "annotation": { + "proteins": [ + {"accession_version": "XP_123"}, + {"accession_version": "XP_456"}, + ], + "annotations": [{"assembly_accession": "GCF_000001"}], + } + } + ] + }, + [ + ("insdc.gcf:GCF_000001", "refseq:XP_123"), + ("insdc.gcf:GCF_000001", "refseq:XP_456"), + ], + ), + # Case 2: No proteins + ( + { + "reports": [ + { + "annotation": { + "proteins": [], + "annotations": [{"assembly_accession": "GCF_000002"}], + } + } + ] + }, + [], + ), + # Case 3: No annotations + ( + {"reports": [{"annotation": {"proteins": [{"accession_version": "XP_789"}]}}]}, + [], + ), + # Case 4: Missing assembly_accession + ( + { + "reports": [ + { + "annotation": { + "proteins": [{"accession_version": "XP_789"}], + "annotations": [{}], + } + } + ] + }, + [], + ), + # Case 5: Some proteins missing accession_version + ( + { + "reports": [ + { + "annotation": { + "proteins": [ + {"accession_version": "XP_111"}, + {}, + {"accession_version": "XP_222"}, + ], + "annotations": [{"assembly_accession": "GCF_000003"}], + } + } + ] + }, + [ + ("insdc.gcf:GCF_000003", "refseq:XP_111"), + ("insdc.gcf:GCF_000003", "refseq:XP_222"), + ], + ), + ], +) +def test_load_contig_collection_x_protein(input_data, expected_output): + result = load_contig_collection_x_protein(input_data) + assert sorted(result) == sorted(expected_output) + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + # Case 1: valid gene with multiple proteins + ( + { + "reports": [ + { + "annotation": { + "gene_id": "4156311", + "proteins": [ + {"accession_version": "XP_001"}, + {"accession_version": "XP_002"}, + ], + } + } + ] + }, + [ + ("ncbigene:4156311", "refseq:XP_001"), + ("ncbigene:4156311", "refseq:XP_002"), + ], + ), + # Case 2: no gene_id + ( + {"reports": [{"annotation": {"proteins": [{"accession_version": "XP_999"}]}}]}, + [], + ), + # Case 3: gene with no proteins + ( + {"reports": [{"annotation": {"gene_id": "4156312"}}]}, + [], + ), + # Case 4: some proteins missing accession_version + ( + { + "reports": [ + { + "annotation": { + "gene_id": "4156313", + "proteins": [ + {"accession_version": "XP_777"}, + {}, + {"accession_version": "XP_888"}, + ], + } + } + ] + }, + [ + ("ncbigene:4156313", "refseq:XP_777"), + ("ncbigene:4156313", "refseq:XP_888"), + ], + ), + # Case 5: empty report list + ({"reports": []}, []), + ], +) +def test_load_feature_x_protein(input_data, expected_output): + result = load_feature_x_protein(input_data) + assert sorted(result) == sorted(expected_output) + + +@pytest.mark.parametrize( 
+ "input_data, expected_output", + [ + # Case 1: Valid contig and assembly + ( + { + "reports": [ + { + "annotation": { + "genomic_regions": [{"gene_range": {"accession_version": "NC_000001.11"}}], + "annotations": [{"assembly_accession": "GCF_000001.1"}], + } + } + ] + }, + [("refseq:NC_000001.11", "insdc.gcf:GCF_000001.1")], + ), + # Case 2: Missing genomic_regions + ( + {"reports": [{"annotation": {"annotations": [{"assembly_accession": "GCF_000002.1"}]}}]}, + [], + ), + # Case 3: Missing annotations + ( + {"reports": [{"annotation": {"genomic_regions": [{"gene_range": {"accession_version": "NC_000003.11"}}]}}]}, + [], + ), + # Case 4: Missing accession_version in region + ( + { + "reports": [ + { + "annotation": { + "genomic_regions": [{"gene_range": {}}], + "annotations": [{"assembly_accession": "GCF_000004.1"}], + } + } + ] + }, + [], + ), + # Case 5: Missing assembly_accession in annotations + ( + { + "reports": [ + { + "annotation": { + "genomic_regions": [{"gene_range": {"accession_version": "NC_000005.11"}}], + "annotations": [{}], + } + } + ] + }, + [], + ), + # Case 6: Multiple reports, one valid + ( + { + "reports": [ + { + "annotation": { + "genomic_regions": [{"gene_range": {"accession_version": "NC_000006.11"}}], + "annotations": [{"assembly_accession": "GCF_000006.1"}], + } + }, + { + "annotation": { + "genomic_regions": [{"gene_range": {"accession_version": "NC_000007.11"}}], + "annotations": [{}], + } + }, + ] + }, + [("refseq:NC_000006.11", "insdc.gcf:GCF_000006.1")], + ), + ], +) +def test_load_contig_x_contig_collection(input_data, expected_output): + result = load_contig_x_contig_collection(input_data) + assert sorted(result) == sorted(expected_output) + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + # Case 1: Valid contig with accession_version + ( + {"reports": [{"annotation": {"genomic_regions": [{"gene_range": {"accession_version": "NC_000001.11"}}]}}]}, + [("refseq:NC_000001.11", None, None, None)], + ), + # Case 2: Multiple contigs, different accession_versions + ( + { + "reports": [ + { + "annotation": { + "genomic_regions": [ + {"gene_range": {"accession_version": "NC_000001.11"}}, + {"gene_range": {"accession_version": "NC_000002.12"}}, + ] + } + } + ] + }, + [ + ("refseq:NC_000001.11", None, None, None), + ("refseq:NC_000002.12", None, None, None), + ], + ), + # Case 3: Duplicate accession versions + ( + { + "reports": [ + { + "annotation": { + "genomic_regions": [ + {"gene_range": {"accession_version": "NC_000003.13"}}, + {"gene_range": {"accession_version": "NC_000003.13"}}, + ] + } + } + ] + }, + [("refseq:NC_000003.13", None, None, None)], + ), + # Case 4: Missing accession_version + ( + {"reports": [{"annotation": {"genomic_regions": [{"gene_range": {}}]}}]}, + [], + ), + # Case 5: Empty reports + ( + {"reports": []}, + [], + ), + ], +) +def test_load_contigs(input_data, expected_output): + result = load_contigs(input_data) + assert sorted(result) == sorted(expected_output) + + +### add new test: to_int +@pytest.mark.parametrize( + "input_id, expected", + [ + ("GeneID:123", "ncbigene:123"), + ("YP_009725307.1", "refseq:YP_009725307.1"), + ("GCF_000001405.39", "insdc.gcf:GCF_000001405.39"), + ("random", "random"), + ], +) +def test_apply_prefix(input_id, expected): + assert apply_prefix(input_id) == expected + + +@pytest.mark.parametrize("val, expected", [("123", 123), ("abc", None), ("", None)]) +def test_to_int(val, expected): + assert to_int(val) == expected diff --git a/tests/parsers/test_uniprot.py 
b/tests/parsers/test_uniprot.py index 86ffca3..105fb76 100644 --- a/tests/parsers/test_uniprot.py +++ b/tests/parsers/test_uniprot.py @@ -1,4 +1,4 @@ -""" +"""Tests for the UniProt parser. This file uses pytest to provide parameterized and functional tests for all major UniProt parsing utility functions, ensuring correct parsing and transformation of @@ -16,729 +16,756 @@ - parse_uniprot_entry: Full record parsing, all fields together How to run in the terminal: - pytest tests/uniprot_refactor/test_uniprot_parsers.py + PYTHONPATH=src pytest tests/test_uniprot.py """ import datetime -import json +import re import xml.etree.ElementTree as ET -from pathlib import Path +from typing import Any import pytest from cdm_data_loader_utils.parsers.uniprot import ( build_datasource_record, + generate_cdm_id, parse_associations, - parse_cross_references, parse_evidence_map, parse_identifiers, parse_names, parse_protein_info, - save_datasource_record, + parse_publications, + parse_uniprot_entry, ) -NS_URI = "https://uniprot.org/uniprot" - +# Regular expression to validate UUID format +UUID_PATTERN = re.compile(r"^[a-f0-9]{8}-[a-f0-9]{4}-[1-5][a-f0-9]{3}-[89ab][a-f0-9]{3}-[a-f0-9]{12}$", re.IGNORECASE) -@pytest.fixture( - params=[ - "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz", - "http://example.org/uniprot_test.xml.gz", - ] -) -def xml_url(request): - return request.param +@pytest.mark.parametrize("n", range(5)) +def test_generate_cdm_id_format(n: int) -> None: + uuid = generate_cdm_id() + assert uuid.startswith("CDM:") + uuid_str = uuid[4:] + assert UUID_PATTERN.match(uuid_str), f"{uuid_str} is not a valid UUID" -def test_build_datasource_record(xml_url): - record = build_datasource_record(xml_url) - # ---- basic structure ---- +## build_datasource_record ## +def test_build_datasource_record() -> None: + url = "https://example.com/uniprot.xml.gz" + record = build_datasource_record(url) assert isinstance(record, dict) - - # ---- fixed fields ---- + assert set(record.keys()) == {"name", "source", "url", "accessed", "version"} assert record["name"] == "UniProt import" assert record["source"] == "UniProt" - assert record["url"] == xml_url + assert record["url"] == url + + # check accessed + accessed_dt = datetime.datetime.fromisoformat(record["accessed"]) + now = datetime.datetime.now(datetime.UTC) + delta = abs((now - accessed_dt).total_seconds()) + assert delta < 10 assert record["version"] == 115 - # ---- accessed field ---- - accessed = record.get("accessed") - assert accessed is not None - parsed = datetime.datetime.fromisoformat(accessed) - assert parsed.tzinfo is not None - assert parsed.tzinfo == datetime.UTC +@pytest.mark.parametrize("bad_url", [None, ""]) +def test_build_datasource_record_bad(bad_url: str | None) -> None: + record = build_datasource_record(bad_url) + assert record["url"] == bad_url -def test_save_datasource_record(tmp_path: Path, xml_url): - """ - save_datasource_record should: - - create output directory if missing - - write datasource.json - - return the same content that is written to disk +## parse_identifiers function test ## +@pytest.mark.parametrize( + ("xml_str", "cdm_id", "expected"), + [ + ### multiple accessions, expect two dict, every dic use the same cdm_id + ### identifier according to number + ( + """ + + Q9V2L2 + G8ZFP4 + + """, + "CDM:001", + [ + { + "entity_id": "CDM:001", + "identifier": "UniProt:Q9V2L2", + "source": "UniProt", + "description": "UniProt accession", + }, + { + "entity_id": "CDM:001", 
+ "identifier": "UniProt:G8ZFP4", + "source": "UniProt", + "description": "UniProt accession", + }, + ], + ), + ### Use single accession + ( + """ + + X00001 + + """, + "CDM:002", + [ + { + "entity_id": "CDM:002", + "identifier": "UniProt:X00001", + "source": "UniProt", + "description": "UniProt accession", + } + ], + ), + ### No accession + ( + """ + + + """, + "CDM:003", + [], + ), + ], +) +def test_parse_identifiers(xml_str: str, cdm_id: str, expected: list[dict[str, str]]) -> None: """ - output_dir = tmp_path / "output" - - # ---- call function ---- - result = save_datasource_record(xml_url, str(output_dir)) + This approach ensures that parse_identifiers correctly parses and structures identifier data. - # ---- return value sanity ---- - assert isinstance(result, dict) - assert result["url"] == xml_url - assert result["source"] == "UniProt" - assert result["name"] == "UniProt import" - assert "accessed" in result - assert "version" in result + The parsed Element object and the provided CDM_id are passed to the parse_identifiers funtion. + The function is expected to extract all relevant identifier information from the XML and return list of dict. - # ---- file existence ---- - output_file = output_dir / "datasource.json" - assert output_file.exists() - assert output_file.is_file() - - # ---- file content correctness ---- - with open(output_file, encoding="utf-8") as f: - on_disk = json.load(f) - - assert on_disk == result + The test compares the result output with the predefined expected result using an assert statement. + """ + entry = ET.fromstring(xml_str) + result = parse_identifiers(entry, cdm_id) + assert result == expected -def make_entry(names=None, protein_names=None): - entry = ET.Element(f"{{{NS_URI}}}entry") - # - for n in names or []: - e = ET.SubElement(entry, f"{{{NS_URI}}}name") - e.text = n +""" + This parameterized pytest function tests the correctness of the parse_names function for various UniProt XML entry scenarios. 
-    # <protein> block
-    if protein_names:
-        protein = ET.SubElement(entry, f"{{{NS_URI}}}protein")
+    xml_str: XML string representing a UniProt entry with different protein names:
+        a top-level <name>,
+        recommended names,
+        alternative names,
+        combinations,
+        no names

-        for tag, logical in [
-            ("recommendedName", "recommended"),
-            ("alternativeName", "alternative"),
-        ]:
-            if logical not in protein_names:
-                continue
+    cdm_id: CDM entry ID

-            block = ET.SubElement(protein, f"{{{NS_URI}}}{tag}")
-            for xml_tag in ["fullName", "shortName"]:
-                val = protein_names[logical].get(xml_tag.replace("Name", ""))
-                if val:
-                    e = ET.SubElement(block, f"{{{NS_URI}}}{xml_tag}")
-                    e.text = val
+    Output:
+    A list of name records with their metadata

-    return entry
+"""

+## parse_names function test ##
@pytest.mark.parametrize(
-    "entry_kwargs, cdm_id, expected",
+    ("xml_str", "cdm_id", "expected"),
    [
-        # Only <name>
+        # Only a top-level <name>
        (
-            {"names": ["ProteinA"]},
-            "cdm_1",
-            {
-                ("ProteinA", "UniProt entry name"),
-            },
+            """
+            <entry xmlns="https://uniprot.org/uniprot">
+                <name>MainProteinName</name>
+            </entry>
+            """,
+            "CDM:001",
+            [
+                {
+                    "entity_id": "CDM:001",
+                    "name": "MainProteinName",
+                    "description": "UniProt protein name",
+                    "source": "UniProt",
+                }
+            ],
        ),
-        # entry name + recommended full name
+        # RecommendedName (fullName and shortName)
        (
-            {
-                "names": ["ProteinB"],
-                "protein_names": {
-                    "recommended": {"full": "Rec Full B", "short": None},
+            """
+            <entry xmlns="https://uniprot.org/uniprot">
+                <protein>
+                    <recommendedName>
+                        <fullName>RecFullName</fullName>
+                        <shortName>RecShort</shortName>
+                    </recommendedName>
+                </protein>
+            </entry>
+            """,
+            "CDM:002",
+            [
+                {
+                    "entity_id": "CDM:002",
+                    "name": "RecFullName",
+                    "description": "UniProt recommended full name",
+                    "source": "UniProt",
                },
-            },
-            "cdm_2",
-            {
-                ("ProteinB", "UniProt entry name"),
-                ("Rec Full B", "UniProt recommended full name"),
-            },
+                {
+                    "entity_id": "CDM:002",
+                    "name": "RecShort",
+                    "description": "UniProt recommended short name",
+                    "source": "UniProt",
+                },
+            ],
        ),
-        # everything
+        # AlternativeName (fullName and shortName)
        (
-            {
-                "names": ["ProteinC"],
-                "protein_names": {
-                    "recommended": {"full": "Rec Full C", "short": "Rec Short C"},
-                    "alternative": {"full": "Alt Full C", "short": "Alt Short C"},
+            """
+            <entry xmlns="https://uniprot.org/uniprot">
+                <protein>
+                    <alternativeName>
+                        <fullName>AltFullName1</fullName>
+                        <shortName>AltShort1</shortName>
+                    </alternativeName>
+                    <alternativeName>
+                        <fullName>AltFullName2</fullName>
+                    </alternativeName>
+                </protein>
+            </entry>
+            """,
+            "CDM:003",
+            [
+                {
+                    "entity_id": "CDM:003",
+                    "name": "AltFullName1",
+                    "description": "UniProt alternative full name",
+                    "source": "UniProt",
                },
-            },
-            "cdm_3",
-            {
-                ("ProteinC", "UniProt entry name"),
-                ("Rec Full C", "UniProt recommended full name"),
-                ("Rec Short C", "UniProt recommended short name"),
-                ("Alt Full C", "UniProt alternative full name"),
-                ("Alt Short C", "UniProt alternative short name"),
-            },
+                {
+                    "entity_id": "CDM:003",
+                    "name": "AltShort1",
+                    "description": "UniProt alternative short name",
+                    "source": "UniProt",
+                },
+                {
+                    "entity_id": "CDM:003",
+                    "name": "AltFullName2",
+                    "description": "UniProt alternative full name",
+                    "source": "UniProt",
+                },
+            ],
+        ),
+        # Mixed: top-level <name> and <recommendedName>
+        (
+            """
+            <entry xmlns="https://uniprot.org/uniprot">
+                <name>TopLevel</name>
+                <protein>
+                    <recommendedName>
+                        <fullName>MixedFull</fullName>
+                    </recommendedName>
+                </protein>
+            </entry>
+            """,
+            "CDM:004",
+            [
+                {
+                    "entity_id": "CDM:004",
+                    "name": "TopLevel",
+                    "description": "UniProt protein name",
+                    "source": "UniProt",
+                },
+                {
+                    "entity_id": "CDM:004",
+                    "name": "MixedFull",
+                    "description": "UniProt recommended full name",
+                    "source": "UniProt",
+                },
+            ],
+        ),
+        # No names at all
+        (
+            """
+            <entry xmlns="https://uniprot.org/uniprot">
+            </entry>
+            """,
+            "CDM:005",
+            [],
        ),
    ],
)
-def test_parse_names_parametrized(entry_kwargs, cdm_id, expected):
-    entry = make_entry(**entry_kwargs)
+def test_parse_names(xml_str: str, cdm_id: str, expected: list[dict[str, str]]) -> None:
+    entry = ET.fromstring(xml_str)
+    result = parse_names(entry, cdm_id)
+    assert result == expected
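For orientation, the identifier and name records asserted above share one flat shape, keyed by the entry's CDM id. Below is a minimal sketch of the behaviour these tests pin down, written as a hypothetical re-implementation for illustration only (the real functions live in cdm_data_loader_utils.parsers.uniprot; the namespace is assumed from this file's fixtures):

import xml.etree.ElementTree as ET

NS = {"up": "https://uniprot.org/uniprot"}  # assumed namespace, matching the fixtures above

def parse_identifiers_sketch(entry: ET.Element, cdm_id: str) -> list[dict[str, str]]:
    # One record per <accession> element, all sharing the entry's CDM id.
    return [
        {
            "entity_id": cdm_id,
            "identifier": f"UniProt:{acc.text}",
            "source": "UniProt",
            "description": "UniProt accession",
        }
        for acc in entry.findall("up:accession", NS)
    ]

parse_names follows the same pattern but walks the top-level <name> plus the recommendedName/alternativeName blocks under <protein>, varying only the description string.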
# ---- row count ---- - assert len(rows) == len(expected) +""" + + This test ensures parse_protein_info works correctly for different combinations of data + Including cases with no protein info, sequence only, existence only or EC numbers + + This approach thoroughly validates that parse_protein_info can accurately extract, combine and structure metadata field. - # ---- content ---- - observed = {(r["name"], r["description"]) for r in rows} - assert observed == expected + Include: + EC Number, + existence evidence, + sequence - # ---- entity_id and source ---- - for r in rows: - assert r["entity_id"] == cdm_id - assert r["source"] == "UniProt" +""" +## parse_protein_info function test ## @pytest.mark.parametrize( - "build_entry, cdm_id, expected", + ("xml_str", "cdm_id", "expected"), [ - # -------------------------------------------------- - # Empty entry -> None - # -------------------------------------------------- + # There are multiple ecNumbers under the recommend names ( - lambda: ET.Element(f"{{{NS_URI}}}entry"), - "cdm_1", - None, + """ + + + 1.2.3.4 + 5.6.7.8 + + + """, + "CDM:001", + {"ec_numbers": ["1.2.3.4", "5.6.7.8"]}, + ), + # alternativeName has EC Number + ( + """ + + + 3.3.3.3 + + + """, + "CDM:002", + {"ec_numbers": ["3.3.3.3"]}, ), - # -------------------------------------------------- - # Only EC numbers - # -------------------------------------------------- + # If have both proteinExistence evidence and existence ( - lambda: ( - lambda entry: ( - ET.SubElement( - ET.SubElement( - ET.SubElement(entry, f"{{{NS_URI}}}protein"), - f"{{{NS_URI}}}recommendedName", - ), - f"{{{NS_URI}}}ecNumber", - ).__setattr__("text", "1.1.1.1"), - entry, - )[1] - )(ET.Element(f"{{{NS_URI}}}entry")), - "cdm_2", + """ + + """, + "CDM:003", { - "ec_numbers": "1.1.1.1", + "protein_id": "CDM:003", + "evidence_for_existence": "evidence at protein level", }, ), - # -------------------------------------------------- - # Only sequence + entry modified - # -------------------------------------------------- + # Sequence only ( - lambda: ( - lambda entry: ( - entry.set("modified", "2024-01-01"), - ET.SubElement( - entry, - f"{{{NS_URI}}}sequence", - { - "length": "100", - "mass": "12345", - "checksum": "ABC", - "version": "2", - }, - ).__setattr__("text", "MKTIIALSY"), - entry, - )[2] - )(ET.Element(f"{{{NS_URI}}}entry")), - "cdm_3", + """ + + MAGNLSKVAAVSGVAAAVLGK + + """, + "CDM:004", { - "length": "100", + "length": "357", "mass": "12345", - "checksum": "ABC", + "checksum": "ABCD", + "modified": "2024-05-21", "sequence_version": "2", - "sequence": "MKTIIALSY", - "entry_modified": "2024-01-01", + "sequence": "MAGNLSKVAAVSGVAAAVLGK", }, ), - # -------------------------------------------------- - # Everything - # -------------------------------------------------- + # Combine with three elements: proteinExistence, sequence and ecNumbers ( - lambda: ( - lambda entry: ( - entry.set("modified", "2024-02-02"), - # protein + EC - ET.SubElement( - ET.SubElement( - ET.SubElement(entry, f"{{{NS_URI}}}protein"), - f"{{{NS_URI}}}recommendedName", - ), - f"{{{NS_URI}}}ecNumber", - ).__setattr__("text", "3.5.4.4"), - # proteinExistence - ET.SubElement( - entry, - f"{{{NS_URI}}}proteinExistence", - {"type": "evidence at protein level"}, - ), - # sequence - ET.SubElement( - entry, - f"{{{NS_URI}}}sequence", - { - "length": "250", - "mass": "99999", - "checksum": "XYZ", - "modified": "2023-12-01", - "version": "1", - }, - ).__setattr__("text", "MADEUPSEQUENCE"), - entry, - )[4] - )(ET.Element(f"{{{NS_URI}}}entry")), 
- "cdm_4", + """ + + + 3.3.3.3 + + + 8.8.8.8 + + + + + MKTLLTGAAT + + """, + "CDM:005", { - "ec_numbers": "3.5.4.4", - "protein_id": "cdm_4", - "evidence_for_existence": "evidence at protein level", - "length": "250", - "mass": "99999", + "ec_numbers": ["3.3.3.3", "8.8.8.8"], + "protein_id": "CDM:005", + "evidence_for_existence": "evidence at transcript level", + "length": "10", + "mass": "1000", "checksum": "XYZ", - "modified": "2023-12-01", + "modified": "2021-12-01", "sequence_version": "1", - "sequence": "MADEUPSEQUENCE", - "entry_modified": "2024-02-02", + "sequence": "MKTLLTGAAT", }, ), + # return None + ("""""", "CDM:006", None), ], ) -def test_parse_protein_info(build_entry, cdm_id, expected): - entry = build_entry() - +def test_parse_protein_info(xml_str: str, cdm_id: str, expected: dict[str, Any]) -> None: + entry = ET.fromstring(xml_str) result = parse_protein_info(entry, cdm_id) + assert result == expected + - if expected is None: - assert result is None - else: - assert isinstance(result, dict) - assert result == expected +""" + + This parameterized pytest function verifies the behavior of the parse_evidence_map function + for different UniProt XML entry structures involving evidence elements. + + xml_str: Simulates a UniProt entry with various and sub-structures, + including cases with multiple evidence elements, missing sources, or no evidence at all. + + expected: A dictionary mapping evidence keys to their extracted details—such as evidence type, + supporting objects, and publication references. + + Ensure parse_evidence_map: + Accurately extract evidence keys and types + Correctly classify supporting objects and publication references + Handle entries with absent sources or evidence elements + Represent all relevant evidence metadata in the required structure + +""" +## parse_evidence_map function test ## @pytest.mark.parametrize( - "build_xml, expected", + ("xml_str", "expected"), [ - # -------------------------------------------------- - # No evidence elements - # -------------------------------------------------- + # Single evidence,include PubMed and supporting object ( - lambda: ET.Element(f"{{{NS_URI}}}entry"), - {}, + """ + + + + + + + """, + { + "1": { + "evidence_type": "ECO:0000255", + "supporting_objects": ["Ensembl:ENSG00001"], + "publications": ["PMID:123456"], + } + }, ), - # -------------------------------------------------- - # Evidence without key - # -------------------------------------------------- + # multiple evidences ( - lambda: ( - lambda entry: ( - ET.SubElement(entry, f"{{{NS_URI}}}evidence", {"type": "ECO:0000269"}), - entry, - )[1] - )(ET.Element(f"{{{NS_URI}}}entry")), - {}, + """ + + + + + + + + + + + """, + { + "E1": { + "evidence_type": "ECO:0000313", + "supporting_objects": None, + "publications": ["PMID:654321"], + }, + "E2": { + "evidence_type": "ECO:0000250", + "supporting_objects": ["PDB:2N7Q"], + "publications": None, + }, + }, ), - # -------------------------------------------------- - # Evidence with key, no source - # -------------------------------------------------- + # no source ( - lambda: ( - lambda entry: ( - ET.SubElement( - entry, - f"{{{NS_URI}}}evidence", - {"key": "1", "type": "ECO:0000313"}, - ), - entry, - )[1] - )(ET.Element(f"{{{NS_URI}}}entry")), + """ + + """, { - "1": { - "evidence_type": "ECO:0000313", + "X1": { + "evidence_type": "ECO:9999999", + "supporting_objects": None, + "publications": None, } }, ), - # -------------------------------------------------- - # Evidence with PUBMED with other refs - # 
-------------------------------------------------- + # no evidence + ( + """ + """, + {}, + ), + # one evidence with multiple supporting objects ( - lambda: ( - lambda entry: ( - lambda ev: ( - ET.SubElement( - ET.SubElement(ev, f"{{{NS_URI}}}source"), - f"{{{NS_URI}}}dbReference", - {"type": "PubMed", "id": "12345"}, - ), - ET.SubElement( - ET.SubElement(ev, f"{{{NS_URI}}}source"), - f"{{{NS_URI}}}dbReference", - {"type": "GO", "id": "GO:0008150"}, - ), - entry, - )[2] - )( - ET.SubElement( - entry, - f"{{{NS_URI}}}evidence", - {"key": "E2", "type": "ECO:0000269"}, - ) - ) - )(ET.Element(f"{{{NS_URI}}}entry")), + """ + + + + + + + """, { - "E2": { + "K1": { "evidence_type": "ECO:0000269", - "publications": ["PMID:12345"], + "supporting_objects": ["Ensembl:ENS1", "RefSeq:RS123"], + "publications": None, } }, ), ], ) -def test_parse_evidence_map_parametrized(build_xml, expected): - entry = build_xml() +def test_parse_evidence_map(xml_str: str, expected: dict[str, Any]) -> None: + entry = ET.fromstring(xml_str) result = parse_evidence_map(entry) - - assert isinstance(result, dict) assert result == expected +""" + + xml_strings: models a UniProt entry with different types of possible associations + cdm_id: uniquely identifies the protein being parsed + evidence_map: supplies external evidence metadata for associations + expected: list of association dictionaries + + Arg: + The function correctly links proteins to organism taxonomy. + Cross-references are properly included, evidence metadata is correctly merged. + Associations derived from catalytic activity and cofactor comments are correctly generated. + All combinations and edge cases are handled robustly. + +""" + + +## parse_associations function test ## @pytest.mark.parametrize( - "build_xml, cdm_id, evidence_map, expected", + ("xml_str", "cdm_id", "evidence_map", "expected"), [ - # -------------------------------------------------- - # Taxonomy association only - # -------------------------------------------------- + # organism association(NCBI Taxonomy dbReference) ( - lambda: ( - lambda entry: ( - ET.SubElement( - ET.SubElement(entry, f"{{{NS_URI}}}organism"), - f"{{{NS_URI}}}dbReference", - {"type": "NCBI Taxonomy", "id": "1234"}, - ), - entry, - )[1] - )(ET.Element(f"{{{NS_URI}}}entry")), - "cdm_1", + """ + + + + """, + "CDM:1", {}, + [{"subject": "CDM:1", "object": "NCBITaxon:9606"}], + ), + # dbReference with evidence key + ( + """ + + """, + "CDM:2", + { + "E1": { + "evidence_type": "ECO:0000250", + "supporting_objects": ["Ensembl:ENS1"], + "publications": ["PMID:1234"], + } + }, [ { - "subject": "cdm_1", - "object": "NCBITaxon:1234", - "predicate": "in_taxon", + "subject": "CDM:2", + "object": "PDB:2N7Q", + "evidence_type": "ECO:0000250", + "supporting_objects": ["Ensembl:ENS1"], + "publications": ["PMID:1234"], } ], ), - # -------------------------------------------------- - # Catalytic activity with evidence - # -------------------------------------------------- + # comment catalytic activity (reaction) with evidence key ( - lambda: ( - lambda entry: ( - lambda comment: ( - lambda reaction: ( - ET.SubElement( - reaction, - f"{{{NS_URI}}}dbReference", - {"type": "Rhea", "id": "RHEA:12345"}, - ), - entry, - )[1] - )( - ET.SubElement( - comment, - f"{{{NS_URI}}}reaction", - {"evidence": "E1"}, - ) - ) - )( - ET.SubElement( - entry, - f"{{{NS_URI}}}comment", - {"type": "catalytic activity"}, - ) - ) - )(ET.Element(f"{{{NS_URI}}}entry")), - "cdm_2", + """ + + + + + + """, + "CDM:3", { - "E1": { - "evidence_type": "ECO:0000269", - 
"publications": ["PMID:12345"], + "E2": { + "evidence_type": "ECO:0000313", + "publications": ["PMID:2222"], } }, [ { - "subject": "cdm_2", + "subject": "CDM:3", "predicate": "catalyzes", - "object": "Rhea:RHEA:12345", - "evidence_type": "ECO:0000269", - "publications": ["PMID:12345"], + "object": "Rhea:12345", + "evidence_type": "ECO:0000313", + "publications": ["PMID:2222"], } ], ), - # -------------------------------------------------- - # Cofactor association - # -------------------------------------------------- + # Comment cofactor without evidence ( - lambda: ( - lambda entry: ( - lambda comment: ( - ET.SubElement( - ET.SubElement( - comment, - f"{{{NS_URI}}}cofactor", - ), - f"{{{NS_URI}}}dbReference", - {"type": "ChEBI", "id": "CHEBI:15377"}, - ), - entry, - )[1] - )( - ET.SubElement( - entry, - f"{{{NS_URI}}}comment", - {"type": "cofactor"}, - ) - ) - )(ET.Element(f"{{{NS_URI}}}entry")), - "cdm_3", + """ + + + + + + """, + "CDM:4", {}, [ { - "subject": "cdm_3", + "subject": "CDM:4", "predicate": "requires_cofactor", "object": "ChEBI:CHEBI:15377", } ], ), + # Several relevant relationship(with organism and dbReference) + ( + """ + + + + + """, + "CDM:5", + {}, + [ + {"subject": "CDM:5", "object": "NCBITaxon:562"}, + {"subject": "CDM:5", "object": "RefSeq:NP_414543"}, + ], + ), + # if it is empty entry, return to [] + ("""""", "CDM:6", {}, []), ], ) -def test_parse_associations_parametrized(build_xml, cdm_id, evidence_map, expected): - entry = build_xml() - +def test_parse_associations( + xml_str: str, cdm_id: str, evidence_map: dict[str, Any], expected: list[dict[str, str]] +) -> None: + entry = ET.fromstring(xml_str) result = parse_associations(entry, cdm_id, evidence_map) - - assert isinstance(result, list) assert result == expected +""" + + xml_str: Uniprot entry include , , + Refer: PubMed, DOI, GeneBank, DDBJ, EMBL + + Output: List of publication identifier + + Arg: + Extract publication of references + Recognize and format database types ( with prefixing “PMID:”, “DOI:”) + Handle entries with multiple or mixed publication types + Return an empty list if no publication data. 
+ +""" + + +## parse_publications function test ## @pytest.mark.parametrize( - "build_xml, cdm_id, expected", + ("xml_str", "expected"), [ - # -------------------------------------------------- - # No dbReference - # -------------------------------------------------- + # Single PubMed ( - lambda: ET.Element(f"{{{NS_URI}}}entry"), - "cdm_1", - [], + """ + + + + + + """, + ["PMID:12345"], ), - # -------------------------------------------------- - # dbReference with CURIE id - # -------------------------------------------------- + # Multiple types include (PubMed, DOI, GenBank) ( - lambda: ( - lambda entry: ( - ET.SubElement( - entry, - f"{{{NS_URI}}}dbReference", - {"type": "GO", "id": "GO:0008150"}, - ), - entry, - )[1] - )(ET.Element(f"{{{NS_URI}}}entry")), - "cdm_2", - [ - { - "entity_id": "cdm_2", - "xref_type": "GO", - "xref_value": "GO:0008150", - "xref": "GO:0008150", - } - ], + """ + + + + + + + + """, + ["PMID:55555", "DOI:10.1000/j.jmb.2020.01.001"], ), - # -------------------------------------------------- - # dbReference without CURIE (prefix) - # -------------------------------------------------- + # Multiple references ( - lambda: ( - lambda entry: ( - ET.SubElement( - entry, - f"{{{NS_URI}}}dbReference", - {"type": "CDD", "id": "cd04253"}, - ), - entry, - )[1] - )(ET.Element(f"{{{NS_URI}}}entry")), - "cdm_3", - [ - { - "entity_id": "cdm_3", - "xref_type": "CDD", - "xref_value": "cd04253", - "xref": "CDD:cd04253", - } - ], - ), - # -------------------------------------------------- - # Mixed dbReferences - # -------------------------------------------------- - ( - lambda: ( - lambda entry: ( - ET.SubElement( - entry, - f"{{{NS_URI}}}dbReference", - {"type": "GO", "id": "GO:0003674"}, - ), - ET.SubElement( - entry, - f"{{{NS_URI}}}dbReference", - {"type": "PDB", "id": "1ABC"}, - ), - entry, - )[2] - )(ET.Element(f"{{{NS_URI}}}entry")), - "cdm_4", - [ - { - "entity_id": "cdm_4", - "xref_type": "GO", - "xref_value": "GO:0003674", - "xref": "GO:0003674", - }, - { - "entity_id": "cdm_4", - "xref_type": "PDB", - "xref_value": "1ABC", - "xref": "PDB:1ABC", - }, - ], + """ + + + + + + + + + + + """, + ["DOI:10.1000/jmb.123456", "PMID:98765"], ), - # -------------------------------------------------- - # Missing type or id - # -------------------------------------------------- + # dbReference: DDBJ and EMBL ( - lambda: ( - lambda entry: ( - ET.SubElement( - entry, - f"{{{NS_URI}}}dbReference", - {"type": "GO"}, # missing id - ), - ET.SubElement( - entry, - f"{{{NS_URI}}}dbReference", - {"id": "123"}, # missing type - ), - entry, - )[2] - )(ET.Element(f"{{{NS_URI}}}entry")), - "cdm_5", + """ + + + + + + + """, [], ), + # no publication + ("""""", []), ], ) -def test_parse_cross_references_parametrized(build_xml, cdm_id, expected): - entry = build_xml() - - result = parse_cross_references(entry, cdm_id) - - assert isinstance(result, list) +def test_parse_publications(xml_str: str, expected: list[str]) -> None: + entry = ET.fromstring(xml_str) + result = parse_publications(entry) assert result == expected +## parse_uniprot_entry function test ## @pytest.mark.parametrize( - "build_xml, cdm_id, expected", + ("xml_str", "datasource_name", "prev_created"), [ - # -------------------------------------------------- - # No accession - # -------------------------------------------------- - ( - lambda: ET.Element(f"{{{NS_URI}}}entry"), - "cdm_1", - [], - ), - # -------------------------------------------------- - # Single accession - # -------------------------------------------------- - ( - 
lambda: ( - lambda entry: ( - ET.SubElement(entry, f"{{{NS_URI}}}accession").__setattr__("text", "P12345"), - entry, - )[1] - )(ET.Element(f"{{{NS_URI}}}entry")), - "cdm_2", - [ - { - "entity_id": "cdm_2", - "identifier": "UniProt:P12345", - "source": "UniProt", - "description": "UniProt accession", - } - ], - ), - # -------------------------------------------------- - # Multiple accessions - # -------------------------------------------------- - ( - lambda: ( - lambda entry: ( - ET.SubElement(entry, f"{{{NS_URI}}}accession").__setattr__("text", "Q11111"), - ET.SubElement(entry, f"{{{NS_URI}}}accession").__setattr__("text", "Q22222"), - entry, - )[2] - )(ET.Element(f"{{{NS_URI}}}entry")), - "cdm_3", - [ - { - "entity_id": "cdm_3", - "identifier": "UniProt:Q11111", - "source": "UniProt", - "description": "UniProt accession", - }, - { - "entity_id": "cdm_3", - "identifier": "UniProt:Q22222", - "source": "UniProt", - "description": "UniProt accession", - }, - ], - ), - # -------------------------------------------------- - # parse_identifiers_generic already sets source/description → setdefault - # -------------------------------------------------- ( - lambda: ( - lambda entry: ( - ET.SubElement(entry, f"{{{NS_URI}}}accession").__setattr__("text", "A0A000"), - entry, - )[1] - )(ET.Element(f"{{{NS_URI}}}entry")), - "cdm_4", - [ - { - "entity_id": "cdm_4", - "identifier": "UniProt:A0A000", - "source": "UniProt", # remains - "description": "UniProt accession", # remains - } - ], + """ + + P12345 + ProteinX + + + ProteinX Full Name + + + + + + + + + + + + """, + "UniProt import", + None, ), ], ) -def test_parse_identifiers_parametrized(build_xml, cdm_id, expected): - entry = build_xml() - - result = parse_identifiers(entry, cdm_id) - - assert isinstance(result, list) - assert result == expected +def test_parse_uniprot_entry(xml_str: str, datasource_name: str, prev_created: None) -> None: + entry = ET.fromstring(xml_str) + cdm_id = generate_cdm_id() + + current_timestamp = "2024-07-17T13:00:00Z" + + record = parse_uniprot_entry(entry, cdm_id, current_timestamp, datasource_name, prev_created) + + entity = record["entity"] + assert entity["entity_type"] == "protein" + assert entity["data_source"] == datasource_name + assert entity["version"] == "3" + assert entity["uniprot_created"] == "2020-01-01" + assert entity["uniprot_modified"] == "2021-01-01" + assert entity["entity_id"].startswith("CDM:") + + # identifiers/names/associations/publications + assert isinstance(record["identifiers"], list) + assert isinstance(record["names"], list) + assert isinstance(record["associations"], list) + assert isinstance(record["publications"], list) diff --git a/tests/parsers/test_uniref.py b/tests/parsers/test_uniref.py index 9ca5360..630949c 100644 --- a/tests/parsers/test_uniref.py +++ b/tests/parsers/test_uniref.py @@ -1,318 +1,284 @@ -import os -import sys +"""Tests for the UniRef importer.""" -sys.path.insert(0, os.path.abspath(os.path.dirname(__file__))) - -import gzip -import tempfile +import datetime as dt +import textwrap import xml.etree.ElementTree as ET -from datetime import datetime, timezone + import pytest from cdm_data_loader_utils.parsers.uniref import ( + add_cluster_members, cdm_entity_id, - get_timestamps, extract_cluster, - get_accession_and_seed, - add_cluster_members, extract_cross_refs, - parse_uniref_xml, + get_accession_and_seed, + get_timestamps, ) -NS = {"ns": "http://uniprot.org/uniref"} - -# --------------------------------------------------------- -# cdm_entity_id -# 
--------------------------------------------------------- @pytest.mark.parametrize( - "value, should_raise", - [ - ("A0A009HJL9", False), - ("UniRef100_A0A009HJL9", False), - ("", True), - (None, True), - ], + ("accession", "expected_prefix"), + [("A0B0123456", "CDM:"), ("P01234", "CDM:"), ("", None), (None, None)], ) -def test_cdm_entity_id(value, should_raise): - if should_raise: - with pytest.raises(ValueError): - cdm_entity_id(value) +def test_cdm_entity_id(accession: str | None, expected_prefix: str | None) -> None: + """Ensure that CDM entities start with the appropriate prefix.""" + result = cdm_entity_id(accession) + if expected_prefix is None: + assert result is None else: - out = cdm_entity_id(value) - assert isinstance(out, str) - assert out.startswith("CDM:") + assert result.startswith(expected_prefix) -# --------------------------------------------------------- -# get_timestamps -# --------------------------------------------------------- @pytest.mark.parametrize( - "uniref_id, existing, now, expect_created_same_as_updated", + ("xml_str", "expected_name"), [ ( - "UniRef100_A", - {"UniRef100_A": "2024-01-01T00:00:00+00:00"}, - datetime(2025, 1, 1, 0, 0, 0, tzinfo=timezone.utc), - False, + "" + "TestName", + "TestName", ), ( - "UniRef100_B", - {}, - datetime(2025, 1, 1, 0, 0, 0, tzinfo=timezone.utc), - True, - ), - ( - "UniRef100_C", - {}, - None, - True, - ), - ], -) -def test_get_timestamps(uniref_id, existing, now, expect_created_same_as_updated): - updated, created = get_timestamps(uniref_id, existing, now) - - assert isinstance(updated, str) - assert isinstance(created, str) - assert updated.endswith("+00:00") - - if expect_created_same_as_updated: - assert updated == created - else: - assert updated != created - - -@pytest.mark.parametrize("bad_id", ["", None]) -def test_get_timestamps_rejects_empty_uniref_id(bad_id): - with pytest.raises(ValueError): - get_timestamps(bad_id, {}, None) - - -# --------------------------------------------------------- -# add_cluster_members -# --------------------------------------------------------- -@pytest.mark.parametrize( - "repr_xml, member_xmls, expected_count", - [ - ( - """ - - - - - """, - [ - """ - - - - """, - """ - - - - """, - ], - 3, - ), - ( - None, - [ - """ - - - - """, - ], - 1, + "", + "UNKNOWN", ), - (None, [], 0), ], ) -def test_add_cluster_members(repr_xml, member_xmls, expected_count): - cluster_id = "CDM_CLUSTER" - repr_db = ET.fromstring(repr_xml) if repr_xml else None - - entry = ET.Element("{http://uniprot.org/uniref}entry") - for m in member_xmls: - mem = ET.SubElement(entry, "{http://uniprot.org/uniref}member") - mem.append(ET.fromstring(m)) - - rows = [] - add_cluster_members(cluster_id, repr_db, entry, rows, NS) - - assert len(rows) == expected_count - for r in rows: - assert r[0] == cluster_id - assert r[1].startswith("CDM:") - assert r[4] == "1.0" +def test_extract_cluster(xml_str: str, expected_name: str) -> None: + """Test cluster extraction from XML.""" + ns = {"ns": "http://uniprot.org/uniref"} + elem = ET.fromstring(xml_str) + cluster_id, name = extract_cluster(elem, ns) + assert cluster_id.startswith("CDM:") + assert isinstance(cluster_id, str) + assert name == expected_name -# --------------------------------------------------------- -# extract_cluster -# --------------------------------------------------------- @pytest.mark.parametrize( - "xml_str, uniref_id, expected_name", + ("uniref_id", "existing_created", "now", "expected"), [ + # Has existing_created ( - "Test Cluster Name", "UniRef100_A", - 
"Test Cluster Name", + {"UniRef100_A": "2024-01-01T00:00:00"}, + dt.datetime(2025, 1, 1, 0, 0, 0, tzinfo=dt.UTC), + ("2025-01-01T00:00:00", "2024-01-01T00:00:00"), ), + # There is no existing_created ( - "", "UniRef100_B", - "UNKNOWN", + {"UniRef100_A": "2024-01-01T00:00:00"}, + dt.datetime(2025, 1, 1, 0, 0, 0, tzinfo=dt.UTC), + ("2025-01-01T00:00:00", "2025-01-01T00:00:00"), + ), + # There is no existing_created,also not provide "now" + ( + "UniRef100_C", + {}, + None, # The system automatically use the current time + None, # Only assert that the return is a string and they are equal ), ], ) -def test_extract_cluster(xml_str, uniref_id, expected_name): - elem = ET.fromstring(xml_str) - - cluster_id, name = extract_cluster(elem, NS, uniref_id) - - # ---- cluster_id checks ---- - assert isinstance(cluster_id, str) - assert cluster_id.startswith("CDM:") - - # ---- name checks ---- - assert name == expected_name +def test_get_timestamps(uniref_id: str, existing_created: str, now: dt.datetime, expected: tuple[str] | None) -> None: + """Test timestamps.""" + result = get_timestamps(uniref_id, existing_created, now) + if expected is not None: + assert result == expected + else: + formatted_now, created_time = result + assert formatted_now == created_time + assert isinstance(formatted_now, str) + assert len(formatted_now) == 19 # "YYYY-MM-DDTHH:MM:SS" ---> 19 bites @pytest.mark.parametrize( - "xml_str, expected_acc, expected_is_seed", + ("xml_str", "expected_acc", "expected_is_seed"), [ - # accession + isSeed=true + # Have accession and isSeed ( """ - - - - - """, + + + + + """, "A0A009HJL9", True, ), - # accession only + # Only accession, no isSeed ( """ - - - - """, + + + + """, "A0A241V597", False, ), - # no accession + # No accession, only id ( """ - - - - """, - None, - False, - ), - # dbref is None - ( - None, - None, + + """, + "ID_ONLY", False, ), + # None + (None, None, False), ], ) -def test_get_accession_and_seed(xml_str, expected_acc, expected_is_seed): +def test_get_accession_and_seed(xml_str: str | None, expected_acc: str | None, expected_is_seed: bool) -> None: + """Test parsing of UniRef entries for accession and seed status.""" + ns = {"ns": "http://uniprot.org/uniref"} dbref = ET.fromstring(xml_str) if xml_str else None - - acc, is_seed = get_accession_and_seed(dbref, NS) - + acc, is_seed = get_accession_and_seed(dbref, ns) assert acc == expected_acc assert is_seed == expected_is_seed -# --------------------------------------------------------- -# extract_cross_refs -# --------------------------------------------------------- +def make_entry_with_members(member_xmls: list[str], ns_uri: str = "http://uniprot.org/uniref") -> ET.Element: + """ + Receives a list of xml strings from dbReference, generates an element with child nodes. 
+ """ + entry_elem = ET.Element(f"{{{ns_uri}}}entry") + for dbref_xml in member_xmls: + dbref_elem = ET.fromstring(dbref_xml) + member_elem = ET.SubElement(entry_elem, f"{{{ns_uri}}}member") + member_elem.append(dbref_elem) + return entry_elem + + @pytest.mark.parametrize( - "props, expected", + ("repr_xml", "member_xmls", "expected"), [ - ( + pytest.param( + # representative member, two members + textwrap.dedent(""" + + + + + """), + [ + textwrap.dedent(""" + + + + """), + textwrap.dedent(""" + + + + + """), + ], [ - ("UniProtKB accession", "A0A1"), - ("UniRef90 ID", "UniRef90_X"), - ("UniParc ID", "UPI0001"), + ("CLUSTER_X", "CDM:", "true", "true", "1.0"), + ("CLUSTER_X", "CDM:", "false", "false", "1.0"), + ("CLUSTER_X", "CDM:", "false", "true", "1.0"), ], - { - ("UniRef90 ID", "UniRef90_X"), - ("UniParc ID", "UPI0001"), - }, + id="with-representative-and-members", ), - ( + pytest.param( + # Only memebers, no representative member + None, [ - ("UniProtKB accession", "A0A2"), + textwrap.dedent(""" + + + + """) ], - set(), + [("CLUSTER_X", "CDM:", "false", "false", "1.0")], + id="members-only", + ), + pytest.param( + # No members, no representative member + None, + [], + [], + id="no-members", ), ], ) -def test_extract_cross_refs(props, expected): - dbref = ET.Element("{http://uniprot.org/uniref}dbReference", id="UniProtKB:A0A1") - - for k, v in props: - ET.SubElement( - dbref, - "{http://uniprot.org/uniref}property", - type=k, - value=v, - ) +def test_add_cluster_members(repr_xml: str | None, member_xmls: list[str], expected: list[tuple[str, ...]]) -> None: + """Test add_cluster_members with various representative/member combinations.""" + ns = {"ns": "http://uniprot.org/uniref"} + cluster_id = "CLUSTER_X" - rows = [] - extract_cross_refs(dbref, rows, NS) - - got = {(t, v) for _, t, v in rows} - assert got == expected + # Structure (representative members) dbReference if it exists + repr_db = ET.fromstring(repr_xml) if repr_xml else None - for entity_id, _, _ in rows: - assert entity_id is not None - assert isinstance(entity_id, str) + # Structure nodes, and add + elem = make_entry_with_members(member_xmls) + # Calling the function under test + cluster_member_data = [] + add_cluster_members(cluster_id, repr_db, elem, cluster_member_data, ns) -# --------------------------------------------------------- -# parse_uniref_xml -# --------------------------------------------------------- -@pytest.mark.parametrize("batch_size", [1, 2]) -def test_parse_uniref_xml_batch(batch_size): - xml = """ - - - A - - - - - - + assert len(cluster_member_data) == len(expected) + for i, (clu_id, cdm_prefix, is_repr, is_seed, score) in enumerate(expected): + out = cluster_member_data[i] + assert out[0] == clu_id, f"Wrong cluster_id at idx {i}: {out[0]}" + assert out[1].startswith(cdm_prefix), f"Wrong entity_id at idx {i}: {out[1]}" + assert out[2] == is_repr, f"Wrong is_representative at idx {i}: {out[2]}" + assert out[3] == is_seed, f"Wrong is_seed at idx {i}: {out[3]}" + assert out[4] == score, f"Wrong score at idx {i}: {out[4]}" - - B - - - - - - - - """.strip() - with tempfile.TemporaryDirectory() as tmpdir: - gz_path = f"{tmpdir}/uniref_test.xml.gz" - with gzip.open(gz_path, "wb") as gz: - gz.write(xml.encode("utf-8")) +XREF_TYPES = ["UniRef90 ID", "UniRef50 ID", "UniParc ID"] - result = parse_uniref_xml(gz_path, batch_size, {}) - assert len(result["cluster_data"]) == batch_size - assert len(result["entity_data"]) == batch_size - assert len(result["cluster_member_data"]) == batch_size - assert 
len(result["cross_reference_data"]) in (0, batch_size) +@pytest.mark.parametrize( + ("dbref_props", "expected_xrefs"), + [ + ( + # all cross-ref fields present + [ + ("UniRef90 ID", "UniRef90_N8Q6C0"), + ("UniRef50 ID", "UniRef50_A0A7Z7LP76"), + ("UniParc ID", "UPI00044F6C4F"), + ("protein name", "foo"), + ], + [ + ("UniRef90 ID", "UniRef90_N8Q6C0"), + ("UniRef50 ID", "UniRef50_A0A7Z7LP76"), + ("UniParc ID", "UPI00044F6C4F"), + ], + ), + ( + # partial cross-ref + [ + ("UniRef90 ID", "UniRef90_ABC"), + ("protein name", "bar"), + ], + [ + ("UniRef90 ID", "UniRef90_ABC"), + ], + ), + ( + # No cross-ref + [ + ("protein name", "baz"), + ], + [], + ), + ], +) +def test_extract_cross_refs_param(dbref_props: list[tuple[str, str]], expected_xrefs: list[tuple[str, str]]) -> None: + """ + Test that extract_cross_refs correctly extracts all UniRef cross-reference fields. + """ + dbref = ET.Element("{http://uniprot.org/uniref}dbReference", type="UniProtKB ID", id="TEST_ID") + + for t, v in dbref_props: + ET.SubElement(dbref, "{http://uniprot.org/uniref}property", type=t, value=v) + + ns = {"ns": "http://uniprot.org/uniref"} + cross_reference_data = [] + extract_cross_refs(dbref, cross_reference_data, ns) + + entity_id = cdm_entity_id("TEST_ID") + expected = {(entity_id, typ, val) for typ, val in expected_xrefs} + got = set(cross_reference_data) + assert got == expected
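Taken together, the UniRef tests above assume two small contracts: cdm_entity_id is deterministic (test_extract_cross_refs_param re-derives the same id when building its expected set) and returns None for empty input, and extract_cross_refs keeps only the three UniRef cross-reference property types. A hedged sketch of both, for illustration only; the real implementations live in cdm_data_loader_utils.parsers.uniref and may differ in detail:

import uuid
import xml.etree.ElementTree as ET

CROSS_REF_PROPS = {"UniRef90 ID", "UniRef50 ID", "UniParc ID"}

def cdm_entity_id_sketch(accession: str | None) -> str | None:
    # Empty or missing accessions yield None; otherwise mint a CDM-prefixed id.
    # uuid5 is one way to make repeated calls agree for the same accession.
    if not accession:
        return None
    return f"CDM:{uuid.uuid5(uuid.NAMESPACE_URL, accession)}"

def extract_cross_refs_sketch(dbref: ET.Element, rows: list[tuple[str, str, str]], ns: dict[str, str]) -> None:
    # Append one (entity_id, property_type, value) tuple per recognised cross-ref property.
    entity_id = cdm_entity_id_sketch(dbref.get("id"))
    for prop in dbref.findall("ns:property", ns):
        if prop.get("type") in CROSS_REF_PROPS:
            rows.append((entity_id, prop.get("type"), prop.get("value")))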