diff --git a/pyproject.toml b/pyproject.toml
index 34af193..24c44d3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -156,7 +156,7 @@ convention = "google"
 requires = ["uv_build>=0.9.9,<0.10.0"]
 build-backend = "uv_build"
 
-[tool.pytest]
+[tool.pytest.ini_options]
 pythonpath = ["src"]
 log_cli = true
 log_cli_level = "INFO"
diff --git a/src/cdm_data_loader_utils/parsers/annotation_parse.py b/src/cdm_data_loader_utils/parsers/annotation_parse.py
new file mode 100644
index 0000000..2a690c8
--- /dev/null
+++ b/src/cdm_data_loader_utils/parsers/annotation_parse.py
@@ -0,0 +1,374 @@
+"""RefSeq annotation parser for transforming NCBI Datasets API JSON into CDM-formatted Delta Lake tables.
+
+Usage:
+    python src/cdm_data_loader_utils/parsers/annotation_parse.py \
+        --accession GCF_000869125.1 \
+        --output-path output/refseq/GCF_000869125.1 \
+        --query
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+
+import requests
+from pyspark.sql import SparkSession
+from pyspark.sql.types import StructType
+from delta import configure_spark_with_delta_pip
+
+from cdm_data_loader_utils.parsers.kbase_cdm_pyspark import schema as cdm_schemas
+
+
+# ---------------------------------------------------------------------
+# Accession-based annotation fetch
+# ---------------------------------------------------------------------
+def fetch_annotation_json(accession: str) -> dict:
+    """Fetch the annotation report JSON for an assembly accession from the NCBI Datasets API."""
+    url = f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/{accession}/annotation_report"
+    resp = requests.get(url, headers={"Accept": "application/json"}, timeout=60)
+    resp.raise_for_status()
+    return resp.json()
+
+
+# ---------------------------------------------------------------------
+# SPARK SESSION
+# ---------------------------------------------------------------------
+def build_spark_session(app_name: str = "RefSeqAnnotationToCDM") -> SparkSession:
+    """Configure and return a Spark session with Delta support."""
+    builder = (
+        SparkSession.builder.appName(app_name)
+        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
+        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
+    )
+    return configure_spark_with_delta_pip(builder).getOrCreate()
+
+
+# ---------------------------------------------------------------------
+# CDM TABLE SCHEMAS
+# ---------------------------------------------------------------------
+# Using centralized schemas
+IDENTIFIER_SCHEMA = cdm_schemas["Identifier"]
+NAME_SCHEMA = cdm_schemas["Name"]
+FEATURE_SCHEMA = cdm_schemas["Feature"]
+CONTIG_COLLECTION_X_FEATURE_SCHEMA = cdm_schemas["ContigCollection_x_Feature"]
+CONTIG_COLLECTION_X_PROTEIN_SCHEMA = cdm_schemas["ContigCollection_x_Protein"]
+FEATURE_X_PROTEIN_SCHEMA = cdm_schemas["Feature_x_Protein"]
+CONTIG_SCHEMA = cdm_schemas["Contig"]
+CONTIG_X_CONTIG_COLLECTION_SCHEMA = cdm_schemas["Contig_x_ContigCollection"]
+
+
+# ---------------------------------------------------------------------
+# CDM PREFIX NORMALIZATION
+# ---------------------------------------------------------------------
+def apply_prefix(identifier: str) -> str:
+    """Normalize identifiers to CDM-prefixed formats."""
+    if identifier.startswith(("YP_", "XP_", "WP_", "NP_", "NC_")):
+        return f"refseq:{identifier}"
+    if identifier.startswith("GCF_"):
+        return f"insdc.gcf:{identifier}"
+    # NCBI gene identifiers: the test suite expects
+    # apply_prefix("GeneID:123") == "ncbigene:123".
+    if identifier.startswith("GeneID:"):
+        return f"ncbigene:{identifier.removeprefix('GeneID:')}"
+    return identifier
+
+
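+# Example (hypothetical accessions), to make the prefixing rules concrete:
+#   apply_prefix("WP_011053539.1")  -> "refseq:WP_011053539.1"
+#   apply_prefix("GCF_000869125.1") -> "insdc.gcf:GCF_000869125.1"
+#   apply_prefix("GeneID:1234")     -> "ncbigene:1234"
+#   apply_prefix("anything-else")   -> "anything-else"  (returned unchanged)
+
+
+# ---------------------------------------------------------------------
+# 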
Safe integer conversion +# --------------------------------------------------------------------- +def to_int(val: str) -> int | None: + try: + return int(val) + except Exception: + return None + + +# --------------------------------------------------------------------- +# IDENTIFIERS +# --------------------------------------------------------------------- +def load_identifiers(data: dict) -> list[tuple[str, str, str, str, str | None]]: + """Extract Identifier table records.""" + out = [] + for report in data.get("reports", []): + ann = report.get("annotation", {}) + gene_id = ann.get("gene_id") + if not gene_id: + continue + entity_id = f"ncbigene:{gene_id}" + out.append((entity_id, gene_id, ann.get("name"), "RefSeq", ann.get("relationship"))) + return out + + +# --------------------------------------------------------------------- +# NAME EXTRACTION +# --------------------------------------------------------------------- +def load_names(data: dict) -> list[tuple[str, str, str, str]]: + """Extract Name table records.""" + out = [] + for report in data.get("reports", []): + ann = report.get("annotation", {}) + gene_id = ann.get("gene_id") + if not gene_id: + continue + entity_id = f"ncbigene:{gene_id}" + for label, desc in [ + ("symbol", "RefSeq gene symbol"), + ("name", "RefSeq gene name"), + ("locus_tag", "RefSeq locus tag"), + ]: + val = ann.get(label) + if val: + out.append((entity_id, val, desc, "RefSeq")) + return out + + +# --------------------------------------------------------------------- +# FEATURE LOCATIONS +# --------------------------------------------------------------------- +def load_feature_records(data: dict) -> list[tuple]: + """Extract Feature table records.""" + features = [] + for report in data.get("reports", []): + ann = report.get("annotation", {}) + gene_id = ann.get("gene_id") + if not gene_id: + continue + feature_id = f"ncbigene:{gene_id}" + for region in ann.get("genomic_regions", []): + for r in region.get("gene_range", {}).get("range", []): + strand = { + "plus": "positive", + "minus": "negative", + "unstranded": "unstranded", + }.get(r.get("orientation"), "unknown") + features.append(( + feature_id, + None, + None, + None, + to_int(r.get("end")), + None, + to_int(r.get("begin")), + strand, + "RefSeq", + None, + "gene", + )) + return features + + +# --------------------------------------------------------------------- +# PARSE CONTIG_COLLECTION <-> FEATURE +# --------------------------------------------------------------------- +def load_contig_collection_x_feature(data: dict) -> list[tuple[str, str]]: + """Parse ContigCollection ↔ Feature links.""" + links = [] + + for report in data.get("reports", []): + ann = report.get("annotation", {}) + gene_id = ann.get("gene_id") + regions = ann.get("genomic_regions", []) + + if not gene_id or not regions: + continue + + acc = regions[0].get("gene_range", {}).get("accession_version") + if acc: + links.append((apply_prefix(acc), f"ncbigene:{gene_id}")) + + return links + + +# --------------------------------------------------------------------- +# PARSE CONTIG_COLLECTION <-> PROTEIN +# --------------------------------------------------------------------- +def load_contig_collection_x_protein(data: dict) -> list[tuple[str, str]]: + links = [] + + for report in data.get("reports", []): + ann = report.get("annotation", {}) + proteins = ann.get("proteins", []) + annotations = ann.get("annotations", []) + + if not proteins or not annotations: + continue + + assembly = annotations[0].get("assembly_accession") + + if not 
assembly: + continue + + contig_id = apply_prefix(assembly) + + for p in proteins: + pid = p.get("accession_version") + if pid: + protein_id = apply_prefix(pid) + links.append((contig_id, protein_id)) + + return links + + +# --------------------------------------------------------------------- +# PARSE FEATURE <-> PROTEIN +# --------------------------------------------------------------------- +def load_feature_x_protein(data: dict) -> list[tuple[str, str]]: + links = [] + + for report in data.get("reports", []): + ann = report.get("annotation", {}) + gene_id = ann.get("gene_id") + proteins = ann.get("proteins", []) + + if not gene_id or not proteins: + continue + + feature_id = f"ncbigene:{gene_id}" + + for p in proteins: + pid = p.get("accession_version") + if pid: + protein_id = apply_prefix(pid) + links.append((feature_id, protein_id)) + + return links + + +# --------------------------------------------------------------------- +# PARSE CONTIGS +# --------------------------------------------------------------------- +def load_contigs(data: dict) -> list[tuple[str, str | None, float | None, int | None]]: + """Parse Contig table.""" + contigs = {} + + for report in data.get("reports", []): + for region in report.get("annotation", {}).get("genomic_regions", []): + acc = region.get("gene_range", {}).get("accession_version") + if acc: + contig_id = apply_prefix(acc) + contigs.setdefault(contig_id, {"hash": None, "gc_content": None, "length": None}) + + return [(cid, meta["hash"], meta["gc_content"], meta["length"]) for cid, meta in contigs.items()] + + +# --------------------------------------------------------------------- +# PARSE CONTIG <-> CONTIG_COLLECTION +# --------------------------------------------------------------------- +def load_contig_x_contig_collection(data: dict) -> list[tuple[str, str]]: + links = [] + + for report in data.get("reports", []): + ann = report.get("annotation", {}) + regions = ann.get("genomic_regions", []) + annotations = ann.get("annotations", []) + + if not regions or not annotations: + continue + + contig = regions[0].get("gene_range", {}).get("accession_version") + assembly = annotations[0].get("assembly_accession") + + if contig and assembly: + contig_id = f"refseq:{contig}" + collection_id = apply_prefix(assembly) + links.append((contig_id, collection_id)) + + return links + + +# --------------------------------------------------------------------- +# DELTA TABLE +# --------------------------------------------------------------------- +def write_to_delta( + spark: SparkSession, + records: list[tuple], + output_path: str, + schema: StructType, +) -> None: + """Write records to Delta table.""" + if not records: + return + + df = spark.createDataFrame(records, schema=schema) + df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").save(output_path) + + +# --------------------------------------------------------------------- +# SQL PREVIEW +# --------------------------------------------------------------------- +def run_sql_query(spark: SparkSession, delta_path: str) -> None: + """Run SQL queries to preview Delta tables.""" + for name in [ + "cdm_identifiers", + "cdm_names", + "cdm_features", + "cdm_contig_collection_x_feature", + "cdm_contig_collection_x_protein", + "cdm_feature_x_protein", + "cdm_contigs", + "cdm_contig_x_contig_collection", + ]: + print(f"\n[SQL] {name}:") + path = str(Path(delta_path) / name) + spark.read.format("delta").load(path).createOrReplaceTempView(name) + spark.sql(f"SELECT * FROM {name} LIMIT 
20").show(truncate=False) + + +# --------------------------------------------------------------------- +# CLI ENTRY +# --------------------------------------------------------------------- +def main() -> None: + """Entry point for RefSeq Annotation parser.""" + parser = argparse.ArgumentParser(description="RefSeq Annotation Parser to CDM") + parser.add_argument("--accession", required=True) + parser.add_argument("--output-path", required=True) + parser.add_argument("--query", action="store_true") + args = parser.parse_args() + + base_output = Path(args.output_path) + base_output.mkdir(parents=True, exist_ok=True) + + data = fetch_annotation_json(args.accession) + input_path = Path(f"/tmp/{args.accession}.json") + input_path.write_text(json.dumps(data, indent=2)) + + spark = build_spark_session() + + write_to_delta(spark, load_identifiers(data), str(base_output / "cdm_identifiers"), IDENTIFIER_SCHEMA) + write_to_delta(spark, load_names(data), str(base_output / "cdm_names"), NAME_SCHEMA) + write_to_delta(spark, load_feature_records(data), str(base_output / "cdm_features"), FEATURE_SCHEMA) + write_to_delta( + spark, + load_contig_collection_x_feature(data), + str(base_output / "cdm_contig_collection_x_feature"), + CONTIG_COLLECTION_X_FEATURE_SCHEMA, + ) + write_to_delta( + spark, + load_contig_collection_x_protein(data), + str(base_output / "cdm_contig_collection_x_protein"), + CONTIG_COLLECTION_X_PROTEIN_SCHEMA, + ) + write_to_delta( + spark, + load_feature_x_protein(data), + str(base_output / "cdm_feature_x_protein"), + FEATURE_X_PROTEIN_SCHEMA, + ) + write_to_delta(spark, load_contigs(data), str(base_output / "cdm_contigs"), CONTIG_SCHEMA) + write_to_delta( + spark, + load_contig_x_contig_collection(data), + str(base_output / "cdm_contig_x_contig_collection"), + CONTIG_X_CONTIG_COLLECTION_SCHEMA, + ) + + if args.query: + run_sql_query(spark, str(base_output)) + + spark.stop() + + +if __name__ == "__main__": + main() diff --git a/src/cdm_data_loader_utils/parsers/kbase_cdm_pyspark.py b/src/cdm_data_loader_utils/parsers/kbase_cdm_pyspark.py new file mode 100644 index 0000000..19be5e8 --- /dev/null +++ b/src/cdm_data_loader_utils/parsers/kbase_cdm_pyspark.py @@ -0,0 +1,610 @@ +"""Automated conversion of cdm_schema to PySpark.""" + +from pyspark.sql.types import BooleanType, DateType, FloatType, IntegerType, StringType, StructField, StructType + +schema = { + "Association": StructType( + [ + StructField("association_id", StringType(), nullable=False), + StructField("subject", StringType(), nullable=False), + StructField("object", StringType(), nullable=False), + StructField("predicate", StringType(), nullable=False), + StructField("negated", BooleanType(), nullable=True), + StructField("evidence_type", StringType(), nullable=True), + StructField("primary_knowledge_source", StringType(), nullable=True), + StructField("aggregator_knowledge_source", StringType(), nullable=True), + StructField("annotation_date", DateType(), nullable=True), + StructField("comments", StringType(), nullable=True), + ] + ), + "Association_x_SupportingObject": StructType( + [ + StructField("association_id", StringType(), nullable=False), + StructField("entity_id", StringType(), nullable=False), + ] + ), + "Cluster": StructType( + [ + StructField("cluster_id", StringType(), nullable=False), + StructField("description", StringType(), nullable=True), + StructField("name", StringType(), nullable=True), + StructField("entity_type", StringType(), nullable=False), + StructField("protocol_id", StringType(), nullable=True), + ] 
+ ), + "ClusterMember": StructType( + [ + StructField("cluster_id", StringType(), nullable=False), + StructField("entity_id", StringType(), nullable=False), + StructField("is_representative", BooleanType(), nullable=True), + StructField("is_seed", BooleanType(), nullable=True), + StructField("score", FloatType(), nullable=True), + ] + ), + "Contig": StructType( + [ + StructField("contig_id", StringType(), nullable=False), + StructField("hash", StringType(), nullable=True), + StructField("gc_content", FloatType(), nullable=True), + StructField("length", IntegerType(), nullable=True), + ] + ), + "ContigCollection": StructType( + [ + StructField("contig_collection_id", StringType(), nullable=False), + StructField("hash", StringType(), nullable=True), + StructField("asm_score", FloatType(), nullable=True), + StructField("checkm_completeness", FloatType(), nullable=True), + StructField("checkm_contamination", FloatType(), nullable=True), + StructField("checkm_version", StringType(), nullable=True), + StructField("contig_bp", IntegerType(), nullable=True), + StructField("contig_collection_type", StringType(), nullable=True), + StructField("contig_l50", IntegerType(), nullable=True), + StructField("contig_l90", IntegerType(), nullable=True), + StructField("contig_n50", IntegerType(), nullable=True), + StructField("contig_n90", IntegerType(), nullable=True), + StructField("contig_logsum", FloatType(), nullable=True), + StructField("contig_max", IntegerType(), nullable=True), + StructField("contig_powersum", FloatType(), nullable=True), + StructField("gap_percent", FloatType(), nullable=True), + StructField("gc_average", FloatType(), nullable=True), + StructField("gc_std", FloatType(), nullable=True), + StructField("gtdb_taxon_id", StringType(), nullable=True), + StructField("n_chromosomes", IntegerType(), nullable=True), + StructField("n_contigs", IntegerType(), nullable=True), + StructField("n_scaffolds", IntegerType(), nullable=True), + StructField("ncbi_taxon_id", StringType(), nullable=True), + StructField("scaffold_l50", IntegerType(), nullable=True), + StructField("scaffold_l90", IntegerType(), nullable=True), + StructField("scaffold_n50", IntegerType(), nullable=True), + StructField("scaffold_n90", IntegerType(), nullable=True), + StructField("scaffold_bp", IntegerType(), nullable=True), + StructField("scaffold_logsum", FloatType(), nullable=True), + StructField("scaffold_maximum_length", IntegerType(), nullable=True), + StructField("scaffold_powersum", FloatType(), nullable=True), + StructField("scaffolds_n_over_50K", IntegerType(), nullable=True), + StructField("scaffolds_percent_over_50K", FloatType(), nullable=True), + StructField("scaffolds_total_length_over_50k", IntegerType(), nullable=True), + ] + ), + "ContigCollection_x_EncodedFeature": StructType( + [ + StructField("contig_collection_id", StringType(), nullable=False), + StructField("encoded_feature_id", StringType(), nullable=False), + ] + ), + "ContigCollection_x_Feature": StructType( + [ + StructField("contig_collection_id", StringType(), nullable=False), + StructField("feature_id", StringType(), nullable=False), + ] + ), + "ContigCollection_x_Protein": StructType( + [ + StructField("contig_collection_id", StringType(), nullable=False), + StructField("protein_id", StringType(), nullable=False), + ] + ), + "Contig_x_ContigCollection": StructType( + [ + StructField("contig_id", StringType(), nullable=False), + StructField("contig_collection_id", StringType(), nullable=False), + ] + ), + "Contig_x_EncodedFeature": StructType( + [ 
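+            # The *_x_* tables are link tables: most are pure two-column joins of
+            # required foreign keys; a few (e.g. Contributor_x_DataSource) add a role column.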
+ StructField("contig_id", StringType(), nullable=False), + StructField("encoded_feature_id", StringType(), nullable=False), + ] + ), + "Contig_x_Feature": StructType( + [ + StructField("contig_id", StringType(), nullable=False), + StructField("feature_id", StringType(), nullable=False), + ] + ), + "Contig_x_Protein": StructType( + [ + StructField("contig_id", StringType(), nullable=False), + StructField("protein_id", StringType(), nullable=False), + ] + ), + "Contributor": StructType( + [ + StructField("contributor_id", StringType(), nullable=False), + StructField("contributor_type", StringType(), nullable=True), + StructField("name", StringType(), nullable=True), + StructField("given_name", StringType(), nullable=True), + StructField("family_name", StringType(), nullable=True), + ] + ), + "ContributorAffiliation": StructType( + [ + StructField("contributor_id", StringType(), nullable=False), + StructField("affiliation_id", StringType(), nullable=True), + ] + ), + "Contributor_x_DataSource": StructType( + [ + StructField("contributor_id", StringType(), nullable=False), + StructField("data_source_id", StringType(), nullable=False), + StructField("contributor_role", StringType(), nullable=True), + ] + ), + "Contributor_x_Role_x_Project": StructType( + [ + StructField("contributor_id", StringType(), nullable=False), + StructField("project_id", StringType(), nullable=False), + StructField("contributor_role", StringType(), nullable=True), + ] + ), + "ControlledTermValue": StructType( + [ + StructField("value_cv_label", StringType(), nullable=False), + StructField("raw_value", StringType(), nullable=True), + StructField("type", StringType(), nullable=True), + StructField("attribute_cv_id", StringType(), nullable=True), + StructField("attribute_cv_label", StringType(), nullable=True), + StructField("attribute_string", StringType(), nullable=True), + StructField("entity_id", StringType(), nullable=False), + ] + ), + "ControlledVocabularyTermValue": StructType( + [ + StructField("value_cv_label", StringType(), nullable=True), + StructField("value_cv_id", StringType(), nullable=True), + StructField("raw_value", StringType(), nullable=True), + StructField("type", StringType(), nullable=True), + StructField("attribute_cv_id", StringType(), nullable=True), + StructField("attribute_cv_label", StringType(), nullable=True), + StructField("attribute_string", StringType(), nullable=True), + StructField("entity_id", StringType(), nullable=False), + ] + ), + "DataSource": StructType( + [ + StructField("data_source_id", StringType(), nullable=False), + StructField("name", StringType(), nullable=True), + ] + ), + "DataSourceNew": StructType( + [ + StructField("data_source_id", StringType(), nullable=False), + StructField("name", StringType(), nullable=True), + StructField("comments", StringType(), nullable=True), + StructField("date_accessed", DateType(), nullable=False), + StructField("date_published", DateType(), nullable=True), + StructField("date_updated", DateType(), nullable=True), + StructField("license", StringType(), nullable=True), + StructField("publisher", StringType(), nullable=True), + StructField("resource_type", StringType(), nullable=False), + StructField("url", StringType(), nullable=True), + StructField("version", StringType(), nullable=True), + ] + ), + "DataSource_x_Description": StructType( + [ + StructField("data_source_id", StringType(), nullable=False), + StructField("resource_description_id", StringType(), nullable=False), + ] + ), + "DataSource_x_FundingReference": StructType( + [ + 
StructField("data_source_id", StringType(), nullable=False), + StructField("funding_reference_id", StringType(), nullable=False), + ] + ), + "DataSource_x_License": StructType( + [ + StructField("data_source_id", StringType(), nullable=False), + StructField("license_id", StringType(), nullable=False), + ] + ), + "DataSource_x_Title": StructType( + [ + StructField("data_source_id", StringType(), nullable=False), + StructField("resource_title_id", StringType(), nullable=False), + ] + ), + "DateTimeValue": StructType( + [ + StructField("date_time", DateType(), nullable=False), + StructField("raw_value", StringType(), nullable=True), + StructField("type", StringType(), nullable=True), + StructField("attribute_cv_id", StringType(), nullable=True), + StructField("attribute_cv_label", StringType(), nullable=True), + StructField("attribute_string", StringType(), nullable=True), + StructField("entity_id", StringType(), nullable=False), + ] + ), + "EncodedFeature": StructType( + [ + StructField("encoded_feature_id", StringType(), nullable=False), + StructField("hash", StringType(), nullable=True), + StructField("has_stop_codon", BooleanType(), nullable=True), + StructField("type", StringType(), nullable=True), + ] + ), + "EncodedFeature_x_Feature": StructType( + [ + StructField("encoded_feature_id", StringType(), nullable=False), + StructField("feature_id", StringType(), nullable=False), + ] + ), + "EncodedFeature_x_Protein": StructType( + [ + StructField("encoded_feature_id", StringType(), nullable=False), + StructField("protein_id", StringType(), nullable=False), + ] + ), + "EntailedEdge": StructType( + [ + StructField("subject", StringType(), nullable=True), + StructField("predicate", StringType(), nullable=True), + StructField("object", StringType(), nullable=True), + ] + ), + "Entity": StructType( + [ + StructField("entity_id", StringType(), nullable=False), + StructField("entity_type", StringType(), nullable=False), + StructField("data_source_id", StringType(), nullable=True), + StructField("data_source_entity_id", StringType(), nullable=True), + StructField("data_source_created", DateType(), nullable=False), + StructField("data_source_updated", DateType(), nullable=True), + StructField("created", DateType(), nullable=False), + StructField("updated", DateType(), nullable=False), + ] + ), + "Event": StructType( + [ + StructField("event_id", StringType(), nullable=False), + StructField("created_at", DateType(), nullable=True), + StructField("description", StringType(), nullable=True), + StructField("name", StringType(), nullable=True), + StructField("location", StringType(), nullable=True), + ] + ), + "Experiment": StructType( + [ + StructField("experiment_id", StringType(), nullable=False), + StructField("protocol_id", StringType(), nullable=False), + StructField("name", StringType(), nullable=True), + StructField("description", StringType(), nullable=True), + StructField("created_at", DateType(), nullable=True), + ] + ), + "ExperimentCondition": StructType( + [ + StructField("experiment_condition_id", StringType(), nullable=False), + StructField("experiment_id", StringType(), nullable=False), + StructField("variable_id", StringType(), nullable=False), + StructField("value", StringType(), nullable=True), + ] + ), + "ExperimentConditionSet": StructType( + [ + StructField("experiment_condition_set_id", StringType(), nullable=False), + StructField("experiment_condition_id", StringType(), nullable=False), + ] + ), + "Feature": StructType( + [ + StructField("feature_id", StringType(), 
nullable=False), + StructField("hash", StringType(), nullable=True), + StructField("cds_phase", StringType(), nullable=True), + StructField("e_value", FloatType(), nullable=True), + StructField("end", IntegerType(), nullable=True), + StructField("p_value", FloatType(), nullable=True), + StructField("start", IntegerType(), nullable=True), + StructField("strand", StringType(), nullable=True), + StructField("source_database", StringType(), nullable=True), + StructField("protocol_id", StringType(), nullable=True), + StructField("type", StringType(), nullable=True), + ] + ), + "Feature_x_Protein": StructType( + [ + StructField("feature_id", StringType(), nullable=False), + StructField("protein_id", StringType(), nullable=False), + ] + ), + "FundingReference": StructType( + [ + StructField("funding_reference_id", StringType(), nullable=False), + StructField("funder", StringType(), nullable=True), + StructField("grant_id", StringType(), nullable=True), + StructField("grant_title", StringType(), nullable=True), + StructField("grant_url", StringType(), nullable=True), + ] + ), + "Geolocation": StructType( + [ + StructField("latitude", FloatType(), nullable=False), + StructField("longitude", FloatType(), nullable=False), + StructField("raw_value", StringType(), nullable=True), + StructField("type", StringType(), nullable=True), + StructField("attribute_cv_id", StringType(), nullable=True), + StructField("attribute_cv_label", StringType(), nullable=True), + StructField("attribute_string", StringType(), nullable=True), + StructField("entity_id", StringType(), nullable=False), + ] + ), + "GoldEnvironmentalContext": StructType( + [ + StructField("gold_environmental_context_id", StringType(), nullable=False), + StructField("ecosystem", StringType(), nullable=True), + StructField("ecosystem_category", StringType(), nullable=True), + StructField("ecosystem_subtype", StringType(), nullable=True), + StructField("ecosystem_type", StringType(), nullable=True), + StructField("specific_ecosystem", StringType(), nullable=True), + ] + ), + "Identifier": StructType( + [ + StructField("entity_id", StringType(), nullable=False), + StructField("identifier", StringType(), nullable=False), + StructField("description", StringType(), nullable=True), + StructField("source", StringType(), nullable=True), + StructField("relationship", StringType(), nullable=True), + ] + ), + "License": StructType( + [ + StructField("license_id", StringType(), nullable=False), + StructField("id", StringType(), nullable=True), + StructField("name", StringType(), nullable=True), + StructField("url", StringType(), nullable=True), + ] + ), + "Measurement": StructType( + [ + StructField("measurement_id", StringType(), nullable=False), + StructField("measurement_set_id", StringType(), nullable=False), + StructField("experiment_condition_set_id", StringType(), nullable=False), + StructField("value", StringType(), nullable=True), + ] + ), + "MeasurementSet": StructType( + [ + StructField("measurement_set_id", StringType(), nullable=False), + StructField("variable_id", StringType(), nullable=False), + StructField("quality", StringType(), nullable=True), + StructField("created_at", DateType(), nullable=True), + ] + ), + "MixsEnvironmentalContext": StructType( + [ + StructField("mixs_environmental_context_id", StringType(), nullable=False), + StructField("env_broad_scale", StringType(), nullable=True), + StructField("env_local_scale", StringType(), nullable=True), + StructField("env_medium", StringType(), nullable=True), + ] + ), + "Name": StructType( + 
[ + StructField("entity_id", StringType(), nullable=False), + StructField("name", StringType(), nullable=False), + StructField("description", StringType(), nullable=True), + StructField("source", StringType(), nullable=True), + ] + ), + "OrderedProtocolStep": StructType( + [ + StructField("protocol_id", StringType(), nullable=False), + StructField("protocol_step_id", StringType(), nullable=False), + StructField("step_index", IntegerType(), nullable=False), + ] + ), + "Parameter": StructType( + [ + StructField("parameter_id", StringType(), nullable=False), + StructField("name", StringType(), nullable=True), + StructField("description", StringType(), nullable=True), + StructField("value_type", StringType(), nullable=True), + StructField("required", BooleanType(), nullable=True), + StructField("cardinality", StringType(), nullable=True), + StructField("default", StringType(), nullable=True), + StructField("parameter_type", StringType(), nullable=True), + ] + ), + "Prefix": StructType( + [ + StructField("prefix", StringType(), nullable=True), + StructField("base", StringType(), nullable=True), + ] + ), + "Project": StructType( + [ + StructField("project_id", StringType(), nullable=False), + StructField("description", StringType(), nullable=True), + ] + ), + "Protein": StructType( + [ + StructField("protein_id", StringType(), nullable=False), + StructField("hash", StringType(), nullable=True), + StructField("description", StringType(), nullable=True), + StructField("evidence_for_existence", StringType(), nullable=True), + StructField("length", IntegerType(), nullable=True), + StructField("sequence", StringType(), nullable=True), + ] + ), + "Protocol": StructType( + [ + StructField("protocol_id", StringType(), nullable=False), + StructField("name", StringType(), nullable=True), + StructField("description", StringType(), nullable=True), + StructField("doi", StringType(), nullable=True), + StructField("url", StringType(), nullable=True), + StructField("version", StringType(), nullable=True), + ] + ), + "ProtocolExecution": StructType( + [ + StructField("protocol_execution_id", StringType(), nullable=False), + StructField("protocol_id", StringType(), nullable=False), + StructField("name", StringType(), nullable=True), + StructField("description", StringType(), nullable=True), + StructField("created_at", DateType(), nullable=True), + ] + ), + "ProtocolInput": StructType( + [ + StructField("parameter_id", StringType(), nullable=False), + StructField("protocol_input_id", StringType(), nullable=False), + StructField("protocol_execution_id", StringType(), nullable=False), + StructField("value", StringType(), nullable=False), + ] + ), + "ProtocolInputSet": StructType( + [ + StructField("protocol_input_id", StringType(), nullable=False), + StructField("protocol_input_set_id", StringType(), nullable=False), + ] + ), + "ProtocolOutput": StructType( + [ + StructField("protocol_output_id", StringType(), nullable=False), + StructField("protocol_input_set_id", StringType(), nullable=False), + StructField("value", StringType(), nullable=False), + ] + ), + "ProtocolStep": StructType( + [ + StructField("protocol_step_id", StringType(), nullable=False), + StructField("step", StringType(), nullable=True), + ] + ), + "ProtocolVariable": StructType( + [ + StructField("protocol_id", StringType(), nullable=False), + StructField("variable_id", StringType(), nullable=False), + ] + ), + "Publication": StructType( + [ + StructField("publication_id", StringType(), nullable=False), + ] + ), + "QuantityRangeValue": StructType( 
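+        # Shares the raw_value/type/attribute_* columns of the other *Value tables,
+        # plus range-specific bounds. A hypothetical row for "1-10 mg/L" on CDM:123:
+        #   (10.0, 1.0, None, None, "mg/L", "1-10 mg/L", None, None, None, None, "CDM:123")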
+ [ + StructField("maximum_numeric_value", FloatType(), nullable=False), + StructField("minimum_numeric_value", FloatType(), nullable=False), + StructField("unit_cv_id", StringType(), nullable=True), + StructField("unit_cv_label", StringType(), nullable=True), + StructField("unit_string", StringType(), nullable=True), + StructField("raw_value", StringType(), nullable=True), + StructField("type", StringType(), nullable=True), + StructField("attribute_cv_id", StringType(), nullable=True), + StructField("attribute_cv_label", StringType(), nullable=True), + StructField("attribute_string", StringType(), nullable=True), + StructField("entity_id", StringType(), nullable=False), + ] + ), + "QuantityValue": StructType( + [ + StructField("numeric_value", FloatType(), nullable=False), + StructField("unit_cv_id", StringType(), nullable=True), + StructField("unit_cv_label", StringType(), nullable=True), + StructField("unit_string", StringType(), nullable=True), + StructField("raw_value", StringType(), nullable=True), + StructField("type", StringType(), nullable=True), + StructField("attribute_cv_id", StringType(), nullable=True), + StructField("attribute_cv_label", StringType(), nullable=True), + StructField("attribute_string", StringType(), nullable=True), + StructField("entity_id", StringType(), nullable=False), + ] + ), + "ResourceDescription": StructType( + [ + StructField("resource_description_id", StringType(), nullable=False), + StructField("description_text", StringType(), nullable=False), + StructField("description_type", StringType(), nullable=True), + StructField("language", StringType(), nullable=True), + ] + ), + "ResourceTitle": StructType( + [ + StructField("resource_title_id", StringType(), nullable=False), + StructField("language", StringType(), nullable=True), + StructField("title", StringType(), nullable=False), + StructField("title_type", StringType(), nullable=True), + ] + ), + "Sample": StructType( + [ + StructField("sample_id", StringType(), nullable=False), + StructField("description", StringType(), nullable=True), + StructField("type", StringType(), nullable=True), + ] + ), + "Sequence": StructType( + [ + StructField("sequence_id", StringType(), nullable=False), + StructField("entity_id", StringType(), nullable=False), + StructField("type", StringType(), nullable=True), + StructField("length", IntegerType(), nullable=True), + StructField("checksum", StringType(), nullable=True), + ] + ), + "Statement": StructType( + [ + StructField("subject", StringType(), nullable=True), + StructField("predicate", StringType(), nullable=True), + StructField("object", StringType(), nullable=True), + StructField("value", StringType(), nullable=True), + StructField("datatype", StringType(), nullable=True), + StructField("language", StringType(), nullable=True), + ] + ), + "TextValue": StructType( + [ + StructField("text_value", StringType(), nullable=False), + StructField("language", StringType(), nullable=True), + StructField("raw_value", StringType(), nullable=True), + StructField("type", StringType(), nullable=True), + StructField("attribute_cv_id", StringType(), nullable=True), + StructField("attribute_cv_label", StringType(), nullable=True), + StructField("attribute_string", StringType(), nullable=True), + StructField("entity_id", StringType(), nullable=False), + ] + ), + "Variable": StructType( + [ + StructField("variable_id", StringType(), nullable=False), + StructField("name", StringType(), nullable=True), + StructField("description", StringType(), nullable=True), + 
StructField("name_cv_id", StringType(), nullable=True), + StructField("unit", StringType(), nullable=True), + StructField("value_type", StringType(), nullable=False), + ] + ), + "VariableValue": StructType( + [ + StructField("variable_value_id", StringType(), nullable=False), + StructField("variable_id", StringType(), nullable=False), + StructField("value_type", StringType(), nullable=True), + ] + ), +} diff --git a/src/cdm_data_loader_utils/parsers/uniref.py b/src/cdm_data_loader_utils/parsers/uniref.py index 6f24bb3..6e1cdf3 100644 --- a/src/cdm_data_loader_utils/parsers/uniref.py +++ b/src/cdm_data_loader_utils/parsers/uniref.py @@ -41,19 +41,18 @@ import os import uuid import xml.etree.ElementTree as ET -from datetime import datetime +from datetime import UTC, datetime +from pathlib import Path from urllib.error import URLError -from datetime import timezone from urllib.request import urlretrieve + import click from delta import configure_spark_with_delta_pip from pyspark.sql import SparkSession from pyspark.sql.types import StringType, StructField, StructType -from pathlib import Path from cdm_data_loader_utils.parsers.xml_utils import get_text, parse_properties - logger = logging.getLogger(__name__) @@ -102,7 +101,7 @@ def get_timestamps( if not uniref_id: raise ValueError("get_timestamps: uniref_id must be a non-empty string") - now_dt = now or datetime.now(timezone.utc) + now_dt = now or datetime.now(UTC) updated_time = now_dt.isoformat(timespec="seconds") created_time = existing_created.get(uniref_id) or updated_time @@ -187,7 +186,6 @@ def get_accession_and_seed(dbref: ET.Element | None, ns: dict[str, str]) -> tupl """ Extract UniProtKB accession and is_seed status from a dbReference element. """ - if dbref is None: return None, False diff --git a/tests/parsers/refseq_importer/__init__.py b/tests/parsers/refseq_importer/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/parsers/refseq_importer/test_spark_delta.py b/tests/parsers/refseq_importer/test_spark_delta.py index b5cd9d0..0f5751e 100644 --- a/tests/parsers/refseq_importer/test_spark_delta.py +++ b/tests/parsers/refseq_importer/test_spark_delta.py @@ -114,14 +114,12 @@ def test_write_delta_contig_collection_schema(spark) -> None: db = "cdmdb" spark.sql(f"CREATE DATABASE IF NOT EXISTS {db}") - schema = StructType( - [ - StructField("collection_id", StringType(), True), - StructField("contig_collection_type", StringType(), True), - StructField("ncbi_taxon_id", StringType(), True), - StructField("gtdb_taxon_id", StringType(), True), - ] - ) + schema = StructType([ + StructField("collection_id", StringType(), True), + StructField("contig_collection_type", StringType(), True), + StructField("ncbi_taxon_id", StringType(), True), + StructField("gtdb_taxon_id", StringType(), True), + ]) df = spark.createDataFrame( [("C1", "isolate", "NCBITaxon:123", None)], diff --git a/tests/parsers/refseq_importer/test_tables_finalize.py b/tests/parsers/refseq_importer/test_tables_finalize.py index c71911c..d9151fd 100644 --- a/tests/parsers/refseq_importer/test_tables_finalize.py +++ b/tests/parsers/refseq_importer/test_tables_finalize.py @@ -20,12 +20,10 @@ def spark(): # ------------------------------------------------------------------- @pytest.mark.requires_spark def test_list_of_dicts_to_spark(spark) -> None: - schema = StructType( - [ - StructField("a", StringType(), True), - StructField("b", StringType(), True), - ] - ) + schema = StructType([ + StructField("a", StringType(), True), + StructField("b", 
StringType(), True), + ]) rows = [{"a": "1", "b": "x"}, {"a": "2", "b": "y"}] df = list_of_dicts_to_spark(spark, rows, schema) @@ -40,15 +38,13 @@ def test_list_of_dicts_to_spark(spark) -> None: @pytest.mark.requires_spark def test_finalize_tables_basic(spark) -> None: # ---------- entity ---------- - e_schema = StructType( - [ - StructField("entity_id", StringType(), True), - StructField("entity_type", StringType(), True), - StructField("data_source", StringType(), True), - StructField("created", StringType(), True), - StructField("updated", StringType(), True), - ] - ) + e_schema = StructType([ + StructField("entity_id", StringType(), True), + StructField("entity_type", StringType(), True), + StructField("data_source", StringType(), True), + StructField("created", StringType(), True), + StructField("updated", StringType(), True), + ]) e1 = spark.createDataFrame( [Row(entity_id="E1", entity_type="genome", data_source="RefSeq", created="2020", updated="2021")], @@ -60,14 +56,12 @@ def test_finalize_tables_basic(spark) -> None: ) # ---------- contig_collection (schema REQUIRED due to None!) ---------- - coll_schema = StructType( - [ - StructField("collection_id", StringType(), True), - StructField("contig_collection_type", StringType(), True), - StructField("ncbi_taxon_id", StringType(), True), - StructField("gtdb_taxon_id", StringType(), True), - ] - ) + coll_schema = StructType([ + StructField("collection_id", StringType(), True), + StructField("contig_collection_type", StringType(), True), + StructField("ncbi_taxon_id", StringType(), True), + StructField("gtdb_taxon_id", StringType(), True), + ]) c1 = spark.createDataFrame( [ diff --git a/tests/parsers/test_annotation_parse.py b/tests/parsers/test_annotation_parse.py new file mode 100644 index 0000000..35c9ffe --- /dev/null +++ b/tests/parsers/test_annotation_parse.py @@ -0,0 +1,710 @@ +import json +from pathlib import Path +import pytest + +from cdm_data_loader_utils.parsers.annotation_parse import ( + load_contig_collection_x_feature, + load_contig_collection_x_protein, + load_contig_x_contig_collection, + load_contigs, + load_feature_records, + load_feature_x_protein, + load_identifiers, + load_names, + apply_prefix, + to_int, +) + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + ( + { + "reports": [ + { + "annotation": { + "gene_id": "1234", + "name": "hypothetical protein", + "relationship": "RefSeq gene symbol", + } + } + ] + }, + [ + ( + "ncbigene:1234", + "1234", + "hypothetical protein", + "RefSeq", + "RefSeq gene symbol", + ) + ], + ), + ( + {"reports": [{"annotation": {"gene_id": "5678", "name": "some protein"}}]}, + [("ncbigene:5678", "5678", "some protein", "RefSeq", None)], + ), + ( + { + "reports": [ + { + "annotation": { + "name": "no gene id here", + "relationship": "RefSeq locus tag", + } + } + ] + }, + [], + ), + ( + { + "reports": [ + { + "annotation": { + "gene_id": "1001", + "name": "abc", + "relationship": "RefSeq gene symbol", + } + }, + {"annotation": {"gene_id": "1002", "name": "xyz"}}, + ] + }, + [ + ("ncbigene:1001", "1001", "abc", "RefSeq", "RefSeq gene symbol"), + ("ncbigene:1002", "1002", "xyz", "RefSeq", None), + ], + ), + ], +) +def test_load_identifiers(input_data, expected_output): + result = load_identifiers(input_data) + assert result == expected_output + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + # Case 1: all name fields present + ( + { + "reports": [ + { + "annotation": { + "gene_id": "1234", + "symbol": "abc", + "name": "ABC protein", + "locus_tag": 
"LTG_1234", + } + } + ] + }, + [ + ("ncbigene:1234", "abc", "RefSeq gene symbol", "RefSeq"), + ("ncbigene:1234", "ABC protein", "RefSeq gene name", "RefSeq"), + ("ncbigene:1234", "LTG_1234", "RefSeq locus tag", "RefSeq"), + ], + ), + # Case 2: only gene_name present + ( + {"reports": [{"annotation": {"gene_id": "5678", "name": "Hypothetical protein"}}]}, + [ + ( + "ncbigene:5678", + "Hypothetical protein", + "RefSeq gene name", + "RefSeq", + ) + ], + ), + # Case 3: no gene_id + ( + {"reports": [{"annotation": {"name": "Unnamed", "symbol": "XYZ"}}]}, + [], + ), + # Case 4: only locus_tag present + ( + {"reports": [{"annotation": {"gene_id": "8888", "locus_tag": "LTG_8888"}}]}, + [("ncbigene:8888", "LTG_8888", "RefSeq locus tag", "RefSeq")], + ), + # Case 5: multiple reports + ( + { + "reports": [ + {"annotation": {"gene_id": "1001", "symbol": "DEF"}}, + {"annotation": {"gene_id": "1002", "name": "DEF protein"}}, + ] + }, + [ + ("ncbigene:1001", "DEF", "RefSeq gene symbol", "RefSeq"), + ("ncbigene:1002", "DEF protein", "RefSeq gene name", "RefSeq"), + ], + ), + ], +) +def test_load_names(input_data, expected_output): + result = load_names(input_data) + assert sorted(result) == sorted(expected_output) + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + # Case 1: basic valid input with plus strand + ( + { + "reports": [ + { + "annotation": { + "gene_id": "1234", + "genomic_regions": [ + { + "gene_range": { + "range": [ + { + "begin": "100", + "end": "200", + "orientation": "plus", + } + ] + } + } + ], + } + } + ] + }, + [ + ( + "ncbigene:1234", + None, + None, + None, + 200, + None, + 100, + "positive", + "RefSeq", + None, + "gene", + ) + ], + ), + # Case 2: multiple ranges, different strands + ( + { + "reports": [ + { + "annotation": { + "gene_id": "5678", + "genomic_regions": [ + { + "gene_range": { + "range": [ + { + "begin": "300", + "end": "500", + "orientation": "minus", + }, + { + "begin": "600", + "end": "800", + "orientation": "plus", + }, + ] + } + } + ], + } + } + ] + }, + [ + ( + "ncbigene:5678", + None, + None, + None, + 500, + None, + 300, + "negative", + "RefSeq", + None, + "gene", + ), + ( + "ncbigene:5678", + None, + None, + None, + 800, + None, + 600, + "positive", + "RefSeq", + None, + "gene", + ), + ], + ), + # Case 3: missing orientation + ( + { + "reports": [ + { + "annotation": { + "gene_id": "9999", + "genomic_regions": [{"gene_range": {"range": [{"begin": "1", "end": "2"}]}}], + } + } + ] + }, + [ + ( + "ncbigene:9999", + None, + None, + None, + 2, + None, + 1, + "unknown", + "RefSeq", + None, + "gene", + ) + ], + ), + # Case 4: no gene_id + ( + { + "reports": [ + { + "annotation": { + "genomic_regions": [ + { + "gene_range": { + "range": [ + { + "begin": "100", + "end": "200", + "orientation": "plus", + } + ] + } + } + ] + } + } + ] + }, + [], + ), + # Case 5: non-integer start/end + ( + { + "reports": [ + { + "annotation": { + "gene_id": "1111", + "genomic_regions": [ + { + "gene_range": { + "range": [ + { + "begin": "abc", + "end": "xyz", + "orientation": "plus", + } + ] + } + } + ], + } + } + ] + }, + [ + ( + "ncbigene:1111", + None, + None, + None, + None, + None, + None, + "positive", + "RefSeq", + None, + "gene", + ) + ], + ), + ], +) +def test_load_feature_records(input_data, expected_output): + result = load_feature_records(input_data) + assert sorted(result) == sorted(expected_output) + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + # Case 1: valid mapping + ( + { + "reports": [ + { + "annotation": { + "gene_id": 
"12345", + "genomic_regions": [{"gene_range": {"accession_version": "NC_000001.11"}}], + } + } + ] + }, + [("refseq:NC_000001.11", "ncbigene:12345")], + ), + # Case 2: no gene_id + ( + {"reports": [{"annotation": {"genomic_regions": [{"gene_range": {"accession_version": "NC_000002.11"}}]}}]}, + [], + ), + # Case 3: no genomic_regions + ( + {"reports": [{"annotation": {"gene_id": "67890"}}]}, + [], + ), + # Case 4: empty genomic_regions list + ( + {"reports": [{"annotation": {"gene_id": "99999", "genomic_regions": []}}]}, + [], + ), + # Case 5: missing accession_version + ( + { + "reports": [ + { + "annotation": { + "gene_id": "13579", + "genomic_regions": [{"gene_range": {}}], + } + } + ] + }, + [], + ), + ], +) +def test_load_contig_collection_x_feature(input_data, expected_output): + result = load_contig_collection_x_feature(input_data) + assert result == expected_output + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + # Case 1: Valid report with multiple proteins + ( + { + "reports": [ + { + "annotation": { + "proteins": [ + {"accession_version": "XP_123"}, + {"accession_version": "XP_456"}, + ], + "annotations": [{"assembly_accession": "GCF_000001"}], + } + } + ] + }, + [ + ("insdc.gcf:GCF_000001", "refseq:XP_123"), + ("insdc.gcf:GCF_000001", "refseq:XP_456"), + ], + ), + # Case 2: No proteins + ( + { + "reports": [ + { + "annotation": { + "proteins": [], + "annotations": [{"assembly_accession": "GCF_000002"}], + } + } + ] + }, + [], + ), + # Case 3: No annotations + ( + {"reports": [{"annotation": {"proteins": [{"accession_version": "XP_789"}]}}]}, + [], + ), + # Case 4: Missing assembly_accession + ( + { + "reports": [ + { + "annotation": { + "proteins": [{"accession_version": "XP_789"}], + "annotations": [{}], + } + } + ] + }, + [], + ), + # Case 5: Some proteins missing accession_version + ( + { + "reports": [ + { + "annotation": { + "proteins": [ + {"accession_version": "XP_111"}, + {}, + {"accession_version": "XP_222"}, + ], + "annotations": [{"assembly_accession": "GCF_000003"}], + } + } + ] + }, + [ + ("insdc.gcf:GCF_000003", "refseq:XP_111"), + ("insdc.gcf:GCF_000003", "refseq:XP_222"), + ], + ), + ], +) +def test_load_contig_collection_x_protein(input_data, expected_output): + result = load_contig_collection_x_protein(input_data) + assert sorted(result) == sorted(expected_output) + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + # Case 1: valid gene with multiple proteins + ( + { + "reports": [ + { + "annotation": { + "gene_id": "4156311", + "proteins": [ + {"accession_version": "XP_001"}, + {"accession_version": "XP_002"}, + ], + } + } + ] + }, + [ + ("ncbigene:4156311", "refseq:XP_001"), + ("ncbigene:4156311", "refseq:XP_002"), + ], + ), + # Case 2: no gene_id + ( + {"reports": [{"annotation": {"proteins": [{"accession_version": "XP_999"}]}}]}, + [], + ), + # Case 3: gene with no proteins + ( + {"reports": [{"annotation": {"gene_id": "4156312"}}]}, + [], + ), + # Case 4: some proteins missing accession_version + ( + { + "reports": [ + { + "annotation": { + "gene_id": "4156313", + "proteins": [ + {"accession_version": "XP_777"}, + {}, + {"accession_version": "XP_888"}, + ], + } + } + ] + }, + [ + ("ncbigene:4156313", "refseq:XP_777"), + ("ncbigene:4156313", "refseq:XP_888"), + ], + ), + # Case 5: empty report list + ({"reports": []}, []), + ], +) +def test_load_feature_x_protein(input_data, expected_output): + result = load_feature_x_protein(input_data) + assert sorted(result) == sorted(expected_output) + + +@pytest.mark.parametrize( 
+ "input_data, expected_output", + [ + # Case 1: Valid contig and assembly + ( + { + "reports": [ + { + "annotation": { + "genomic_regions": [{"gene_range": {"accession_version": "NC_000001.11"}}], + "annotations": [{"assembly_accession": "GCF_000001.1"}], + } + } + ] + }, + [("refseq:NC_000001.11", "insdc.gcf:GCF_000001.1")], + ), + # Case 2: Missing genomic_regions + ( + {"reports": [{"annotation": {"annotations": [{"assembly_accession": "GCF_000002.1"}]}}]}, + [], + ), + # Case 3: Missing annotations + ( + {"reports": [{"annotation": {"genomic_regions": [{"gene_range": {"accession_version": "NC_000003.11"}}]}}]}, + [], + ), + # Case 4: Missing accession_version in region + ( + { + "reports": [ + { + "annotation": { + "genomic_regions": [{"gene_range": {}}], + "annotations": [{"assembly_accession": "GCF_000004.1"}], + } + } + ] + }, + [], + ), + # Case 5: Missing assembly_accession in annotations + ( + { + "reports": [ + { + "annotation": { + "genomic_regions": [{"gene_range": {"accession_version": "NC_000005.11"}}], + "annotations": [{}], + } + } + ] + }, + [], + ), + # Case 6: Multiple reports, one valid + ( + { + "reports": [ + { + "annotation": { + "genomic_regions": [{"gene_range": {"accession_version": "NC_000006.11"}}], + "annotations": [{"assembly_accession": "GCF_000006.1"}], + } + }, + { + "annotation": { + "genomic_regions": [{"gene_range": {"accession_version": "NC_000007.11"}}], + "annotations": [{}], + } + }, + ] + }, + [("refseq:NC_000006.11", "insdc.gcf:GCF_000006.1")], + ), + ], +) +def test_load_contig_x_contig_collection(input_data, expected_output): + result = load_contig_x_contig_collection(input_data) + assert sorted(result) == sorted(expected_output) + + +@pytest.mark.parametrize( + "input_data, expected_output", + [ + # Case 1: Valid contig with accession_version + ( + {"reports": [{"annotation": {"genomic_regions": [{"gene_range": {"accession_version": "NC_000001.11"}}]}}]}, + [("refseq:NC_000001.11", None, None, None)], + ), + # Case 2: Multiple contigs, different accession_versions + ( + { + "reports": [ + { + "annotation": { + "genomic_regions": [ + {"gene_range": {"accession_version": "NC_000001.11"}}, + {"gene_range": {"accession_version": "NC_000002.12"}}, + ] + } + } + ] + }, + [ + ("refseq:NC_000001.11", None, None, None), + ("refseq:NC_000002.12", None, None, None), + ], + ), + # Case 3: Duplicate accession versions + ( + { + "reports": [ + { + "annotation": { + "genomic_regions": [ + {"gene_range": {"accession_version": "NC_000003.13"}}, + {"gene_range": {"accession_version": "NC_000003.13"}}, + ] + } + } + ] + }, + [("refseq:NC_000003.13", None, None, None)], + ), + # Case 4: Missing accession_version + ( + {"reports": [{"annotation": {"genomic_regions": [{"gene_range": {}}]}}]}, + [], + ), + # Case 5: Empty reports + ( + {"reports": []}, + [], + ), + ], +) +def test_load_contigs(input_data, expected_output): + result = load_contigs(input_data) + assert sorted(result) == sorted(expected_output) + + +### add new test: to_int +@pytest.mark.parametrize( + "input_id, expected", + [ + ("GeneID:123", "ncbigene:123"), + ("YP_009725307.1", "refseq:YP_009725307.1"), + ("GCF_000001405.39", "insdc.gcf:GCF_000001405.39"), + ("random", "random"), + ], +) +def test_apply_prefix(input_id, expected): + assert apply_prefix(input_id) == expected + + +@pytest.mark.parametrize("val, expected", [("123", 123), ("abc", None), ("", None)]) +def test_to_int(val, expected): + assert to_int(val) == expected diff --git a/tests/parsers/test_uniprot.py 
b/tests/parsers/test_uniprot.py index 86ffca3..105fb76 100644 --- a/tests/parsers/test_uniprot.py +++ b/tests/parsers/test_uniprot.py @@ -1,4 +1,4 @@ -""" +"""Tests for the UniProt parser. This file uses pytest to provide parameterized and functional tests for all major UniProt parsing utility functions, ensuring correct parsing and transformation of @@ -16,729 +16,756 @@ - parse_uniprot_entry: Full record parsing, all fields together How to run in the terminal: - pytest tests/uniprot_refactor/test_uniprot_parsers.py + PYTHONPATH=src pytest tests/test_uniprot.py """ import datetime -import json +import re import xml.etree.ElementTree as ET -from pathlib import Path +from typing import Any import pytest from cdm_data_loader_utils.parsers.uniprot import ( build_datasource_record, + generate_cdm_id, parse_associations, - parse_cross_references, parse_evidence_map, parse_identifiers, parse_names, parse_protein_info, - save_datasource_record, + parse_publications, + parse_uniprot_entry, ) -NS_URI = "https://uniprot.org/uniprot" - +# Regular expression to validate UUID format +UUID_PATTERN = re.compile(r"^[a-f0-9]{8}-[a-f0-9]{4}-[1-5][a-f0-9]{3}-[89ab][a-f0-9]{3}-[a-f0-9]{12}$", re.IGNORECASE) -@pytest.fixture( - params=[ - "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz", - "http://example.org/uniprot_test.xml.gz", - ] -) -def xml_url(request): - return request.param +@pytest.mark.parametrize("n", range(5)) +def test_generate_cdm_id_format(n: int) -> None: + uuid = generate_cdm_id() + assert uuid.startswith("CDM:") + uuid_str = uuid[4:] + assert UUID_PATTERN.match(uuid_str), f"{uuid_str} is not a valid UUID" -def test_build_datasource_record(xml_url): - record = build_datasource_record(xml_url) - # ---- basic structure ---- +## build_datasource_record ## +def test_build_datasource_record() -> None: + url = "https://example.com/uniprot.xml.gz" + record = build_datasource_record(url) assert isinstance(record, dict) - - # ---- fixed fields ---- + assert set(record.keys()) == {"name", "source", "url", "accessed", "version"} assert record["name"] == "UniProt import" assert record["source"] == "UniProt" - assert record["url"] == xml_url + assert record["url"] == url + + # check accessed + accessed_dt = datetime.datetime.fromisoformat(record["accessed"]) + now = datetime.datetime.now(datetime.UTC) + delta = abs((now - accessed_dt).total_seconds()) + assert delta < 10 assert record["version"] == 115 - # ---- accessed field ---- - accessed = record.get("accessed") - assert accessed is not None - parsed = datetime.datetime.fromisoformat(accessed) - assert parsed.tzinfo is not None - assert parsed.tzinfo == datetime.UTC +@pytest.mark.parametrize("bad_url", [None, ""]) +def test_build_datasource_record_bad(bad_url: str | None) -> None: + record = build_datasource_record(bad_url) + assert record["url"] == bad_url -def test_save_datasource_record(tmp_path: Path, xml_url): - """ - save_datasource_record should: - - create output directory if missing - - write datasource.json - - return the same content that is written to disk +## parse_identifiers function test ## +@pytest.mark.parametrize( + ("xml_str", "cdm_id", "expected"), + [ + ### multiple accessions, expect two dict, every dic use the same cdm_id + ### identifier according to number + ( + """ + + Q9V2L2 + G8ZFP4 + + """, + "CDM:001", + [ + { + "entity_id": "CDM:001", + "identifier": "UniProt:Q9V2L2", + "source": "UniProt", + "description": "UniProt accession", + }, + { + "entity_id": "CDM:001", 
+ "identifier": "UniProt:G8ZFP4", + "source": "UniProt", + "description": "UniProt accession", + }, + ], + ), + ### Use single accession + ( + """ + + X00001 + + """, + "CDM:002", + [ + { + "entity_id": "CDM:002", + "identifier": "UniProt:X00001", + "source": "UniProt", + "description": "UniProt accession", + } + ], + ), + ### No accession + ( + """ + + + """, + "CDM:003", + [], + ), + ], +) +def test_parse_identifiers(xml_str: str, cdm_id: str, expected: list[dict[str, str]]) -> None: """ - output_dir = tmp_path / "output" - - # ---- call function ---- - result = save_datasource_record(xml_url, str(output_dir)) + This approach ensures that parse_identifiers correctly parses and structures identifier data. - # ---- return value sanity ---- - assert isinstance(result, dict) - assert result["url"] == xml_url - assert result["source"] == "UniProt" - assert result["name"] == "UniProt import" - assert "accessed" in result - assert "version" in result + The parsed Element object and the provided CDM_id are passed to the parse_identifiers funtion. + The function is expected to extract all relevant identifier information from the XML and return list of dict. - # ---- file existence ---- - output_file = output_dir / "datasource.json" - assert output_file.exists() - assert output_file.is_file() - - # ---- file content correctness ---- - with open(output_file, encoding="utf-8") as f: - on_disk = json.load(f) - - assert on_disk == result + The test compares the result output with the predefined expected result using an assert statement. + """ + entry = ET.fromstring(xml_str) + result = parse_identifiers(entry, cdm_id) + assert result == expected -def make_entry(names=None, protein_names=None): - entry = ET.Element(f"{{{NS_URI}}}entry") - # - for n in names or []: - e = ET.SubElement(entry, f"{{{NS_URI}}}name") - e.text = n +""" + This parameterized pytest function tests the correctness of the parse_names function for various UniProt XML entry scenarios. 
-    # <protein> block
-    if protein_names:
-        protein = ET.SubElement(entry, f"{{{NS_URI}}}protein")
+    xml_str: XML string representing a UniProt entry with different protein names:
+        a top-level <name>,
+        recommended names,
+        alternative names,
+        combinations,
+        no names

-        for tag, logical in [
-            ("recommendedName", "recommended"),
-            ("alternativeName", "alternative"),
-        ]:
-            if logical not in protein_names:
-                continue
+    cdm_id: CDM entry ID

-            block = ET.SubElement(protein, f"{{{NS_URI}}}{tag}")
-            for xml_tag in ["fullName", "shortName"]:
-                val = protein_names[logical].get(xml_tag.replace("Name", ""))
-                if val:
-                    e = ET.SubElement(block, f"{{{NS_URI}}}{xml_tag}")
-                    e.text = val
+    Output:
+    A list of name records with their metadata

-    return entry
+"""

+## parse_names function test ##
@pytest.mark.parametrize(
-    "entry_kwargs, cdm_id, expected",
+    ("xml_str", "cdm_id", "expected"),
    [
-        # Only <name>
+        # Only a top-level <name>
        (
-            {"names": ["ProteinA"]},
-            "cdm_1",
-            {
-                ("ProteinA", "UniProt entry name"),
-            },
+            """
+            <entry xmlns="https://uniprot.org/uniprot">
+                <name>MainProteinName</name>
+            </entry>
+            """,
+            "CDM:001",
+            [
+                {
+                    "entity_id": "CDM:001",
+                    "name": "MainProteinName",
+                    "description": "UniProt protein name",
+                    "source": "UniProt",
+                }
+            ],
        ),
-        # entry name + recommended full name
+        # RecommendedName (fullName and shortName)
        (
-            {
-                "names": ["ProteinB"],
-                "protein_names": {
-                    "recommended": {"full": "Rec Full B", "short": None},
+            """
+            <entry xmlns="https://uniprot.org/uniprot">
+                <protein>
+                    <recommendedName>
+                        <fullName>RecFullName</fullName>
+                        <shortName>RecShort</shortName>
+                    </recommendedName>
+                </protein>
+            </entry>
+            """,
+            "CDM:002",
+            [
+                {
+                    "entity_id": "CDM:002",
+                    "name": "RecFullName",
+                    "description": "UniProt recommended full name",
+                    "source": "UniProt",
                },
-            },
-            "cdm_2",
-            {
-                ("ProteinB", "UniProt entry name"),
-                ("Rec Full B", "UniProt recommended full name"),
-            },
+                {
+                    "entity_id": "CDM:002",
+                    "name": "RecShort",
+                    "description": "UniProt recommended short name",
+                    "source": "UniProt",
+                },
+            ],
        ),
-        # everything
+        # AlternativeName (fullName and shortName)
        (
-            {
-                "names": ["ProteinC"],
-                "protein_names": {
-                    "recommended": {"full": "Rec Full C", "short": "Rec Short C"},
-                    "alternative": {"full": "Alt Full C", "short": "Alt Short C"},
+            """
+            <entry xmlns="https://uniprot.org/uniprot">
+                <protein>
+                    <alternativeName>
+                        <fullName>AltFullName1</fullName>
+                        <shortName>AltShort1</shortName>
+                    </alternativeName>
+                    <alternativeName>
+                        <fullName>AltFullName2</fullName>
+                    </alternativeName>
+                </protein>
+            </entry>
+            """,
+            "CDM:003",
+            [
+                {
+                    "entity_id": "CDM:003",
+                    "name": "AltFullName1",
+                    "description": "UniProt alternative full name",
+                    "source": "UniProt",
                },
-            },
-            "cdm_3",
-            {
-                ("ProteinC", "UniProt entry name"),
-                ("Rec Full C", "UniProt recommended full name"),
-                ("Rec Short C", "UniProt recommended short name"),
-                ("Alt Full C", "UniProt alternative full name"),
-                ("Alt Short C", "UniProt alternative short name"),
-            },
+                {
+                    "entity_id": "CDM:003",
+                    "name": "AltShort1",
+                    "description": "UniProt alternative short name",
+                    "source": "UniProt",
+                },
+                {
+                    "entity_id": "CDM:003",
+                    "name": "AltFullName2",
+                    "description": "UniProt alternative full name",
+                    "source": "UniProt",
+                },
+            ],
+        ),
+        # Mixed: top-level <name> and <recommendedName>
+        (
+            """
+            <entry xmlns="https://uniprot.org/uniprot">
+                <name>TopLevel</name>
+                <protein>
+                    <recommendedName>
+                        <fullName>MixedFull</fullName>
+                    </recommendedName>
+                </protein>
+            </entry>
+            """,
+            "CDM:004",
+            [
+                {
+                    "entity_id": "CDM:004",
+                    "name": "TopLevel",
+                    "description": "UniProt protein name",
+                    "source": "UniProt",
+                },
+                {
+                    "entity_id": "CDM:004",
+                    "name": "MixedFull",
+                    "description": "UniProt recommended full name",
+                    "source": "UniProt",
+                },
+            ],
+        ),
+        # No names at all
+        (
+            """
+            <entry xmlns="https://uniprot.org/uniprot">
+            </entry>
+            """,
+            "CDM:005",
+            [],
        ),
    ],
)
-def test_parse_names_parametrized(entry_kwargs, cdm_id, expected):
-    entry = make_entry(**entry_kwargs)
+def test_parse_names(xml_str: str, cdm_id: str, expected: list[dict[str, str]]) -> None:
+    entry = ET.fromstring(xml_str)
+    result = parse_names(entry, cdm_id)
+    assert result == expected
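For orientation, the identifier and name records asserted above share one flat shape, keyed by the entry's CDM id. Below is a minimal sketch of the behaviour these tests pin down, written as a hypothetical re-implementation for illustration only (the real functions live in cdm_data_loader_utils.parsers.uniprot; the namespace is assumed from this file's fixtures):

import xml.etree.ElementTree as ET

NS = {"up": "https://uniprot.org/uniprot"}  # assumed namespace, matching the fixtures above

def parse_identifiers_sketch(entry: ET.Element, cdm_id: str) -> list[dict[str, str]]:
    # One record per <accession> element, all sharing the entry's CDM id.
    return [
        {
            "entity_id": cdm_id,
            "identifier": f"UniProt:{acc.text}",
            "source": "UniProt",
            "description": "UniProt accession",
        }
        for acc in entry.findall("up:accession", NS)
    ]

parse_names follows the same pattern but walks the top-level <name> plus the recommendedName/alternativeName blocks under <protein>, varying only the description string.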
# ---- row count ---- - assert len(rows) == len(expected) +""" + + This test ensures parse_protein_info works correctly for different combinations of data + Including cases with no protein info, sequence only, existence only or EC numbers + + This approach thoroughly validates that parse_protein_info can accurately extract, combine and structure metadata field. - # ---- content ---- - observed = {(r["name"], r["description"]) for r in rows} - assert observed == expected + Include: + EC Number, + existence evidence, + sequence - # ---- entity_id and source ---- - for r in rows: - assert r["entity_id"] == cdm_id - assert r["source"] == "UniProt" +""" +## parse_protein_info function test ## @pytest.mark.parametrize( - "build_entry, cdm_id, expected", + ("xml_str", "cdm_id", "expected"), [ - # -------------------------------------------------- - # Empty entry -> None - # -------------------------------------------------- + # There are multiple ecNumbers under the recommend names ( - lambda: ET.Element(f"{{{NS_URI}}}entry"), - "cdm_1", - None, + """ + + + 1.2.3.4 + 5.6.7.8 + + + """, + "CDM:001", + {"ec_numbers": ["1.2.3.4", "5.6.7.8"]}, + ), + # alternativeName has EC Number + ( + """ + + + 3.3.3.3 + + + """, + "CDM:002", + {"ec_numbers": ["3.3.3.3"]}, ), - # -------------------------------------------------- - # Only EC numbers - # -------------------------------------------------- + # If have both proteinExistence evidence and existence ( - lambda: ( - lambda entry: ( - ET.SubElement( - ET.SubElement( - ET.SubElement(entry, f"{{{NS_URI}}}protein"), - f"{{{NS_URI}}}recommendedName", - ), - f"{{{NS_URI}}}ecNumber", - ).__setattr__("text", "1.1.1.1"), - entry, - )[1] - )(ET.Element(f"{{{NS_URI}}}entry")), - "cdm_2", + """ + + """, + "CDM:003", { - "ec_numbers": "1.1.1.1", + "protein_id": "CDM:003", + "evidence_for_existence": "evidence at protein level", }, ), - # -------------------------------------------------- - # Only sequence + entry modified - # -------------------------------------------------- + # Sequence only ( - lambda: ( - lambda entry: ( - entry.set("modified", "2024-01-01"), - ET.SubElement( - entry, - f"{{{NS_URI}}}sequence", - { - "length": "100", - "mass": "12345", - "checksum": "ABC", - "version": "2", - }, - ).__setattr__("text", "MKTIIALSY"), - entry, - )[2] - )(ET.Element(f"{{{NS_URI}}}entry")), - "cdm_3", + """ + + MAGNLSKVAAVSGVAAAVLGK + + """, + "CDM:004", { - "length": "100", + "length": "357", "mass": "12345", - "checksum": "ABC", + "checksum": "ABCD", + "modified": "2024-05-21", "sequence_version": "2", - "sequence": "MKTIIALSY", - "entry_modified": "2024-01-01", + "sequence": "MAGNLSKVAAVSGVAAAVLGK", }, ), - # -------------------------------------------------- - # Everything - # -------------------------------------------------- + # Combine with three elements: proteinExistence, sequence and ecNumbers ( - lambda: ( - lambda entry: ( - entry.set("modified", "2024-02-02"), - # protein + EC - ET.SubElement( - ET.SubElement( - ET.SubElement(entry, f"{{{NS_URI}}}protein"), - f"{{{NS_URI}}}recommendedName", - ), - f"{{{NS_URI}}}ecNumber", - ).__setattr__("text", "3.5.4.4"), - # proteinExistence - ET.SubElement( - entry, - f"{{{NS_URI}}}proteinExistence", - {"type": "evidence at protein level"}, - ), - # sequence - ET.SubElement( - entry, - f"{{{NS_URI}}}sequence", - { - "length": "250", - "mass": "99999", - "checksum": "XYZ", - "modified": "2023-12-01", - "version": "1", - }, - ).__setattr__("text", "MADEUPSEQUENCE"), - entry, - )[4] - )(ET.Element(f"{{{NS_URI}}}entry")), 
- "cdm_4", + """ + + + 3.3.3.3 + + + 8.8.8.8 + + + + + MKTLLTGAAT + + """, + "CDM:005", { - "ec_numbers": "3.5.4.4", - "protein_id": "cdm_4", - "evidence_for_existence": "evidence at protein level", - "length": "250", - "mass": "99999", + "ec_numbers": ["3.3.3.3", "8.8.8.8"], + "protein_id": "CDM:005", + "evidence_for_existence": "evidence at transcript level", + "length": "10", + "mass": "1000", "checksum": "XYZ", - "modified": "2023-12-01", + "modified": "2021-12-01", "sequence_version": "1", - "sequence": "MADEUPSEQUENCE", - "entry_modified": "2024-02-02", + "sequence": "MKTLLTGAAT", }, ), + # return None + ("""""", "CDM:006", None), ], ) -def test_parse_protein_info(build_entry, cdm_id, expected): - entry = build_entry() - +def test_parse_protein_info(xml_str: str, cdm_id: str, expected: dict[str, Any]) -> None: + entry = ET.fromstring(xml_str) result = parse_protein_info(entry, cdm_id) + assert result == expected + - if expected is None: - assert result is None - else: - assert isinstance(result, dict) - assert result == expected +""" + + This parameterized pytest function verifies the behavior of the parse_evidence_map function + for different UniProt XML entry structures involving evidence elements. + + xml_str: Simulates a UniProt entry with various and sub-structures, + including cases with multiple evidence elements, missing sources, or no evidence at all. + + expected: A dictionary mapping evidence keys to their extracted details—such as evidence type, + supporting objects, and publication references. + + Ensure parse_evidence_map: + Accurately extract evidence keys and types + Correctly classify supporting objects and publication references + Handle entries with absent sources or evidence elements + Represent all relevant evidence metadata in the required structure + +""" +## parse_evidence_map function test ## @pytest.mark.parametrize( - "build_xml, expected", + ("xml_str", "expected"), [ - # -------------------------------------------------- - # No evidence elements - # -------------------------------------------------- + # Single evidence,include PubMed and supporting object ( - lambda: ET.Element(f"{{{NS_URI}}}entry"), - {}, + """ + + + + + + + """, + { + "1": { + "evidence_type": "ECO:0000255", + "supporting_objects": ["Ensembl:ENSG00001"], + "publications": ["PMID:123456"], + } + }, ), - # -------------------------------------------------- - # Evidence without key - # -------------------------------------------------- + # multiple evidences ( - lambda: ( - lambda entry: ( - ET.SubElement(entry, f"{{{NS_URI}}}evidence", {"type": "ECO:0000269"}), - entry, - )[1] - )(ET.Element(f"{{{NS_URI}}}entry")), - {}, + """ + + + + + + + + + + + """, + { + "E1": { + "evidence_type": "ECO:0000313", + "supporting_objects": None, + "publications": ["PMID:654321"], + }, + "E2": { + "evidence_type": "ECO:0000250", + "supporting_objects": ["PDB:2N7Q"], + "publications": None, + }, + }, ), - # -------------------------------------------------- - # Evidence with key, no source - # -------------------------------------------------- + # no source ( - lambda: ( - lambda entry: ( - ET.SubElement( - entry, - f"{{{NS_URI}}}evidence", - {"key": "1", "type": "ECO:0000313"}, - ), - entry, - )[1] - )(ET.Element(f"{{{NS_URI}}}entry")), + """ + + """, { - "1": { - "evidence_type": "ECO:0000313", + "X1": { + "evidence_type": "ECO:9999999", + "supporting_objects": None, + "publications": None, } }, ), - # -------------------------------------------------- - # Evidence with PUBMED with other refs - # 
-------------------------------------------------- + # no evidence + ( + """ + """, + {}, + ), + # one evidence with multiple supporting objects ( - lambda: ( - lambda entry: ( - lambda ev: ( - ET.SubElement( - ET.SubElement(ev, f"{{{NS_URI}}}source"), - f"{{{NS_URI}}}dbReference", - {"type": "PubMed", "id": "12345"}, - ), - ET.SubElement( - ET.SubElement(ev, f"{{{NS_URI}}}source"), - f"{{{NS_URI}}}dbReference", - {"type": "GO", "id": "GO:0008150"}, - ), - entry, - )[2] - )( - ET.SubElement( - entry, - f"{{{NS_URI}}}evidence", - {"key": "E2", "type": "ECO:0000269"}, - ) - ) - )(ET.Element(f"{{{NS_URI}}}entry")), + """ + + + + + + + """, { - "E2": { + "K1": { "evidence_type": "ECO:0000269", - "publications": ["PMID:12345"], + "supporting_objects": ["Ensembl:ENS1", "RefSeq:RS123"], + "publications": None, } }, ), ], ) -def test_parse_evidence_map_parametrized(build_xml, expected): - entry = build_xml() +def test_parse_evidence_map(xml_str: str, expected: dict[str, Any]) -> None: + entry = ET.fromstring(xml_str) result = parse_evidence_map(entry) - - assert isinstance(result, dict) assert result == expected +""" + + xml_strings: models a UniProt entry with different types of possible associations + cdm_id: uniquely identifies the protein being parsed + evidence_map: supplies external evidence metadata for associations + expected: list of association dictionaries + + Arg: + The function correctly links proteins to organism taxonomy. + Cross-references are properly included, evidence metadata is correctly merged. + Associations derived from catalytic activity and cofactor comments are correctly generated. + All combinations and edge cases are handled robustly. + +""" + + +## parse_associations function test ## @pytest.mark.parametrize( - "build_xml, cdm_id, evidence_map, expected", + ("xml_str", "cdm_id", "evidence_map", "expected"), [ - # -------------------------------------------------- - # Taxonomy association only - # -------------------------------------------------- + # organism association(NCBI Taxonomy dbReference) ( - lambda: ( - lambda entry: ( - ET.SubElement( - ET.SubElement(entry, f"{{{NS_URI}}}organism"), - f"{{{NS_URI}}}dbReference", - {"type": "NCBI Taxonomy", "id": "1234"}, - ), - entry, - )[1] - )(ET.Element(f"{{{NS_URI}}}entry")), - "cdm_1", + """ + + + + """, + "CDM:1", {}, + [{"subject": "CDM:1", "object": "NCBITaxon:9606"}], + ), + # dbReference with evidence key + ( + """ + + """, + "CDM:2", + { + "E1": { + "evidence_type": "ECO:0000250", + "supporting_objects": ["Ensembl:ENS1"], + "publications": ["PMID:1234"], + } + }, [ { - "subject": "cdm_1", - "object": "NCBITaxon:1234", - "predicate": "in_taxon", + "subject": "CDM:2", + "object": "PDB:2N7Q", + "evidence_type": "ECO:0000250", + "supporting_objects": ["Ensembl:ENS1"], + "publications": ["PMID:1234"], } ], ), - # -------------------------------------------------- - # Catalytic activity with evidence - # -------------------------------------------------- + # comment catalytic activity (reaction) with evidence key ( - lambda: ( - lambda entry: ( - lambda comment: ( - lambda reaction: ( - ET.SubElement( - reaction, - f"{{{NS_URI}}}dbReference", - {"type": "Rhea", "id": "RHEA:12345"}, - ), - entry, - )[1] - )( - ET.SubElement( - comment, - f"{{{NS_URI}}}reaction", - {"evidence": "E1"}, - ) - ) - )( - ET.SubElement( - entry, - f"{{{NS_URI}}}comment", - {"type": "catalytic activity"}, - ) - ) - )(ET.Element(f"{{{NS_URI}}}entry")), - "cdm_2", + """ + + + + + + """, + "CDM:3", { - "E1": { - "evidence_type": "ECO:0000269", - 
"publications": ["PMID:12345"], + "E2": { + "evidence_type": "ECO:0000313", + "publications": ["PMID:2222"], } }, [ { - "subject": "cdm_2", + "subject": "CDM:3", "predicate": "catalyzes", - "object": "Rhea:RHEA:12345", - "evidence_type": "ECO:0000269", - "publications": ["PMID:12345"], + "object": "Rhea:12345", + "evidence_type": "ECO:0000313", + "publications": ["PMID:2222"], } ], ), - # -------------------------------------------------- - # Cofactor association - # -------------------------------------------------- + # Comment cofactor without evidence ( - lambda: ( - lambda entry: ( - lambda comment: ( - ET.SubElement( - ET.SubElement( - comment, - f"{{{NS_URI}}}cofactor", - ), - f"{{{NS_URI}}}dbReference", - {"type": "ChEBI", "id": "CHEBI:15377"}, - ), - entry, - )[1] - )( - ET.SubElement( - entry, - f"{{{NS_URI}}}comment", - {"type": "cofactor"}, - ) - ) - )(ET.Element(f"{{{NS_URI}}}entry")), - "cdm_3", + """ + + + + + + """, + "CDM:4", {}, [ { - "subject": "cdm_3", + "subject": "CDM:4", "predicate": "requires_cofactor", "object": "ChEBI:CHEBI:15377", } ], ), + # Several relevant relationship(with organism and dbReference) + ( + """ + + + + + """, + "CDM:5", + {}, + [ + {"subject": "CDM:5", "object": "NCBITaxon:562"}, + {"subject": "CDM:5", "object": "RefSeq:NP_414543"}, + ], + ), + # if it is empty entry, return to [] + ("""""", "CDM:6", {}, []), ], ) -def test_parse_associations_parametrized(build_xml, cdm_id, evidence_map, expected): - entry = build_xml() - +def test_parse_associations( + xml_str: str, cdm_id: str, evidence_map: dict[str, Any], expected: list[dict[str, str]] +) -> None: + entry = ET.fromstring(xml_str) result = parse_associations(entry, cdm_id, evidence_map) - - assert isinstance(result, list) assert result == expected +""" + + xml_str: Uniprot entry include , , + Refer: PubMed, DOI, GeneBank, DDBJ, EMBL + + Output: List of publication identifier + + Arg: + Extract publication of references + Recognize and format database types ( with prefixing “PMID:”, “DOI:”) + Handle entries with multiple or mixed publication types + Return an empty list if no publication data. 
+ +""" + + +## parse_publications function test ## @pytest.mark.parametrize( - "build_xml, cdm_id, expected", + ("xml_str", "expected"), [ - # -------------------------------------------------- - # No dbReference - # -------------------------------------------------- + # Single PubMed ( - lambda: ET.Element(f"{{{NS_URI}}}entry"), - "cdm_1", - [], + """ + + + + + + """, + ["PMID:12345"], ), - # -------------------------------------------------- - # dbReference with CURIE id - # -------------------------------------------------- + # Multiple types include (PubMed, DOI, GenBank) ( - lambda: ( - lambda entry: ( - ET.SubElement( - entry, - f"{{{NS_URI}}}dbReference", - {"type": "GO", "id": "GO:0008150"}, - ), - entry, - )[1] - )(ET.Element(f"{{{NS_URI}}}entry")), - "cdm_2", - [ - { - "entity_id": "cdm_2", - "xref_type": "GO", - "xref_value": "GO:0008150", - "xref": "GO:0008150", - } - ], + """ + + + + + + + + """, + ["PMID:55555", "DOI:10.1000/j.jmb.2020.01.001"], ), - # -------------------------------------------------- - # dbReference without CURIE (prefix) - # -------------------------------------------------- + # Multiple references ( - lambda: ( - lambda entry: ( - ET.SubElement( - entry, - f"{{{NS_URI}}}dbReference", - {"type": "CDD", "id": "cd04253"}, - ), - entry, - )[1] - )(ET.Element(f"{{{NS_URI}}}entry")), - "cdm_3", - [ - { - "entity_id": "cdm_3", - "xref_type": "CDD", - "xref_value": "cd04253", - "xref": "CDD:cd04253", - } - ], - ), - # -------------------------------------------------- - # Mixed dbReferences - # -------------------------------------------------- - ( - lambda: ( - lambda entry: ( - ET.SubElement( - entry, - f"{{{NS_URI}}}dbReference", - {"type": "GO", "id": "GO:0003674"}, - ), - ET.SubElement( - entry, - f"{{{NS_URI}}}dbReference", - {"type": "PDB", "id": "1ABC"}, - ), - entry, - )[2] - )(ET.Element(f"{{{NS_URI}}}entry")), - "cdm_4", - [ - { - "entity_id": "cdm_4", - "xref_type": "GO", - "xref_value": "GO:0003674", - "xref": "GO:0003674", - }, - { - "entity_id": "cdm_4", - "xref_type": "PDB", - "xref_value": "1ABC", - "xref": "PDB:1ABC", - }, - ], + """ + + + + + + + + + + + """, + ["DOI:10.1000/jmb.123456", "PMID:98765"], ), - # -------------------------------------------------- - # Missing type or id - # -------------------------------------------------- + # dbReference: DDBJ and EMBL ( - lambda: ( - lambda entry: ( - ET.SubElement( - entry, - f"{{{NS_URI}}}dbReference", - {"type": "GO"}, # missing id - ), - ET.SubElement( - entry, - f"{{{NS_URI}}}dbReference", - {"id": "123"}, # missing type - ), - entry, - )[2] - )(ET.Element(f"{{{NS_URI}}}entry")), - "cdm_5", + """ + + + + + + + """, [], ), + # no publication + ("""""", []), ], ) -def test_parse_cross_references_parametrized(build_xml, cdm_id, expected): - entry = build_xml() - - result = parse_cross_references(entry, cdm_id) - - assert isinstance(result, list) +def test_parse_publications(xml_str: str, expected: list[str]) -> None: + entry = ET.fromstring(xml_str) + result = parse_publications(entry) assert result == expected +## parse_uniprot_entry function test ## @pytest.mark.parametrize( - "build_xml, cdm_id, expected", + ("xml_str", "datasource_name", "prev_created"), [ - # -------------------------------------------------- - # No accession - # -------------------------------------------------- - ( - lambda: ET.Element(f"{{{NS_URI}}}entry"), - "cdm_1", - [], - ), - # -------------------------------------------------- - # Single accession - # -------------------------------------------------- - ( - 
lambda: ( - lambda entry: ( - ET.SubElement(entry, f"{{{NS_URI}}}accession").__setattr__("text", "P12345"), - entry, - )[1] - )(ET.Element(f"{{{NS_URI}}}entry")), - "cdm_2", - [ - { - "entity_id": "cdm_2", - "identifier": "UniProt:P12345", - "source": "UniProt", - "description": "UniProt accession", - } - ], - ), - # -------------------------------------------------- - # Multiple accessions - # -------------------------------------------------- - ( - lambda: ( - lambda entry: ( - ET.SubElement(entry, f"{{{NS_URI}}}accession").__setattr__("text", "Q11111"), - ET.SubElement(entry, f"{{{NS_URI}}}accession").__setattr__("text", "Q22222"), - entry, - )[2] - )(ET.Element(f"{{{NS_URI}}}entry")), - "cdm_3", - [ - { - "entity_id": "cdm_3", - "identifier": "UniProt:Q11111", - "source": "UniProt", - "description": "UniProt accession", - }, - { - "entity_id": "cdm_3", - "identifier": "UniProt:Q22222", - "source": "UniProt", - "description": "UniProt accession", - }, - ], - ), - # -------------------------------------------------- - # parse_identifiers_generic already sets source/description → setdefault - # -------------------------------------------------- ( - lambda: ( - lambda entry: ( - ET.SubElement(entry, f"{{{NS_URI}}}accession").__setattr__("text", "A0A000"), - entry, - )[1] - )(ET.Element(f"{{{NS_URI}}}entry")), - "cdm_4", - [ - { - "entity_id": "cdm_4", - "identifier": "UniProt:A0A000", - "source": "UniProt", # remains - "description": "UniProt accession", # remains - } - ], + """ + + P12345 + ProteinX + + + ProteinX Full Name + + + + + + + + + + + + """, + "UniProt import", + None, ), ], ) -def test_parse_identifiers_parametrized(build_xml, cdm_id, expected): - entry = build_xml() - - result = parse_identifiers(entry, cdm_id) - - assert isinstance(result, list) - assert result == expected +def test_parse_uniprot_entry(xml_str: str, datasource_name: str, prev_created: None) -> None: + entry = ET.fromstring(xml_str) + cdm_id = generate_cdm_id() + + current_timestamp = "2024-07-17T13:00:00Z" + + record = parse_uniprot_entry(entry, cdm_id, current_timestamp, datasource_name, prev_created) + + entity = record["entity"] + assert entity["entity_type"] == "protein" + assert entity["data_source"] == datasource_name + assert entity["version"] == "3" + assert entity["uniprot_created"] == "2020-01-01" + assert entity["uniprot_modified"] == "2021-01-01" + assert entity["entity_id"].startswith("CDM:") + + # identifiers/names/associations/publications + assert isinstance(record["identifiers"], list) + assert isinstance(record["names"], list) + assert isinstance(record["associations"], list) + assert isinstance(record["publications"], list) diff --git a/tests/parsers/test_uniref.py b/tests/parsers/test_uniref.py index 9ca5360..630949c 100644 --- a/tests/parsers/test_uniref.py +++ b/tests/parsers/test_uniref.py @@ -1,318 +1,284 @@ -import os -import sys +"""Tests for the UniRef importer.""" -sys.path.insert(0, os.path.abspath(os.path.dirname(__file__))) - -import gzip -import tempfile +import datetime as dt +import textwrap import xml.etree.ElementTree as ET -from datetime import datetime, timezone + import pytest from cdm_data_loader_utils.parsers.uniref import ( + add_cluster_members, cdm_entity_id, - get_timestamps, extract_cluster, - get_accession_and_seed, - add_cluster_members, extract_cross_refs, - parse_uniref_xml, + get_accession_and_seed, + get_timestamps, ) -NS = {"ns": "http://uniprot.org/uniref"} - -# --------------------------------------------------------- -# cdm_entity_id -# 
--------------------------------------------------------- @pytest.mark.parametrize( - "value, should_raise", - [ - ("A0A009HJL9", False), - ("UniRef100_A0A009HJL9", False), - ("", True), - (None, True), - ], + ("accession", "expected_prefix"), + [("A0B0123456", "CDM:"), ("P01234", "CDM:"), ("", None), (None, None)], ) -def test_cdm_entity_id(value, should_raise): - if should_raise: - with pytest.raises(ValueError): - cdm_entity_id(value) +def test_cdm_entity_id(accession: str | None, expected_prefix: str | None) -> None: + """Ensure that CDM entities start with the appropriate prefix.""" + result = cdm_entity_id(accession) + if expected_prefix is None: + assert result is None else: - out = cdm_entity_id(value) - assert isinstance(out, str) - assert out.startswith("CDM:") + assert result.startswith(expected_prefix) -# --------------------------------------------------------- -# get_timestamps -# --------------------------------------------------------- @pytest.mark.parametrize( - "uniref_id, existing, now, expect_created_same_as_updated", + ("xml_str", "expected_name"), [ ( - "UniRef100_A", - {"UniRef100_A": "2024-01-01T00:00:00+00:00"}, - datetime(2025, 1, 1, 0, 0, 0, tzinfo=timezone.utc), - False, + "" + "TestName", + "TestName", ), ( - "UniRef100_B", - {}, - datetime(2025, 1, 1, 0, 0, 0, tzinfo=timezone.utc), - True, - ), - ( - "UniRef100_C", - {}, - None, - True, - ), - ], -) -def test_get_timestamps(uniref_id, existing, now, expect_created_same_as_updated): - updated, created = get_timestamps(uniref_id, existing, now) - - assert isinstance(updated, str) - assert isinstance(created, str) - assert updated.endswith("+00:00") - - if expect_created_same_as_updated: - assert updated == created - else: - assert updated != created - - -@pytest.mark.parametrize("bad_id", ["", None]) -def test_get_timestamps_rejects_empty_uniref_id(bad_id): - with pytest.raises(ValueError): - get_timestamps(bad_id, {}, None) - - -# --------------------------------------------------------- -# add_cluster_members -# --------------------------------------------------------- -@pytest.mark.parametrize( - "repr_xml, member_xmls, expected_count", - [ - ( - """ - - - - - """, - [ - """ - - - - """, - """ - - - - """, - ], - 3, - ), - ( - None, - [ - """ - - - - """, - ], - 1, + "", + "UNKNOWN", ), - (None, [], 0), ], ) -def test_add_cluster_members(repr_xml, member_xmls, expected_count): - cluster_id = "CDM_CLUSTER" - repr_db = ET.fromstring(repr_xml) if repr_xml else None - - entry = ET.Element("{http://uniprot.org/uniref}entry") - for m in member_xmls: - mem = ET.SubElement(entry, "{http://uniprot.org/uniref}member") - mem.append(ET.fromstring(m)) - - rows = [] - add_cluster_members(cluster_id, repr_db, entry, rows, NS) - - assert len(rows) == expected_count - for r in rows: - assert r[0] == cluster_id - assert r[1].startswith("CDM:") - assert r[4] == "1.0" +def test_extract_cluster(xml_str: str, expected_name: str) -> None: + """Test cluster extraction from XML.""" + ns = {"ns": "http://uniprot.org/uniref"} + elem = ET.fromstring(xml_str) + cluster_id, name = extract_cluster(elem, ns) + assert cluster_id.startswith("CDM:") + assert isinstance(cluster_id, str) + assert name == expected_name -# --------------------------------------------------------- -# extract_cluster -# --------------------------------------------------------- @pytest.mark.parametrize( - "xml_str, uniref_id, expected_name", + ("uniref_id", "existing_created", "now", "expected"), [ + # Has existing_created ( - "Test Cluster Name", "UniRef100_A", - 
"Test Cluster Name", + {"UniRef100_A": "2024-01-01T00:00:00"}, + dt.datetime(2025, 1, 1, 0, 0, 0, tzinfo=dt.UTC), + ("2025-01-01T00:00:00", "2024-01-01T00:00:00"), ), + # There is no existing_created ( - "", "UniRef100_B", - "UNKNOWN", + {"UniRef100_A": "2024-01-01T00:00:00"}, + dt.datetime(2025, 1, 1, 0, 0, 0, tzinfo=dt.UTC), + ("2025-01-01T00:00:00", "2025-01-01T00:00:00"), + ), + # There is no existing_created,also not provide "now" + ( + "UniRef100_C", + {}, + None, # The system automatically use the current time + None, # Only assert that the return is a string and they are equal ), ], ) -def test_extract_cluster(xml_str, uniref_id, expected_name): - elem = ET.fromstring(xml_str) - - cluster_id, name = extract_cluster(elem, NS, uniref_id) - - # ---- cluster_id checks ---- - assert isinstance(cluster_id, str) - assert cluster_id.startswith("CDM:") - - # ---- name checks ---- - assert name == expected_name +def test_get_timestamps(uniref_id: str, existing_created: str, now: dt.datetime, expected: tuple[str] | None) -> None: + """Test timestamps.""" + result = get_timestamps(uniref_id, existing_created, now) + if expected is not None: + assert result == expected + else: + formatted_now, created_time = result + assert formatted_now == created_time + assert isinstance(formatted_now, str) + assert len(formatted_now) == 19 # "YYYY-MM-DDTHH:MM:SS" ---> 19 bites @pytest.mark.parametrize( - "xml_str, expected_acc, expected_is_seed", + ("xml_str", "expected_acc", "expected_is_seed"), [ - # accession + isSeed=true + # Have accession and isSeed ( """ - - - - - """, + + + + + """, "A0A009HJL9", True, ), - # accession only + # Only accession, no isSeed ( """ - - - - """, + + + + """, "A0A241V597", False, ), - # no accession + # No accession, only id ( """ - - - - """, - None, - False, - ), - # dbref is None - ( - None, - None, + + """, + "ID_ONLY", False, ), + # None + (None, None, False), ], ) -def test_get_accession_and_seed(xml_str, expected_acc, expected_is_seed): +def test_get_accession_and_seed(xml_str: str | None, expected_acc: str | None, expected_is_seed: bool) -> None: + """Test parsing of UniRef entries for accession and seed status.""" + ns = {"ns": "http://uniprot.org/uniref"} dbref = ET.fromstring(xml_str) if xml_str else None - - acc, is_seed = get_accession_and_seed(dbref, NS) - + acc, is_seed = get_accession_and_seed(dbref, ns) assert acc == expected_acc assert is_seed == expected_is_seed -# --------------------------------------------------------- -# extract_cross_refs -# --------------------------------------------------------- +def make_entry_with_members(member_xmls: list[str], ns_uri: str = "http://uniprot.org/uniref") -> ET.Element: + """ + Receives a list of xml strings from dbReference, generates an element with child nodes. 
+ """ + entry_elem = ET.Element(f"{{{ns_uri}}}entry") + for dbref_xml in member_xmls: + dbref_elem = ET.fromstring(dbref_xml) + member_elem = ET.SubElement(entry_elem, f"{{{ns_uri}}}member") + member_elem.append(dbref_elem) + return entry_elem + + @pytest.mark.parametrize( - "props, expected", + ("repr_xml", "member_xmls", "expected"), [ - ( + pytest.param( + # representative member, two members + textwrap.dedent(""" + + + + + """), + [ + textwrap.dedent(""" + + + + """), + textwrap.dedent(""" + + + + + """), + ], [ - ("UniProtKB accession", "A0A1"), - ("UniRef90 ID", "UniRef90_X"), - ("UniParc ID", "UPI0001"), + ("CLUSTER_X", "CDM:", "true", "true", "1.0"), + ("CLUSTER_X", "CDM:", "false", "false", "1.0"), + ("CLUSTER_X", "CDM:", "false", "true", "1.0"), ], - { - ("UniRef90 ID", "UniRef90_X"), - ("UniParc ID", "UPI0001"), - }, + id="with-representative-and-members", ), - ( + pytest.param( + # Only memebers, no representative member + None, [ - ("UniProtKB accession", "A0A2"), + textwrap.dedent(""" + + + + """) ], - set(), + [("CLUSTER_X", "CDM:", "false", "false", "1.0")], + id="members-only", + ), + pytest.param( + # No members, no representative member + None, + [], + [], + id="no-members", ), ], ) -def test_extract_cross_refs(props, expected): - dbref = ET.Element("{http://uniprot.org/uniref}dbReference", id="UniProtKB:A0A1") - - for k, v in props: - ET.SubElement( - dbref, - "{http://uniprot.org/uniref}property", - type=k, - value=v, - ) +def test_add_cluster_members(repr_xml: str | None, member_xmls: list[str], expected: list[tuple[str, ...]]) -> None: + """Test add_cluster_members with various representative/member combinations.""" + ns = {"ns": "http://uniprot.org/uniref"} + cluster_id = "CLUSTER_X" - rows = [] - extract_cross_refs(dbref, rows, NS) - - got = {(t, v) for _, t, v in rows} - assert got == expected + # Structure (representative members) dbReference if it exists + repr_db = ET.fromstring(repr_xml) if repr_xml else None - for entity_id, _, _ in rows: - assert entity_id is not None - assert isinstance(entity_id, str) + # Structure nodes, and add + elem = make_entry_with_members(member_xmls) + # Calling the function under test + cluster_member_data = [] + add_cluster_members(cluster_id, repr_db, elem, cluster_member_data, ns) -# --------------------------------------------------------- -# parse_uniref_xml -# --------------------------------------------------------- -@pytest.mark.parametrize("batch_size", [1, 2]) -def test_parse_uniref_xml_batch(batch_size): - xml = """ - - - A - - - - - - + assert len(cluster_member_data) == len(expected) + for i, (clu_id, cdm_prefix, is_repr, is_seed, score) in enumerate(expected): + out = cluster_member_data[i] + assert out[0] == clu_id, f"Wrong cluster_id at idx {i}: {out[0]}" + assert out[1].startswith(cdm_prefix), f"Wrong entity_id at idx {i}: {out[1]}" + assert out[2] == is_repr, f"Wrong is_representative at idx {i}: {out[2]}" + assert out[3] == is_seed, f"Wrong is_seed at idx {i}: {out[3]}" + assert out[4] == score, f"Wrong score at idx {i}: {out[4]}" - - B - - - - - - - - """.strip() - with tempfile.TemporaryDirectory() as tmpdir: - gz_path = f"{tmpdir}/uniref_test.xml.gz" - with gzip.open(gz_path, "wb") as gz: - gz.write(xml.encode("utf-8")) +XREF_TYPES = ["UniRef90 ID", "UniRef50 ID", "UniParc ID"] - result = parse_uniref_xml(gz_path, batch_size, {}) - assert len(result["cluster_data"]) == batch_size - assert len(result["entity_data"]) == batch_size - assert len(result["cluster_member_data"]) == batch_size - assert 
len(result["cross_reference_data"]) in (0, batch_size) +@pytest.mark.parametrize( + ("dbref_props", "expected_xrefs"), + [ + ( + # all cross-ref fields present + [ + ("UniRef90 ID", "UniRef90_N8Q6C0"), + ("UniRef50 ID", "UniRef50_A0A7Z7LP76"), + ("UniParc ID", "UPI00044F6C4F"), + ("protein name", "foo"), + ], + [ + ("UniRef90 ID", "UniRef90_N8Q6C0"), + ("UniRef50 ID", "UniRef50_A0A7Z7LP76"), + ("UniParc ID", "UPI00044F6C4F"), + ], + ), + ( + # partial cross-ref + [ + ("UniRef90 ID", "UniRef90_ABC"), + ("protein name", "bar"), + ], + [ + ("UniRef90 ID", "UniRef90_ABC"), + ], + ), + ( + # No cross-ref + [ + ("protein name", "baz"), + ], + [], + ), + ], +) +def test_extract_cross_refs_param(dbref_props: list[tuple[str, str]], expected_xrefs: list[tuple[str, str]]) -> None: + """ + Test that extract_cross_refs correctly extracts all UniRef cross-reference fields. + """ + dbref = ET.Element("{http://uniprot.org/uniref}dbReference", type="UniProtKB ID", id="TEST_ID") + + for t, v in dbref_props: + ET.SubElement(dbref, "{http://uniprot.org/uniref}property", type=t, value=v) + + ns = {"ns": "http://uniprot.org/uniref"} + cross_reference_data = [] + extract_cross_refs(dbref, cross_reference_data, ns) + + entity_id = cdm_entity_id("TEST_ID") + expected = {(entity_id, typ, val) for typ, val in expected_xrefs} + got = set(cross_reference_data) + assert got == expected
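Taken together, the UniRef tests above assume two small contracts: cdm_entity_id is deterministic (test_extract_cross_refs_param re-derives the same id when building its expected set) and returns None for empty input, and extract_cross_refs keeps only the three UniRef cross-reference property types. A hedged sketch of both, for illustration only; the real implementations live in cdm_data_loader_utils.parsers.uniref and may differ in detail:

import uuid
import xml.etree.ElementTree as ET

CROSS_REF_PROPS = {"UniRef90 ID", "UniRef50 ID", "UniParc ID"}

def cdm_entity_id_sketch(accession: str | None) -> str | None:
    # Empty or missing accessions yield None; otherwise mint a CDM-prefixed id.
    # uuid5 is one way to make repeated calls agree for the same accession.
    if not accession:
        return None
    return f"CDM:{uuid.uuid5(uuid.NAMESPACE_URL, accession)}"

def extract_cross_refs_sketch(dbref: ET.Element, rows: list[tuple[str, str, str]], ns: dict[str, str]) -> None:
    # Append one (entity_id, property_type, value) tuple per recognised cross-ref property.
    entity_id = cdm_entity_id_sketch(dbref.get("id"))
    for prop in dbref.findall("ns:property", ns):
        if prop.get("type") in CROSS_REF_PROPS:
            rows.append((entity_id, prop.get("type"), prop.get("value")))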