From 949429de955f377fdb3b9cebd729eb3a6ced6388 Mon Sep 17 00:00:00 2001 From: Susanna Kiwala Date: Mon, 3 Mar 2025 09:31:39 -0600 Subject: [PATCH 1/3] Allow vep annotation reporter to provide a list of preferred transcripts --- .../output.preferred_transcript.list.tsv | 77 +++++++++++++++++++ .../output.preferred_transcript.tsv | 77 +++++++++++++++++++ .../preferred_transcripts.list.tsv | 9 +++ .../preferred_transcripts.pos.tsv | 77 +++++++++++++++++++ tests/test_vep_annotation_reporter.py | 30 ++++++++ vatools/vep_annotation_reporter.py | 77 ++++++++++++++++--- 6 files changed, 335 insertions(+), 12 deletions(-) create mode 100644 tests/test_data/vep_annotation_reporter/output.preferred_transcript.list.tsv create mode 100644 tests/test_data/vep_annotation_reporter/output.preferred_transcript.tsv create mode 100644 tests/test_data/vep_annotation_reporter/preferred_transcripts.list.tsv create mode 100644 tests/test_data/vep_annotation_reporter/preferred_transcripts.pos.tsv diff --git a/tests/test_data/vep_annotation_reporter/output.preferred_transcript.list.tsv b/tests/test_data/vep_annotation_reporter/output.preferred_transcript.list.tsv new file mode 100644 index 0000000..8b87dc9 --- /dev/null +++ b/tests/test_data/vep_annotation_reporter/output.preferred_transcript.list.tsv @@ -0,0 +1,77 @@ +CHROM POS REF ALT transcript_id Consequence Gene Feature +chr17 7663110 G A ENST00000413465.6 intron_variant ENSG00000141510 ENST00000413465.6 +chr17 7663126 GTATATATATAATATATATAATATAATATAA G ENST00000413465.6 intron_variant ENSG00000141510 ENST00000413465.6 +chr17 7663661 A G ENST00000269305.8 downstream_gene_variant,intron_variant ENSG00000141510,ENSG00000141510 ENST00000269305.8,ENST00000413465.6 +chr17 7666871 T C ENST00000269305.8 downstream_gene_variant,intron_variant ENSG00000141510,ENSG00000141510 ENST00000269305.8,ENST00000413465.6 +chr17 7668134 G A ENST00000269305.8 downstream_gene_variant,intron_variant ENSG00000141510,ENSG00000141510 ENST00000269305.8,ENST00000413465.6 +chr17 7668169 A AGCCGTG ENST00000269305.8 downstream_gene_variant,intron_variant ENSG00000141510,ENSG00000141510 ENST00000269305.8,ENST00000413465.6 +chr17 7674797 T C ENST00000269305.8 intron_variant,intron_variant ENSG00000141510,ENSG00000141510 ENST00000269305.8,ENST00000413465.6 +chr17 7675088 C T ENST00000269305.8 missense_variant,missense_variant ENSG00000141510,ENSG00000141510 ENST00000269305.8,ENST00000413465.6 +chr17 7675327 C T ENST00000269305.8 intron_variant,intron_variant ENSG00000141510,ENSG00000141510 ENST00000269305.8,ENST00000413465.6 +chr17 7675393 CTTT C ENST00000269305.8 intron_variant,intron_variant ENSG00000141510,ENSG00000141510 ENST00000269305.8,ENST00000413465.6 +chr17 7675519 A G ENST00000269305.8 intron_variant,intron_variant ENSG00000141510,ENSG00000141510 ENST00000269305.8,ENST00000413465.6 +chr17 7676154 G C ENST00000269305.8 missense_variant,missense_variant ENSG00000141510,ENSG00000141510 ENST00000269305.8,ENST00000413465.6 +chr17 7676325 CCCCCAGCCCTCCAGGT C ENST00000269305.8 intron_variant,intron_variant ENSG00000141510,ENSG00000141510 ENST00000269305.8,ENST00000413465.6 +chr17 7676483 G C ENST00000269305.8 intron_variant,intron_variant ENSG00000141510,ENSG00000141510 ENST00000269305.8,ENST00000413465.6 +chr17 7684369 A G ENST00000269305.8 intron_variant ENSG00000141510 ENST00000269305.8 +chr17 43047896 T C ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43049347 T C ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43057078 G A ENST00000471181.6 stop_gained ENSG00000012048 ENST00000471181.6 +chr17 43058379 A C ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43059469 C CACAACA ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43059636 A G ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43063808 C T ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43064004 G A ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43066555 T C ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43067763 T C ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43067787 T C ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43071077 T C ENST00000471181.6 missense_variant ENSG00000012048 ENST00000471181.6 +chr17 43074086 C T ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43074584 G C ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43074658 A T ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43077743 AT A ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43077746 T A ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43077756 T C ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43077760 G GT ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43077795 A G ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43077840 G A ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43077891 T A ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43079204 A C ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43079499 C T ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43082453 A G ENST00000471181.6 synonymous_variant ENSG00000012048 ENST00000471181.6 +chr17 43087455 G A ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43087474 T C ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43091173 T G ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43091983 T C ENST00000471181.6 missense_variant ENSG00000012048 ENST00000471181.6 +chr17 43092418 T C ENST00000471181.6 missense_variant ENSG00000012048 ENST00000471181.6 +chr17 43092919 G A ENST00000471181.6 missense_variant ENSG00000012048 ENST00000471181.6 +chr17 43093220 A G ENST00000471181.6 synonymous_variant ENSG00000012048 ENST00000471181.6 +chr17 43093449 G A ENST00000471181.6 synonymous_variant ENSG00000012048 ENST00000471181.6 +chr17 43093454 C T ENST00000471181.6 missense_variant ENSG00000012048 ENST00000471181.6 +chr17 43097077 A G ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43097346 TA T ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43099629 T A ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43104083 AAAG A ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43121331 T C ENST00000471181.6 upstream_gene_variant,upstream_gene_variant,intron_variant ENSG00000198496,ENSG00000198496,ENSG00000012048 ENST00000356906.7,ENST00000467245.5,ENST00000471181.6 +chr17 43121362 G C ENST00000471181.6 upstream_gene_variant,upstream_gene_variant,intron_variant ENSG00000198496,ENSG00000198496,ENSG00000012048 ENST00000356906.7,ENST00000467245.5,ENST00000471181.6 +chr17 43127281 A G ENST00000471181.6 intron_variant&non_coding_transcript_variant,intron_variant&non_coding_transcript_variant,upstream_gene_variant,upstream_gene_variant ENSG00000198496,ENSG00000198496,ENSG00000012048,ENSG00000198496 ENST00000356906.7,ENST00000467245.5,ENST00000471181.6,ENST00000587322.1 +chr17 43129737 G A ENST00000471181.6 intron_variant&non_coding_transcript_variant,intron_variant&non_coding_transcript_variant,upstream_gene_variant,upstream_gene_variant ENSG00000198496,ENSG00000198496,ENSG00000012048,ENSG00000198496 ENST00000356906.7,ENST00000467245.5,ENST00000471181.6,ENST00000587322.1 +chr17 43131360 C T ENST00000587322.1 intron_variant&non_coding_transcript_variant,intron_variant&non_coding_transcript_variant,downstream_gene_variant ENSG00000198496,ENSG00000198496,ENSG00000198496 ENST00000356906.7,ENST00000467245.5,ENST00000587322.1 +chr17 43131380 AAAT A ENST00000587322.1 intron_variant&non_coding_transcript_variant,intron_variant&non_coding_transcript_variant,downstream_gene_variant ENSG00000198496,ENSG00000198496,ENSG00000198496 ENST00000356906.7,ENST00000467245.5,ENST00000587322.1 +chr17 43135863 A G ENST00000587322.1 intron_variant&non_coding_transcript_variant,intron_variant&non_coding_transcript_variant,downstream_gene_variant ENSG00000198496,ENSG00000198496,ENSG00000198496 ENST00000356906.7,ENST00000467245.5,ENST00000587322.1 +chr17 43138596 G T ENST00000356906.7 intron_variant&non_coding_transcript_variant,intron_variant&non_coding_transcript_variant ENSG00000198496,ENSG00000198496 ENST00000356906.7,ENST00000467245.5 +chr17 43138657 G C ENST00000356906.7 splice_region_variant&non_coding_transcript_exon_variant,splice_region_variant&non_coding_transcript_exon_variant ENSG00000198496,ENSG00000198496 ENST00000356906.7,ENST00000467245.5 +chr17 43140410 C G ENST00000464237.2 downstream_gene_variant,upstream_gene_variant,intron_variant&non_coding_transcript_variant ENSG00000198496,ENSG00000267681,ENSG00000198496 ENST00000356906.7,ENST00000464237.2,ENST00000467245.5 +chr17 43140722 G A ENST00000464237.2 downstream_gene_variant,upstream_gene_variant,intron_variant&non_coding_transcript_variant ENSG00000198496,ENSG00000267681,ENSG00000198496 ENST00000356906.7,ENST00000464237.2,ENST00000467245.5 +chr17 43142914 TTG T ENST00000464237.2 downstream_gene_variant,upstream_gene_variant,intron_variant&non_coding_transcript_variant ENSG00000198496,ENSG00000267681,ENSG00000198496 ENST00000356906.7,ENST00000464237.2,ENST00000467245.5 +chr17 43142940 G A ENST00000464237.2 downstream_gene_variant,upstream_gene_variant,intron_variant&non_coding_transcript_variant ENSG00000198496,ENSG00000267681,ENSG00000198496 ENST00000356906.7,ENST00000464237.2,ENST00000467245.5 +chr17 43142964 GTGTATATATA G ENST00000464237.2 downstream_gene_variant,upstream_gene_variant,intron_variant&non_coding_transcript_variant ENSG00000198496,ENSG00000267681,ENSG00000198496 ENST00000356906.7,ENST00000464237.2,ENST00000467245.5 +chr17 43145635 A C ENST00000464237.2 downstream_gene_variant,intron_variant&non_coding_transcript_variant ENSG00000267681,ENSG00000198496 ENST00000464237.2,ENST00000467245.5 +chr17 43145975 G T ENST00000464237.2 downstream_gene_variant,intron_variant&non_coding_transcript_variant ENSG00000267681,ENSG00000198496 ENST00000464237.2,ENST00000467245.5 +chr17 43146482 G A ENST00000464237.2 downstream_gene_variant,intron_variant&non_coding_transcript_variant ENSG00000267681,ENSG00000198496 ENST00000464237.2,ENST00000467245.5 +chr17 43147572 C T ENST00000464237.2 downstream_gene_variant,intron_variant&non_coding_transcript_variant ENSG00000267681,ENSG00000198496 ENST00000464237.2,ENST00000467245.5 +chr17 43147590 A G ENST00000464237.2 downstream_gene_variant,intron_variant&non_coding_transcript_variant ENSG00000267681,ENSG00000198496 ENST00000464237.2,ENST00000467245.5 +chr17 43147814 C CCT ENST00000464237.2 downstream_gene_variant,intron_variant&non_coding_transcript_variant ENSG00000267681,ENSG00000198496 ENST00000464237.2,ENST00000467245.5 +chr17 43147911 G C ENST00000464237.2 downstream_gene_variant,intron_variant&non_coding_transcript_variant ENSG00000267681,ENSG00000198496 ENST00000464237.2,ENST00000467245.5 +chr17 43152446 T C ENST00000467245.5 intron_variant&non_coding_transcript_variant ENSG00000198496 ENST00000467245.5 +chr17 43169893 C T ENST00000341165.10 upstream_gene_variant ENSG00000188554 ENST00000341165.10 diff --git a/tests/test_data/vep_annotation_reporter/output.preferred_transcript.tsv b/tests/test_data/vep_annotation_reporter/output.preferred_transcript.tsv new file mode 100644 index 0000000..9831997 --- /dev/null +++ b/tests/test_data/vep_annotation_reporter/output.preferred_transcript.tsv @@ -0,0 +1,77 @@ +CHROM POS REF ALT transcript_id Consequence Gene Feature +chr17 7663110 G A ENST00000413465.6 intron_variant ENSG00000141510 ENST00000413465.6 +chr17 7663126 GTATATATATAATATATATAATATAATATAA G ENST00000413465.6 intron_variant ENSG00000141510 ENST00000413465.6 +chr17 7663661 A G ENST00000269305.8 downstream_gene_variant ENSG00000141510 ENST00000269305.8 +chr17 7666871 T C ENST00000269305.8 downstream_gene_variant ENSG00000141510 ENST00000269305.8 +chr17 7668134 G A ENST00000269305.8 downstream_gene_variant ENSG00000141510 ENST00000269305.8 +chr17 7668169 A AGCCGTG ENST00000269305.8 downstream_gene_variant ENSG00000141510 ENST00000269305.8 +chr17 7674797 T C ENST00000269305.8 intron_variant ENSG00000141510 ENST00000269305.8 +chr17 7675088 C T ENST00000269305.8 missense_variant ENSG00000141510 ENST00000269305.8 +chr17 7675327 C T ENST00000269305.8 intron_variant ENSG00000141510 ENST00000269305.8 +chr17 7675393 CTTT C ENST00000269305.8 intron_variant ENSG00000141510 ENST00000269305.8 +chr17 7675519 A G ENST00000269305.8 intron_variant ENSG00000141510 ENST00000269305.8 +chr17 7676154 G C ENST00000269305.8 missense_variant ENSG00000141510 ENST00000269305.8 +chr17 7676325 CCCCCAGCCCTCCAGGT C ENST00000269305.8 intron_variant ENSG00000141510 ENST00000269305.8 +chr17 7676483 G C ENST00000269305.8 intron_variant ENSG00000141510 ENST00000269305.8 +chr17 7684369 A G ENST00000269305.8 intron_variant ENSG00000141510 ENST00000269305.8 +chr17 43047896 T C ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43049347 T C ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43057078 G A ENST00000471181.6 stop_gained ENSG00000012048 ENST00000471181.6 +chr17 43058379 A C ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43059469 C CACAACA ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43059636 A G ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43063808 C T ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43064004 G A ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43066555 T C ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43067763 T C ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43067787 T C ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43071077 T C ENST00000471181.6 missense_variant ENSG00000012048 ENST00000471181.6 +chr17 43074086 C T ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43074584 G C ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43074658 A T ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43077743 AT A ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43077746 T A ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43077756 T C ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43077760 G GT ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43077795 A G ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43077840 G A ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43077891 T A ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43079204 A C ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43079499 C T ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43082453 A G ENST00000471181.6 synonymous_variant ENSG00000012048 ENST00000471181.6 +chr17 43087455 G A ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43087474 T C ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43091173 T G ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43091983 T C ENST00000471181.6 missense_variant ENSG00000012048 ENST00000471181.6 +chr17 43092418 T C ENST00000471181.6 missense_variant ENSG00000012048 ENST00000471181.6 +chr17 43092919 G A ENST00000471181.6 missense_variant ENSG00000012048 ENST00000471181.6 +chr17 43093220 A G ENST00000471181.6 synonymous_variant ENSG00000012048 ENST00000471181.6 +chr17 43093449 G A ENST00000471181.6 synonymous_variant ENSG00000012048 ENST00000471181.6 +chr17 43093454 C T ENST00000471181.6 missense_variant ENSG00000012048 ENST00000471181.6 +chr17 43097077 A G ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43097346 TA T ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43099629 T A ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43104083 AAAG A ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43121331 T C ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43121362 G C ENST00000471181.6 intron_variant ENSG00000012048 ENST00000471181.6 +chr17 43127281 A G ENST00000471181.6 upstream_gene_variant ENSG00000012048 ENST00000471181.6 +chr17 43129737 G A ENST00000471181.6 upstream_gene_variant ENSG00000012048 ENST00000471181.6 +chr17 43131360 C T ENST00000587322.1 downstream_gene_variant ENSG00000198496 ENST00000587322.1 +chr17 43131380 AAAT A ENST00000587322.1 downstream_gene_variant ENSG00000198496 ENST00000587322.1 +chr17 43135863 A G ENST00000587322.1 downstream_gene_variant ENSG00000198496 ENST00000587322.1 +chr17 43138596 G T ENST00000356906.7 intron_variant&non_coding_transcript_variant ENSG00000198496 ENST00000356906.7 +chr17 43138657 G C ENST00000356906.7 splice_region_variant&non_coding_transcript_exon_variant ENSG00000198496 ENST00000356906.7 +chr17 43140410 C G ENST00000464237.2 upstream_gene_variant ENSG00000267681 ENST00000464237.2 +chr17 43140722 G A ENST00000464237.2 upstream_gene_variant ENSG00000267681 ENST00000464237.2 +chr17 43142914 TTG T ENST00000464237.2 upstream_gene_variant ENSG00000267681 ENST00000464237.2 +chr17 43142940 G A ENST00000464237.2 upstream_gene_variant ENSG00000267681 ENST00000464237.2 +chr17 43142964 GTGTATATATA G ENST00000464237.2 upstream_gene_variant ENSG00000267681 ENST00000464237.2 +chr17 43145635 A C ENST00000464237.2 downstream_gene_variant ENSG00000267681 ENST00000464237.2 +chr17 43145975 G T ENST00000464237.2 downstream_gene_variant ENSG00000267681 ENST00000464237.2 +chr17 43146482 G A ENST00000464237.2 downstream_gene_variant ENSG00000267681 ENST00000464237.2 +chr17 43147572 C T ENST00000464237.2 downstream_gene_variant ENSG00000267681 ENST00000464237.2 +chr17 43147590 A G ENST00000464237.2 downstream_gene_variant ENSG00000267681 ENST00000464237.2 +chr17 43147814 C CCT ENST00000464237.2 downstream_gene_variant ENSG00000267681 ENST00000464237.2 +chr17 43147911 G C ENST00000464237.2 downstream_gene_variant ENSG00000267681 ENST00000464237.2 +chr17 43152446 T C ENST00000467245.5 intron_variant&non_coding_transcript_variant ENSG00000198496 ENST00000467245.5 +chr17 43169893 C T ENST00000341165.10 upstream_gene_variant ENSG00000188554 ENST00000341165.10 diff --git a/tests/test_data/vep_annotation_reporter/preferred_transcripts.list.tsv b/tests/test_data/vep_annotation_reporter/preferred_transcripts.list.tsv new file mode 100644 index 0000000..8f83ad0 --- /dev/null +++ b/tests/test_data/vep_annotation_reporter/preferred_transcripts.list.tsv @@ -0,0 +1,9 @@ +transcript_id +ENST00000587322.1 +ENST00000471181.6 +ENST00000467245.5 +ENST00000464237.2 +ENST00000413465.6 +ENST00000356906.7 +ENST00000341165.10 +ENST00000269305.8 diff --git a/tests/test_data/vep_annotation_reporter/preferred_transcripts.pos.tsv b/tests/test_data/vep_annotation_reporter/preferred_transcripts.pos.tsv new file mode 100644 index 0000000..8956451 --- /dev/null +++ b/tests/test_data/vep_annotation_reporter/preferred_transcripts.pos.tsv @@ -0,0 +1,77 @@ +CHROM POS REF ALT transcript_id +chr17 7663110 G A ENST00000413465.6 +chr17 7663126 GTATATATATAATATATATAATATAATATAA G ENST00000413465.6 +chr17 7663661 A G ENST00000269305.8 +chr17 7666871 T C ENST00000269305.8 +chr17 7668134 G A ENST00000269305.8 +chr17 7668169 A AGCCGTG ENST00000269305.8 +chr17 7674797 T C ENST00000269305.8 +chr17 7675088 C T ENST00000269305.8 +chr17 7675327 C T ENST00000269305.8 +chr17 7675393 CTTT C ENST00000269305.8 +chr17 7675519 A G ENST00000269305.8 +chr17 7676154 G C ENST00000269305.8 +chr17 7676325 CCCCCAGCCCTCCAGGT C ENST00000269305.8 +chr17 7676483 G C ENST00000269305.8 +chr17 7684369 A G ENST00000269305.8 +chr17 43047896 T C ENST00000471181.6 +chr17 43049347 T C ENST00000471181.6 +chr17 43057078 G A ENST00000471181.6 +chr17 43058379 A C ENST00000471181.6 +chr17 43059469 C CACAACA ENST00000471181.6 +chr17 43059636 A G ENST00000471181.6 +chr17 43063808 C T ENST00000471181.6 +chr17 43064004 G A ENST00000471181.6 +chr17 43066555 T C ENST00000471181.6 +chr17 43067763 T C ENST00000471181.6 +chr17 43067787 T C ENST00000471181.6 +chr17 43071077 T C ENST00000471181.6 +chr17 43074086 C T ENST00000471181.6 +chr17 43074584 G C ENST00000471181.6 +chr17 43074658 A T ENST00000471181.6 +chr17 43077743 AT A ENST00000471181.6 +chr17 43077746 T A ENST00000471181.6 +chr17 43077756 T C ENST00000471181.6 +chr17 43077760 G GT ENST00000471181.6 +chr17 43077795 A G ENST00000471181.6 +chr17 43077840 G A ENST00000471181.6 +chr17 43077891 T A ENST00000471181.6 +chr17 43079204 A C ENST00000471181.6 +chr17 43079499 C T ENST00000471181.6 +chr17 43082453 A G ENST00000471181.6 +chr17 43087455 G A ENST00000471181.6 +chr17 43087474 T C ENST00000471181.6 +chr17 43091173 T G ENST00000471181.6 +chr17 43091983 T C ENST00000471181.6 +chr17 43092418 T C ENST00000471181.6 +chr17 43092919 G A ENST00000471181.6 +chr17 43093220 A G ENST00000471181.6 +chr17 43093449 G A ENST00000471181.6 +chr17 43093454 C T ENST00000471181.6 +chr17 43097077 A G ENST00000471181.6 +chr17 43097346 TA T ENST00000471181.6 +chr17 43099629 T A ENST00000471181.6 +chr17 43104083 AAAG A ENST00000471181.6 +chr17 43121331 T C ENST00000471181.6 +chr17 43121362 G C ENST00000471181.6 +chr17 43127281 A G ENST00000471181.6 +chr17 43129737 G A ENST00000471181.6 +chr17 43131360 C T ENST00000587322.1 +chr17 43131380 AAAT A ENST00000587322.1 +chr17 43135863 A G ENST00000587322.1 +chr17 43138596 G T ENST00000356906.7 +chr17 43138657 G C ENST00000356906.7 +chr17 43140410 C G ENST00000464237.2 +chr17 43140722 G A ENST00000464237.2 +chr17 43142914 TTG T ENST00000464237.2 +chr17 43142940 G A ENST00000464237.2 +chr17 43142964 GTGTATATATA G ENST00000464237.2 +chr17 43145635 A C ENST00000464237.2 +chr17 43145975 G T ENST00000464237.2 +chr17 43146482 G A ENST00000464237.2 +chr17 43147572 C T ENST00000464237.2 +chr17 43147590 A G ENST00000464237.2 +chr17 43147814 C CCT ENST00000464237.2 +chr17 43147911 G C ENST00000464237.2 +chr17 43152446 T C ENST00000467245.5 +chr17 43169893 C T ENST00000341165.10 diff --git a/tests/test_vep_annotation_reporter.py b/tests/test_vep_annotation_reporter.py index e6fd30e..722053f 100644 --- a/tests/test_vep_annotation_reporter.py +++ b/tests/test_vep_annotation_reporter.py @@ -88,6 +88,36 @@ def test_multiple_multiallelic_site(self): self.assertTrue(cmp(os.path.join(self.test_data_dir, 'output.multiallelic.tsv'), os.path.join(temp_path.name, 'input.tsv'))) temp_path.cleanup() + def test_preferred_transcripts(self): + temp_path = tempfile.TemporaryDirectory() + os.symlink(os.path.join(self.test_data_dir, 'input.vcf.gz'), os.path.join(temp_path.name, 'input.vcf.gz')) + command = [ + os.path.join(temp_path.name, 'input.vcf.gz'), + 'Consequence', + 'Gene', + 'Feature', + '-t', os.path.join(self.test_data_dir, 'preferred_transcripts.pos.tsv'), + '-p', os.path.join(self.test_data_dir, 'preferred_transcripts.pos.tsv'), + ] + vep_annotation_reporter.main(command) + self.assertTrue(cmp(os.path.join(self.test_data_dir, 'output.preferred_transcript.tsv'), os.path.join(temp_path.name, 'input.tsv'))) + temp_path.cleanup() + + def test_preferred_transcripts_list(self): + temp_path = tempfile.TemporaryDirectory() + os.symlink(os.path.join(self.test_data_dir, 'input.vcf.gz'), os.path.join(temp_path.name, 'input.vcf.gz')) + command = [ + os.path.join(temp_path.name, 'input.vcf.gz'), + 'Consequence', + 'Gene', + 'Feature', + '-t', os.path.join(self.test_data_dir, 'preferred_transcripts.pos.tsv'), + '-p', os.path.join(self.test_data_dir, 'preferred_transcripts.list.tsv'), + ] + vep_annotation_reporter.main(command) + self.assertTrue(cmp(os.path.join(self.test_data_dir, 'output.preferred_transcript.list.tsv'), os.path.join(temp_path.name, 'input.tsv'))) + temp_path.cleanup() + def test_no_input_tsv(self): temp_path = tempfile.TemporaryDirectory() os.symlink(os.path.join(self.test_data_dir, 'input.vcf.gz'), os.path.join(temp_path.name, 'input.vcf.gz')) diff --git a/vatools/vep_annotation_reporter.py b/vatools/vep_annotation_reporter.py index 995dada..1c5cb1a 100644 --- a/vatools/vep_annotation_reporter.py +++ b/vatools/vep_annotation_reporter.py @@ -31,6 +31,12 @@ def define_parser(): help="A TSV report file to add VEP annotations to. Required columns are CHROM, POS, REF, ALT. " +"These are used to match each TSV entry to a VCF entry. Must be tab-delimited." ) + parser.add_argument( + "-p", "--pick-transcript-tsv", + help="A TSV file listing transcript annotations to prioritize. Instead of reporting all transcript annotations " + +"or the ones selected via the VEP --flag_pick option (PICK field), report only the transcripts with the Ensembl transcript IDs listed in this TSV (expected header: transcript_id). " + +"To specify a preferred transcript for each variant, include CHROM, POS, REF, and ALT columns in this file in addition to the transcript_id column." + ) parser.add_argument( "-o", "--output-tsv", help="Path to write the output report TSV file. If not provided, the output TSV will be written " @@ -52,6 +58,33 @@ def create_tsv_reader(input_filehandle): raise Exception("ERROR: Input TSV {} doesn't contain required column '{}'.".format(input_filehandle.name, field)) return tsv_reader +def parse_pick_transcript_tsv(pick_transcript_tsv): + if pick_transcript_tsv is None: + return None + with open(pick_transcript_tsv, 'r') as fh: + tsv_reader = csv.DictReader(fh, delimiter="\t") + if 'transcript_id' not in tsv_reader.fieldnames: + raise Exception("ERROR pick transcript TSV {} doesn't contain required column 'transcript_id'.".format(pick_transcript_tsv)) + if all([header in tsv_reader.fieldnames for header in ['CHROM', 'POS', 'REF', 'ALT']]): + preferred_transcripts = {} + for line in tsv_reader: + if line['CHROM'] not in preferred_transcripts: + preferred_transcripts[line['CHROM']] = {} + + if line['POS'] not in preferred_transcripts[line['CHROM']]: + preferred_transcripts[line['CHROM']][line['POS']] = {} + + if line['REF'] not in preferred_transcripts[line['CHROM']][line['POS']]: + preferred_transcripts[line['CHROM']][line['POS']][line['REF']] = {} + + preferred_transcripts[line['CHROM']][line['POS']][line['REF']][line['ALT']] = line['transcript_id'] + return preferred_transcripts + else: + preferred_transcripts = [] + for line in tsv_reader: + preferred_transcripts.append(line['transcript_id']) + return preferred_transcripts + def parse_csq_header(vcf_reader): format_pattern = re.compile('Format: (.*)') return format_pattern.search(vcf_reader.header.get_info_field_info('CSQ').description).group(1).split('|') @@ -99,25 +132,40 @@ def resolve_alleles(entry, csq_alleles): alleles[alt] = alt return alleles -def transcript_for_alt(transcripts, alt): - no_pick_value = False - for transcript in transcripts[alt]: - if 'PICK' in transcript and transcript['PICK'] == '1': - return transcript, no_pick_value - - if 'PICK' in transcripts[alt][0]: +def transcript_for_alt(transcripts, alt, preferred_transcripts): + no_pick_value = None + if preferred_transcripts is not None: + if type(preferred_transcripts) is list: + transcripts_to_include = [] + for transcript in transcripts[alt]: + if transcript['Feature'] in preferred_transcripts: + transcripts_to_include.append(transcript) + else: + transcripts_to_include = [] + for transcript in transcripts[alt]: + if transcript['Feature'] == preferred_transcripts: + transcripts_to_include.append(transcript) + if len(transcripts_to_include) == 0: + transcripts_to_include = transcripts[alt] + elif 'PICK' in transcripts[alt][0]: + for transcript in transcripts[alt]: + if 'PICK' in transcript and transcript['PICK'] == '1': + return transcript, False no_pick_value = True + transcripts_to_include = transcripts[alt] + else: + transcripts_to_include = transcripts[alt] merged_transcripts = {} - for key in transcripts[alt][0].keys(): - merged_transcripts[key] = ",".join([transcript[key] for transcript in transcripts[alt]]) + for key in transcripts_to_include[0].keys(): + merged_transcripts[key] = ",".join([transcript[key] for transcript in transcripts_to_include]) return merged_transcripts, no_pick_value def decode_hex(match_string): hex_string = match_string.group(0).replace('%', '') return binascii.unhexlify(hex_string).decode('utf-8') -def extract_vep_fields(args): +def extract_vep_fields(args, preferred_transcripts): vcf_reader = create_vcf_reader(args) csq_fields = parse_csq_header(vcf_reader) vep = {} @@ -148,7 +196,11 @@ def extract_vep_fields(args): alt = alt.serialize() if alt not in vep[chr][pos][ref]: if alleles_dict[alt] in transcripts: - values, no_pick_value = transcript_for_alt(transcripts, alleles_dict[alt]) + if type(preferred_transcripts) is dict: + p = preferred_transcripts[chr][pos][ref][alt] + else: + p = preferred_transcripts + values, no_pick_value = transcript_for_alt(transcripts, alleles_dict[alt], p) if no_pick_value: logging.warning("VCF is annotated with the PICK flag but no PICK'ed transcript found for variant {} {} {} {}. Writing values for all transcripts.".format(chr, pos, ref, alt)) vep[chr][pos][ref][alt] = values @@ -177,7 +229,8 @@ def main(args_input = sys.argv[1:]): parser = define_parser() args = parser.parse_args(args_input) - vep = extract_vep_fields(args) + preferred_transcripts = parse_pick_transcript_tsv(args.pick_transcript_tsv) + vep = extract_vep_fields(args, preferred_transcripts) if args.output_tsv: output_file = args.output_tsv From a5474b700f0309088d06c49dcc8662892f6f3af4 Mon Sep 17 00:00:00 2001 From: Susanna Kiwala Date: Tue, 4 Mar 2025 07:49:00 -0600 Subject: [PATCH 2/3] Use PICK'ed value as a fallback when none of the preferred transcripts match --- vatools/vep_annotation_reporter.py | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/vatools/vep_annotation_reporter.py b/vatools/vep_annotation_reporter.py index 1c5cb1a..be12cc9 100644 --- a/vatools/vep_annotation_reporter.py +++ b/vatools/vep_annotation_reporter.py @@ -133,7 +133,7 @@ def resolve_alleles(entry, csq_alleles): return alleles def transcript_for_alt(transcripts, alt, preferred_transcripts): - no_pick_value = None + no_preferred_transcript = None if preferred_transcripts is not None: if type(preferred_transcripts) is list: transcripts_to_include = [] @@ -145,21 +145,24 @@ def transcript_for_alt(transcripts, alt, preferred_transcripts): for transcript in transcripts[alt]: if transcript['Feature'] == preferred_transcripts: transcripts_to_include.append(transcript) - if len(transcripts_to_include) == 0: - transcripts_to_include = transcripts[alt] - elif 'PICK' in transcripts[alt][0]: + if len(transcripts_to_include) > 0: + return merge_transcripts(transcripts_to_include), None, False + else: + no_preferred_transcript = True + + if 'PICK' in transcripts[alt][0]: for transcript in transcripts[alt]: if 'PICK' in transcript and transcript['PICK'] == '1': - return transcript, False - no_pick_value = True - transcripts_to_include = transcripts[alt] + return transcript, False, no_preferred_transcript + return merge_transcripts(transcripts[alt]), True, no_preferred_transcript else: - transcripts_to_include = transcripts[alt] + return merge_transcripts(transcripts[alt]), None, no_preferred_transcript +def merge_transcripts(transcripts_to_include): merged_transcripts = {} for key in transcripts_to_include[0].keys(): merged_transcripts[key] = ",".join([transcript[key] for transcript in transcripts_to_include]) - return merged_transcripts, no_pick_value + return merged_transcripts def decode_hex(match_string): hex_string = match_string.group(0).replace('%', '') @@ -197,10 +200,12 @@ def extract_vep_fields(args, preferred_transcripts): if alt not in vep[chr][pos][ref]: if alleles_dict[alt] in transcripts: if type(preferred_transcripts) is dict: - p = preferred_transcripts[chr][pos][ref][alt] + p = preferred_transcripts.get(chr, {}).get(pos, {}).get(ref, {}).get(alt, {}) else: p = preferred_transcripts - values, no_pick_value = transcript_for_alt(transcripts, alleles_dict[alt], p) + values, no_pick_value, no_preferred_transcript = transcript_for_alt(transcripts, alleles_dict[alt], p) + if no_preferred_transcript: + logging.warning("Preferred transcripts TSV provided but no matching transcript found for variant {} {} {} {}. Writing value for PICK'ed transcript.".format(chr, pos, ref, alt)) if no_pick_value: logging.warning("VCF is annotated with the PICK flag but no PICK'ed transcript found for variant {} {} {} {}. Writing values for all transcripts.".format(chr, pos, ref, alt)) vep[chr][pos][ref][alt] = values From 9f72b39699f4e3efd9e3b30938fe0ddf5d5b3572 Mon Sep 17 00:00:00 2001 From: Susanna Kiwala Date: Tue, 4 Mar 2025 08:29:15 -0600 Subject: [PATCH 3/3] Rename option --- vatools/vep_annotation_reporter.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vatools/vep_annotation_reporter.py b/vatools/vep_annotation_reporter.py index be12cc9..54a5614 100644 --- a/vatools/vep_annotation_reporter.py +++ b/vatools/vep_annotation_reporter.py @@ -32,7 +32,7 @@ def define_parser(): +"These are used to match each TSV entry to a VCF entry. Must be tab-delimited." ) parser.add_argument( - "-p", "--pick-transcript-tsv", + "-p", "--preferred-transcripts-tsv", help="A TSV file listing transcript annotations to prioritize. Instead of reporting all transcript annotations " +"or the ones selected via the VEP --flag_pick option (PICK field), report only the transcripts with the Ensembl transcript IDs listed in this TSV (expected header: transcript_id). " +"To specify a preferred transcript for each variant, include CHROM, POS, REF, and ALT columns in this file in addition to the transcript_id column." @@ -58,13 +58,13 @@ def create_tsv_reader(input_filehandle): raise Exception("ERROR: Input TSV {} doesn't contain required column '{}'.".format(input_filehandle.name, field)) return tsv_reader -def parse_pick_transcript_tsv(pick_transcript_tsv): - if pick_transcript_tsv is None: +def parse_preferred_transcripts_tsv(preferred_transcripts_tsv): + if preferred_transcripts_tsv is None: return None - with open(pick_transcript_tsv, 'r') as fh: + with open(preferred_transcripts_tsv, 'r') as fh: tsv_reader = csv.DictReader(fh, delimiter="\t") if 'transcript_id' not in tsv_reader.fieldnames: - raise Exception("ERROR pick transcript TSV {} doesn't contain required column 'transcript_id'.".format(pick_transcript_tsv)) + raise Exception("ERROR preferred transcripts TSV {} doesn't contain required column 'transcript_id'.".format(preferred_transcripts_tsv)) if all([header in tsv_reader.fieldnames for header in ['CHROM', 'POS', 'REF', 'ALT']]): preferred_transcripts = {} for line in tsv_reader: @@ -234,7 +234,7 @@ def main(args_input = sys.argv[1:]): parser = define_parser() args = parser.parse_args(args_input) - preferred_transcripts = parse_pick_transcript_tsv(args.pick_transcript_tsv) + preferred_transcripts = parse_preferred_transcripts_tsv(args.preferred_transcripts_tsv) vep = extract_vep_fields(args, preferred_transcripts) if args.output_tsv: