diff --git a/R_Iris/CNA_processing.R b/R_Iris/CNA_processing.R new file mode 100644 index 0000000..c57458b --- /dev/null +++ b/R_Iris/CNA_processing.R @@ -0,0 +1,127 @@ +# library(data.table) +# library(stringr) +# library(tidyr) +# library(dplyr) + +#' @export +CNA_processing = function( + master.ref, results.dir, dmp.dir = '/data1/core006/access/production/resources/cbioportal/versions/msk-impact/msk_solid_heme/', + gene_set = "v1" +) { + # Define gene sets + access.cna.genes = c('AR','BRCA1','BRCA2','CDK4','EGFR','ERBB2','MET','MDM2','MLH1','MSH2','MSH6','MYC') + access.v2.cna.genes = c( + "ALK", "APC", "AR", "ARID1A", "ASXL1", "ATM", "BAP1", "BRCA1", "BRCA2", "CDH1", "CDK12", "CDK4", "CHEK2", + "DNMT3A", "EGFR", "ERBB2", "ERBB3", "ERCC2", "FBXW7", "FGFR2", "FGFR3", "KDM6A", "KIT", "MAP2K1", "MAP2K2", + "MDM2", "MET", "MLH1", "MSH2", "MSH6", "MTOR", "MYC", "MYCN", "NF1", "PALB2", "PDGFRA", "PIK3CA", "PMS2", + "PTCH1", "RB1", "RET", "SMAD4", "TP53", "TSC1", "TSC2" + ) + + # Select the appropriate gene set + selected_genes = if (gene_set == "v2") access.v2.cna.genes else access.cna.genes + + # DMP CNA calls -------------------------------------------------------- + DMP.cna <- fread(paste0(dmp.dir, '/data_CNA.txt')) + + dmp.ids <- master.ref$dmp_patient_id[(master.ref$dmp_patient_id != "") & !is.na(master.ref$dmp_patient_id)] + dmp.pattern <- paste0(dmp.ids, collapse = '|') + + DMP.cna.edited = DMP.cna[ + !Hugo_Symbol %in% c('CDKN2Ap14ARF', 'CDKN2Ap16INK4A'), + c('Hugo_Symbol', grep(dmp.pattern, colnames(DMP.cna), value = T)), + with = F + ] %>% + melt.data.table( + id.vars = 'Hugo_Symbol', + variable.name = 'Tumor_Sample_Barcode', + value.name = 'fc' + ) %>% + filter(fc != 0) %>% + mutate( + CNA_tumor = case_when(fc < 0 ~ 'HOMDEL', fc > 0 ~ 'AMP'), + dmp_patient_id = gsub('-T..-IM.', '', Tumor_Sample_Barcode) + ) %>% + merge( + unique(master.ref[!is.na(dmp_patient_id), .(cmo_patient_id, dmp_patient_id)]), + by = 'dmp_patient_id', + all.x = T + ) %>% + transmute(Hugo_Symbol, CNA_tumor, cmo_patient_id, dmp_patient_id) %>% + unique() %>% + data.table() + + # Access CNA calls -------------------------------------------------------- + access.cna = do.call(rbind, lapply(master.ref$cna_path, function(x) { + fread(x) %>% + transmute( + Tumor_Sample_Barcode = gsub('\\.', '-', gsub('_mean_cvg', '', sample)), + Hugo_Symbol = region, + p.adj, + fc + ) %>% + filter(p.adj <= 0.05) %>% + merge( + unique(master.ref[, .(cmo_patient_id, cmo_sample_id_plasma)]), + by.x = 'Tumor_Sample_Barcode', + by.y = 'cmo_sample_id_plasma', + all.x = T + ) %>% + data.table() + })) + + merge( + access.cna, + DMP.cna.edited, + by = c('Hugo_Symbol', 'cmo_patient_id'), + all.x = T + ) %>% + mutate( + CNA = case_when( + (fc >= 1.5 & Hugo_Symbol %in% selected_genes) ~ 'AMP', + (fc <= -1.5 & Hugo_Symbol %in% selected_genes) ~ 'HOMDEL', + (fc >= 1.2 & !is.na(CNA_tumor)) ~ 'AMP', + (fc <= -1.2 & !is.na(CNA_tumor)) ~ 'HOMDEL', + TRUE ~ '' + ) + ) %>% + filter(CNA != '') %>% + data.table() -> access.call.set + + # Compare tumor and plasma CNAs + condition <- any(access.call.set$CNA_tumor != access.call.set$CNA) + if (condition || is.na(condition)) warning('Tumor and plasma have conflicting CNA detected!') + + # Save results + dir.create(paste0(results.dir, '/CNA_final_call_set'), showWarnings = FALSE) + lapply(unique(master.ref$cmo_sample_id_plasma), function(x) { + write.table(access.call.set[Tumor_Sample_Barcode == x], + paste0(results.dir, '/CNA_final_call_set/', x, '_cna_final_call_set.txt'), + sep = '\t', quote = F, row.names = F) + }) +} +# Executable ----------------------------------------------------------------------------------------------------------- +suppressPackageStartupMessages({ + library(argparse) + library(data.table) + library(stringr) + library(dplyr) +}) + +if (!interactive()) { + + parser = ArgumentParser() + parser$add_argument('-m', '--masterref', type = 'character', help = 'File path to master reference file') + parser$add_argument('-o', '--resultsdir', type = 'character', help = 'Output directory') + parser$add_argument('-dmp', '--dmpdir', type = 'character', default = '/data1/core006/access/production/resources/cbioportal/versions/msk-impact/msk_solid_heme/', + help = 'Directory of clinical DMP IMPACT repository [default]') + parser$add_argument('-g', '--gene-set', type = 'character', default = 'v1', + help = 'Gene set to use: "v1" for access.cna.genes or "v2" for access.v2.cna.genes [default: v1]') + args = parser$parse_args() + + master.ref = args$masterref + results.dir = args$resultsdir + dmp.dir = args$dmpdir + gene_set = args$gene_set + + CNA_processing(fread(master.ref), results.dir, dmp.dir, gene_set) +} diff --git a/R_Iris/compile_reads_all.R b/R_Iris/compile_reads_all.R new file mode 100644 index 0000000..4a8dab6 --- /dev/null +++ b/R_Iris/compile_reads_all.R @@ -0,0 +1,766 @@ +#library(data.table) +#library(tidyr) +#library(stringr) +#library(dplyr) + + +#' @export +compile_reads_all <- function(master.ref, + results.dir, + project.ID, + pooled.bam.dir = "/data1/core006/access/production/resources/msk-access/v2.0/novaseq_curated_duplex_bams_dmp/current/", + fasta.path = "/data1/core006/access/production/resources/reference/versions/hg19_virus_special/hg19_virus.fasta", + genotyper.path = "/data1/core006/access/production/resources/tools/GetBaseCountsMultiSample/current/GetBaseCountsMultiSample", + dmp.dir = "/data1/core006/access/production/resources/cbioportal/versions/msk-impact/msk_solid_heme/", + mirror.bam.dir = "/juno/dmp/share/irb12_245", + mirror.access.bam.dir = "/juno/dmp/share/access_12_245/", + dmp.key.path = "/juno/dmp/request/12-245/key.txt", + access.key.path = "/juno/dmp/request/ACCESS-12-245/key.txtt") { + # # test input section ----------------------------------------------------------- + # master.ref = fread('/juno/data1/core006/bergerm1/bergerlab/zhengy1/access_data_analysis/data/example_master_file.csv') + # results.dir = paste0('/juno/data1/core006/bergerm1/MSK-ACCESS/ACCESS-Projects/test_access/access_data_analysis/output_',format(Sys.time(),'%m%d%y')) + # pooled.bam.dir = '/ifs/data1/core006/bergerm1/ACCESS-Projects/novaseq_curated_duplex_v2/' + # fasta.path = '/data1/core006/access/production/resources/reference/current/Homo_sapiens_assembly19.fasta' + # genotyper.path = '/ifs/data1/core006/bergerm1/Innovation/software/maysun/GetBaseCountsMultiSample/GetBaseCountsMultiSample' + # dmp.dir = '/ifs/data1/core006/bergerm1/zhengy1/dmp/mskimpact/' + # mirror.bam.dir = '/ifs/dmpshare/share/irb12_245/' + # dmp.key.path = '/ifs/dmprequest/12-245/key.txt' + # setting up directory ---------------------------------------------------- + dir.create(results.dir) + # make tmp directory in output directory + dir.create(paste0(results.dir, "/tmp")) + # checking virtualenv ----------------------------------------------------- + geno.bash <- system("which genotype_variants", intern = T) + if (length(geno.bash) == 0) { + # print(pyclone.path) + stop( + "needs to run \nsource /home/accessbot/miniconda3/bin/activate && conda activate genotype-variants-0.3.0" + ) + } + + # data from DMP ----------------------------------------------------------- + DMP.key <- as.data.table(read.csv(dmp.key.path, header = FALSE, sep = ",")) + if (any(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-IH.|-T..-IM.", "", DMP.key[grepl("IH|IM", V1)]$V1))) { + message(paste0( + "These DMP IDs are not found in DMP key file: ", + paste0(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id[which(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% + gsub("-T..-IH.|-T..-IM.", "", DMP.key[grepl("IH|IM", V1)]$V1))], collapse = " ,") + )) + } + # data from DMP ACCESS ---------------------------------------------------- + access.key <- + as.data.table(read.csv(access.key.path, header = FALSE, sep = ",")) + if (any(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1))) { + message(paste0( + "These DMP IDs are not found in DMP ACCESS key file: ", + paste0(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id[which(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% + gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1))], collapse = " ,") + )) + } + + DMP.maf <- + fread(paste0(dmp.dir, "/data_mutations_extended.txt")) %>% + filter(Mutation_Status != "GERMLINE") %>% + data.table() + DMP.RET.maf <- + DMP.maf[grepl(paste0(unique(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id), collapse = "|"), Tumor_Sample_Barcode), ] + + # Pooled normal samples --------------------------------------------------- + pooled.bams <- + list.files(pooled.bam.dir, pattern = "\\.bam$", full.names = T) + + # For each patient -------------------------------------------------------- + x <- unique(master.ref$cmo_patient_id)[1] + # x = unique(master.ref$cmo_sample_id_plasma)[16] + # x = 'C-YW82CY' + print("Compiling reads per patient") + all.fillout.id <- + lapply(unique(master.ref$cmo_patient_id), function(x) { + print(x) + dir.create(paste0(results.dir, "/", x)) + dmp_id <- + unique(master.ref[cmo_patient_id == x]$dmp_patient_id) + # sample sheet with colummns -- TSB, sample type, bam path, treatm -------- + # need to get DMP tumor, DMP normal, plasma, plasma normal (if there is any), pooled normal + # DMP sample sheet + if (is.na(dmp_id) | dmp_id == '') { + dmp.sample.sheet <- NULL + } else { + all.dmp.ids.IM <- + DMP.key[grepl(paste0(dmp_id, "-(T|N)..-IM."), V1)]$V1 + all.dmp.ids.IH <- + DMP.key[grepl(paste0(dmp_id, "-(T|N)..-IH."), V1)]$V1 + all.dmp.ids.XS <- + access.key[grepl(paste0(dmp_id, "-T..-XS."), V1)]$V1 + all.dmp.ids.normal.XS <- + access.key[grepl(paste0(dmp_id, "-N..-XS."), V1)]$V1 + all.dmp.ids <- c(all.dmp.ids.IM, all.dmp.ids.IH) + all.dmp.bam.ids.IM <- + DMP.key[grepl(paste0(dmp_id, "-(T|N)..-IM."), V1)]$V2 + all.dmp.bam.ids.IH <- + DMP.key[grepl(paste0(dmp_id, "-(T|N)..-IH."), V1)]$V2 + all.dmp.bam.ids.XS <- + gsub("-standard|-unfilter|-simplex|-duplex", + "", + access.key[grepl(paste0(dmp_id, "-T..-XS."), V1)]$V2) + all.dmp.bam.ids.normal.XS <- + gsub("-standard|-unfilter|-simplex|-duplex", + "", + access.key[grepl(paste0(dmp_id, "-N..-XS."), V1)]$V2) + all.dmp.bam.ids <- + c(all.dmp.bam.ids.IM, + all.dmp.bam.ids.IH) + if (length(all.dmp.ids) == 0) { + dmp.sample.sheet <- NULL + } else{ + bam.sub.dir <- + unlist(lapply(strsplit(substr( + all.dmp.bam.ids, 1, 2 + ), ""), function(x) { + paste0(x, collapse = "/") + })) + dmp.sample.sheet <- data.frame( + Sample_Barcode = all.dmp.ids, + standard_bam = paste0( + mirror.bam.dir, + "/", + bam.sub.dir, + "/", + all.dmp.bam.ids, + ".bam" + ), + duplex_bam = NA_character_, + simplex_bam = NA_character_ + ) %>% + mutate( + cmo_patient_id = x, + Sample_Type = ifelse( + grepl("-T", Sample_Barcode), + "DMP_Tumor", + "DMP_Normal" + ), + dmp_patient_id = dmp_id + ) + } + if (length(all.dmp.ids.XS) == 0) { + access.sample.sheet <- NULL + } else{ + access.bam.sub.dir <- + unlist(lapply(strsplit( + substr(all.dmp.bam.ids.XS, 1, 2), "" + ), function(x) { + paste0(x, collapse = "/") + })) + access.sample.sheet <- unique( + data.frame( + Sample_Barcode = all.dmp.ids.XS, + standard_bam = NA_character_, + duplex_bam = paste0( + mirror.access.bam.dir, + "/", + access.bam.sub.dir, + "/", + all.dmp.bam.ids.XS, + "-duplex.bam" + ), + simplex_bam = paste0( + mirror.access.bam.dir, + "/", + access.bam.sub.dir, + "/", + all.dmp.bam.ids.XS, + "-simplex.bam" + ) + ) %>% + mutate( + cmo_patient_id = x, + Sample_Type = ifelse( + grepl("-T", Sample_Barcode), + "duplex", + "unfilterednormal" + ), + dmp_patient_id = dmp_id + ) + ) + access.normal.bam.sub.dir <- + unlist(lapply(strsplit( + substr(all.dmp.bam.ids.normal.XS, 1, 2), "" + ), function(x) { + paste0(x, collapse = "/") + })) + access.normal.sample.sheet <- unique( + data.frame( + Sample_Barcode = all.dmp.ids.normal.XS, + standard_bam = paste0( + mirror.access.bam.dir, + "/", + access.normal.bam.sub.dir, + "/", + all.dmp.bam.ids.normal.XS, + "-unfilter.bam" + ), + duplex_bam = NA_character_, + simplex_bam = NA_character_ + ) %>% + mutate( + cmo_patient_id = x, + Sample_Type = ifelse( + grepl("-N", Sample_Barcode), + "unfilterednormal", + "duplex" + ), + dmp_patient_id = dmp_id + ) + ) + access.sample.sheet = bind_rows(access.sample.sheet, access.normal.sample.sheet) + } + if (!is.null(dmp.sample.sheet) & + !is.null(access.sample.sheet)) { + print("DMP IMPACT and DMP ACCESS samples are available") + dmp.sample.sheet <- + bind_rows(dmp.sample.sheet, access.sample.sheet) + + } else if (is.null(dmp.sample.sheet) & + !is.null(access.sample.sheet)) { + print("DMP IMPACT samples are NOT available and DMP ACCESS samples are available") + dmp.sample.sheet <- access.sample.sheet + } else if (!is.null(dmp.sample.sheet) & + is.null(access.sample.sheet)) { + print("DMP IMPACT samples are available and DMP ACCESS samples are NOT available") + dmp.sample.sheet <- dmp.sample.sheet + } else{ + print("No DMP IMPACT samples or DMP ACCESS samples are available") + dmp.sample.sheet <- NULL + } + } + # total sample sheet + sample.sheet <- master.ref[cmo_patient_id == x, + # plasma bams -- duplex and simplex bam + .( + Sample_Barcode = as.character(cmo_sample_id_plasma), + standard_bam = NA_character_, + duplex_bam = bam_path_plasma_duplex, + simplex_bam = bam_path_plasma_simplex, + cmo_patient_id, + Sample_Type = "duplex", + dmp_patient_id + )] %>% + merge(rbind(unique(master.ref[cmo_patient_id == x & + paired == 'Paired', + # buffy coat + DMP bams -- standard bam only + .( + Sample_Barcode = as.character(cmo_sample_id_normal), + standard_bam = as.character(bam_path_normal), + duplex_bam = NA_character_, + simplex_bam = NA_character_, + cmo_patient_id, + Sample_Type = "unfilterednormal", + dmp_patient_id + )]), + dmp.sample.sheet), all = T) + # catch '' or NA for empty cells for some cmo_sample_id_normal + sample.sheet <- + sample.sheet[!is.na(Sample_Barcode) | + Sample_Barcode != ""] + # Check for existence of bam files and remove rows if files are missing + rows_to_remove <- c() + for (i in 1:nrow(sample.sheet)) { + bam_files <- c(sample.sheet$standard_bam[i], sample.sheet$duplex_bam[i], sample.sheet$simplex_bam[i]) + bam_files <- bam_files[!is.na(bam_files) & bam_files != ""] # remove NAs and empty strings + + for (bam_file in bam_files) { + if (!file.exists(bam_file)) { + message(paste0("Warning: BAM file does not exist: ", bam_file, ". Removing sample ", sample.sheet$Sample_Barcode[i], " from sample sheet.")) + rows_to_remove <- c(rows_to_remove, i) + break # No need to check other bams for this row + } + } + } + + if (length(rows_to_remove) > 0) { + sample.sheet <- sample.sheet[-unique(rows_to_remove),] + } + write.table( + sample.sheet, + paste0(results.dir, "/", x, "/", x, "_sample_sheet.tsv"), + sep = "\t", + quote = F, + row.names = F + ) + # piece together all unique calls ----------------------------------------- + # get duplex calls + duplex.calls <- + do.call(rbind, lapply(master.ref[cmo_patient_id == x]$maf_path, function(x) { + # fread(x) %>% filter(as.numeric(D_t_alt_count_fragment) > 0) %>% data.table() + selectcolumns <- + c( + "Hugo_Symbol", + "Entrez_Gene_Id", + "Center", + "NCBI_Build", + "Chromosome", + "Start_Position", + "End_Position", + "Strand", + "Variant_Classification", + "Variant_Type", + "Reference_Allele", + "Tumor_Seq_Allele1", + "Tumor_Seq_Allele2", + "dbSNP_RS", + "dbSNP_Val_Status", + "Tumor_Sample_Barcode", + "caller_Norm_Sample_Barcode", + "Match_Norm_Seq_Allele1", + "Match_Norm_Seq_Allele2", + "Tumor_Validation_Allele1", + "Tumor_Validation_Allele2", + "Match_Norm_Validation_Allele1", + "Match_Norm_Validation_Allele2", + "Verification_Status", + "Validation_Status", + "Mutation_Status", + "Sequencing_Phase", + "Sequence_Source", + "Validation_Method", + "Score", + "BAM_File", + "Sequencer", + "Tumor_Sample_UUID", + "Matched_Norm_Sample_UUID", + "HGVSc", + "HGVSp", + "HGVSp_Short", + "Transcript_ID", + "Exon_Number", + "caller_t_depth", + "caller_t_ref_count", + "caller_t_alt_count", + "caller_n_depth", + "caller_n_ref_count", + "caller_n_alt_count", + "all_effects", + "Allele", + "Gene", + "Feature", + "Feature_type", + "Consequence", + "cDNA_position", + "CDS_position", + "Protein_position", + "Amino_acids", + "Codons", + "Existing_variation", + "ALLELE_NUM", + "DISTANCE", + "STRAND_VEP", + "SYMBOL", + "SYMBOL_SOURCE", + "HGNC_ID", + "BIOTYPE", + "CANONICAL", + "CCDS", + "ENSP", + "SWISSPROT", + "TREMBL", + "UNIPARC", + "RefSeq", + "SIFT", + "PolyPhen", + "EXON", + "INTRON", + "DOMAINS", + "AF", + "AFR_AF", + "AMR_AF", + "ASN_AF", + "EAS_AF", + "EUR_AF", + "SAS_AF", + "AA_AF", + "EA_AF", + "CLIN_SIG", + "SOMATIC", + "PUBMED", + "MOTIF_NAME", + "MOTIF_POS", + "HIGH_INF_POS", + "MOTIF_SCORE_CHANGE", + "IMPACT", + "PICK", + "VARIANT_CLASS", + "TSL", + "HGVS_OFFSET", + "PHENO", + "MINIMISED", + "ExAC_AF", + "ExAC_AF_AFR", + "ExAC_AF_AMR", + "ExAC_AF_EAS", + "ExAC_AF_FIN", + "ExAC_AF_NFE", + "ExAC_AF_OTH", + "ExAC_AF_SAS", + "GENE_PHENO", + "FILTER", + "flanking_bps", + "variant_id", + "variant_qual", + "ExAC_AF_Adj", + "ExAC_AC_AN_Adj", + "ExAC_AC_AN", + "ExAC_AC_AN_AFR", + "ExAC_AC_AN_AMR", + "ExAC_AC_AN_EAS", + "ExAC_AC_AN_FIN", + "ExAC_AC_AN_NFE", + "ExAC_AC_AN_OTH", + "ExAC_AC_AN_SAS", + "ExAC_FILTER", + "gnomAD_AF", + "gnomAD_AFR_AF", + "gnomAD_AMR_AF", + "gnomAD_ASJ_AF", + "gnomAD_EAS_AF", + "gnomAD_FIN_AF", + "gnomAD_NFE_AF", + "gnomAD_OTH_AF", + "gnomAD_SAS_AF", + "CallMethod", + "VCF_POS", + "VCF_REF", + "VCF_ALT", + "hotspot_whitelist", + "Status", + "D_t_alt_count_fragment", + "D_t_ref_count_fragment", + "D_t_vaf_fragment", + "SD_t_alt_count_fragment", + "SD_t_ref_count_fragment", + "SD_t_vaf_fragment", + "Matched_Norm_Sample_Barcode", + "Matched_Norm_Bamfile", + "n_alt_count_fragment", + "n_ref_count_fragment", + "n_vaf_fragment" + ) + if ("Status" %in% names(fread(x))) { + fread(x) %>% select(one_of(selectcolumns)) %>% subset((Status == "") | + (is.na(Status))) + } else { + fread(x) %>% select(one_of(selectcolumns)) + } + # fread(x) + # %>% + # filter(as.numeric(t_alt_count) > 0) %>% + # data.table() + })) + # get impact calls + impact.calls <- + DMP.RET.maf[Tumor_Sample_Barcode %in% sample.sheet$Sample_Barcode] + write.table( + impact.calls[, .( + Hugo_Symbol, + Chromosome, + Start_Position, + End_Position, + Variant_Classification, + HGVSp_Short, + Reference_Allele, + Tumor_Seq_Allele2 + )], + paste0(results.dir, "/", x, "/", x, "_impact_calls.maf"), + sep = "\t", + quote = F, + row.names = F + ) + # combining plasma and impact calls + all.calls <- + rbind(duplex.calls[, intersect(colnames(duplex.calls), colnames(DMP.RET.maf)), with = F], + impact.calls[, intersect(colnames(duplex.calls), colnames(DMP.RET.maf)), with = F]) + # getting rid of duplicate calls and take the first occurence of all events + all.calls <- + all.calls[which(!duplicated(all.calls[, .( + Hugo_Symbol, + Chromosome, + Start_Position, + End_Position, + Variant_Classification, + HGVSp_Short, + Reference_Allele, + Tumor_Seq_Allele2 + )])), ] %>% + mutate( + t_ref_count = 0, + t_alt_count = 0, + n_ref_count = 0, + n_alt_count = 0, + Matched_Norm_Sample_Barcode = NA + ) %>% + filter( + Variant_Classification != "Silent" & + !grepl("RP11-", Hugo_Symbol) & + !grepl("Intron", Variant_Classification) + ) + write.table( + all.calls, + paste0(results.dir, "/", x, "/", x, "_all_unique_calls.maf"), + sep = "\t", + quote = F, + row.names = F + ) + # tagging hotspots + system( + paste0( + 'sbatch', + " --partition=cmobic_cpu", + ' --mem=4G', # Memory (4GB) + ' --chdir=', results.dir, "/", x, "/", # Working directory + ' --output=hotspot.o', # Standard output + ' --error=hotspot.e', # Standard error + ' --time=00:59:00', # Time limit (59 minutes) + ' --job-name=tag_hotspot', # Job name + ' --wrap="python /data1/core006/access/production/workflows/access_workflows/v1/pipeline_2.0.0/ACCESS-Pipeline/cwl_tools/hotspots/tag_hotspots.py', + ' -m ', results.dir, "/", x, "/", x, '_all_unique_calls.maf', + ' -itxt /data1/core006/access/production/resources/msk-access/v2.0/regions_of_interest/versions/v1.0/hotspot-list-union-v1-v2_with_TERT.txt', + ' -o ', results.dir, "/", x, "/", x, '_all_unique_calls_hotspots.maf', + ' -outdir ', results.dir, "/", x, "/", x, '"' + ) + ) + # genotype all bams in this patient directory ----------------------------- + # genotyping plasma samples -- plasma duplex&simplex, plasma normal, pooled plasma normal + write.table( + sample.sheet[, .( + sample_id = Sample_Barcode, + maf = paste0(results.dir, "/", x, "/", x, "_all_unique_calls.maf"), + standard_bam, + duplex_bam, + simplex_bam + )], + paste0(results.dir, "/", x, "/", x, "_genotype_metadata.tsv"), + sep = "\t", + quote = F, + row.names = F + ) + job.ids <- system( + paste0( + "sbatch", + " --partition=cmobic_cpu", + " --chdir=", results.dir, "/", x, + " --time=12:00:00", + " --mem=8G", + " --output=genotyping.o", + " --error=genotyping.e", + " --job-name=genotype_variants", + ' --wrap="genotype_variants small_variants multiple-samples', + " -i ", results.dir, "/", x, "/", x, "_genotype_metadata.tsv", + " -r ", fasta.path, + " -g ", genotyper.path, + ' -v DEBUG"' + ), + intern = T + ) + job.ids <- as.numeric(gsub("Job <|> is.*.$", "", job.ids)) + }) + + + # Get base count multi sample in pooled normal ---------------------------- + # all all unique calls in entire cohort + print("Compiling reads in pooled samples") + dir.create(paste0(results.dir, "/pooled")) + all.all.unique.mafs <- + do.call(rbind, lapply(unique(master.ref$cmo_patient_id), function(x) { + fread(list.files( + paste0(results.dir, "/", x), + pattern = "unique_calls.maf$", + full.names = T + )) + })) + all.all.unique.mafs <- + all.all.unique.mafs[!duplicated(all.all.unique.mafs[, .( + Hugo_Symbol, + Chromosome, + Start_Position, + End_Position, + Variant_Classification, + HGVSp_Short, + Reference_Allele, + Tumor_Seq_Allele2 + )]),] + write.table( + all.all.unique.mafs, + paste0(results.dir, "/pooled/all_all_unique.maf"), + sep = "\t", + quote = F, + row.names = F + ) + + write.table( + data.frame( + sample_id = gsub("^.*./|.bam", "", pooled.bams), + maf = paste0(results.dir, "/pooled/all_all_unique.maf"), + standard_bam = as.character(pooled.bams), + duplex_bam = "", + simplex_bam = NA_character_ + ), + paste0(results.dir, "/pooled/pooled_metadata.tsv"), + sep = "\t", + quote = F, + row.names = F + ) + + pooled.sample.job.id <- system( + paste0( + "sbatch", + " --partition=cmobic_cpu", + " --chdir=", results.dir, "/pooled", + " --time=12:00:00", + " --mem=8G", + " --output=genotyping.o", + " --error=genotyping.e", + paste0(unlist(all.fillout.id), collapse = ":"), + " --job-name=pooled_genotype_variants", + ' --wrap="genotype_variants small_variants multiple-samples', + " -i ", results.dir, "/pooled/pooled_metadata.tsv", + " -r ", fasta.path, + " -g ", genotyper.path, + ' -v DEBUG"' + ), + intern = T + ) + pooled.sample.job.id <- + as.numeric(gsub("Job <|> is.*.$", "", pooled.sample.job.id)) + while (length(system(paste0("squeue -j ", pooled.sample.job.id, " -h"), intern = TRUE)) > 0) { + Sys.sleep(120) + } + print("Compile reads done!") +} + +# Executable ----------------------------------------------------------------------------------------------------------- +# Minimal columns for input mafs +# +# Hugo_Symbol,Chromosome,Start_Position,End_Position,Tumor_Sample_Barcode,Variant_Classification,HGVSp_Short,Reference_Allele,Tumor_Seq_Allele2,D_t_alt_count_fragment + +suppressPackageStartupMessages({ + library(data.table) + library(tidyr) + library(stringr) + library(dplyr) + library(argparse) +}) + +if (!interactive()) { + parser <- ArgumentParser() + parser$add_argument("-m", "--masterref", type = "character", help = "File path to master reference file") + parser$add_argument("-o", "--resultsdir", type = "character", help = "Output directory") + parser$add_argument( + "-pid", + "--projectid", + type = "character", + default = "", + help = "Project ID for submitted jobs involved in this run" + ) + parser$add_argument( + "-pb", + "--pooledbamdir", + type = "character", + default = "/data1/core006/access/production/resources/msk-access/v2.0/novaseq_curated_duplex_bams_dmp/current/", + help = "Directory for all pooled bams [default]" + ) + parser$add_argument( + "-fa", + "--fastapath", + type = "character", + default = "/data1/core006/access/production/resources/reference/versions/hg19_virus_special/hg19_virus.fasta", + help = "Reference fasta path [default]" + ) + parser$add_argument( + "-gt", + "--genotyperpath", + type = "character", + default = "/data1/core006/access/production/resources/tools/GetBaseCountsMultiSample/versions/GetBaseCountsMultiSample-1.2.5/GetBaseCountsMultiSample", + help = "Genotyper executable path [default]" + ) + parser$add_argument( + "-dmp", + "--dmpdir", + type = "character", + default = "/data1/core006/access/production/resources/cbioportal/versions/msk-impact/msk_solid_heme/", + help = "Directory of clinical DMP repository [default]" + ) + parser$add_argument( + "-mb", + "--mirrorbamdir", + type = "character", + default = "/data1/share001/share/impact_12_245/", + help = "Mirror BAM file directory [default]" + ) + parser$add_argument( + "-mab", + "--mirroraccessbamdir", + type = "character", + default = "/data1/share001/share/access_12_245/", + help = "Mirror BAM file directory for MSK-ACCESS [default]" + ) + parser$add_argument( + "-dmpk", + "--dmpkeypath", + type = "character", + default = "/data1/share001/share/impact_12_245.key.txt", + help = "DMP mirror BAM key file [default]" + ) + parser$add_argument( + "-dmpak", + "--dmpaccesskeypath", + type = "character", + default = "/data1/share001/share/access_12_245.key.txt", + help = "DMP mirror BAM key file for MSK-ACCESS [default]" + ) + args <- parser$parse_args() + + master.ref <- args$masterref + results.dir <- args$resultsdir + project.ID <- args$projectid + pooled.bam.dir <- args$pooledbamdir + fasta.path <- args$fastapath + genotyper.path <- args$genotyperpath + dmp.dir <- args$dmpdir + mirror.bam.dir <- args$mirrorbamdir + mirror.access.bam.dir <- args$mirroraccessbamdir + dmp.key.path <- args$dmpkeypath + access.key.path <- args$dmpaccesskeypath + + + if (project.ID == "") { + project.ID <- + paste0(sample(c(0:9), size = 10, replace = T), collapse = "") + } + + print(paste0("Input parameters for run ", project.ID)) + print(master.ref) + print(results.dir) + print(pooled.bam.dir) + print(fasta.path) + print(genotyper.path) + print(dmp.dir) + print(mirror.bam.dir) + print(mirror.access.bam.dir) + print(dmp.key.path) + print(access.key.path) + suppressWarnings( + compile_reads_all( + fread(master.ref), + results.dir, + project.ID, + pooled.bam.dir, + fasta.path, + genotyper.path, + dmp.dir, + mirror.bam.dir, + mirror.access.bam.dir, + dmp.key.path, + access.key.path + ) + ) + print("compile reads function finished") +} \ No newline at end of file diff --git a/R_Iris/filter_calls.R b/R_Iris/filter_calls.R new file mode 100644 index 0000000..0c62683 --- /dev/null +++ b/R_Iris/filter_calls.R @@ -0,0 +1,319 @@ +# library(data.table) +# library(tidyr) +# library(stringr) +# library(dplyr) + +#' @export +filter_calls = function( + master.ref,results.dir, + CH.path = '/data1/core006/access/production/resources/dmp_signedout_CH/versions/10Feb2020/signedout_CH.txt', + criteria = 'stringent' +){ + # # test input section ----------------------------------------------------------- + # master.ref = fread('/juno/work/bergerm1/bergerlab/zhengy1/access_data_analysis/data/example_master_file.csv') + # results.dir = paste0('/juno/work/bergerm1/MSK-ACCESS/ACCESS-Projects/test_access/access_data_analysis/output_042020/') + # dmp.key.path = '/ifs/dmprequest/12-245/key.txt' + # CH.calls = fread('/ifs/work/bergerm1/zhengy1/RET_all/Original_files/signedout_CH.txt') + # # criteria <- 'permissive' + # criteria <- 'stringent' + # + # criteria definition ----------------------------------------------------- + if(criteria == 'permissive'){ + hotspot.support <- 1 + non.hotspot.support <- 3 + }else{ + hotspot.support <- 3 + non.hotspot.support <- 5 + } + + dir.create(paste0(results.dir,'/results_',criteria)) + + # inputs --------------------------------------------------------------- + # DMP.key <- fread(dmp.key.path) + CH.calls = fread(CH.path) + pooled.normal.mafs <- + fread(paste0(results.dir,'/pooled/all_all_unique.maf')) %>% + mutate(Tumor_Sample_Barcode = paste0(Tumor_Sample_Barcode,'___pooled')) %>% + select(Hugo_Symbol,Tumor_Sample_Barcode,Chromosome,Start_Position,End_Position,Variant_Classification,HGVSp_Short,Reference_Allele,Tumor_Seq_Allele2,t_alt_count) %>% + group_by(Hugo_Symbol,Chromosome,Start_Position,End_Position,Variant_Classification,Reference_Allele,Tumor_Seq_Allele2) %>% + summarise(duplex_support_num = length(which(t_alt_count >= 2))) %>% + filter(duplex_support_num > 0,.preserve = T) %>% + transmute(Hugo_Symbol, Chromosome=as.character(Chromosome), Start_Position, End_Position, Variant_Classification, Reference_Allele, Tumor_Seq_Allele2, duplex_support_num) %>% + data.table() + + # for each patient produce the correct results ---------------------------- + # x <- unique(master.ref$cmo_patient_id)[1] + all.fillout.dim <- lapply(unique(master.ref$cmo_patient_id),function(x){ + print(paste0('Processing patient ',x)) + + # Inputs and sanity checks ------------------------------------------------ + fillouts.filenames <- list.files(paste0(results.dir,'/',x,'/'),'ORG-STD_genotyped.maf|ORG-SIMPLEX-DUPLEX_genotyped.maf',full.names = T) + + # compiling a sample sheet with duplex, simplex, normal, DMP tumor and DMP normal + sample.sheet <- fread(paste0(results.dir,'/',x,'/',x,'_sample_sheet.tsv'))[,.(Sample_Barcode,cmo_patient_id,Sample_Type)] + simplex.sample.sheet = sample.sheet[Sample_Type == 'duplex',.(Sample_Barcode,cmo_patient_id,Sample_Type = 'simplex')] + sample.sheet = rbind(sample.sheet,simplex.sample.sheet) %>% + mutate(column.names = paste0(Sample_Barcode,'___',Sample_Type)) %>% + data.table() + + # compiling different genotype files from step 1 + fillouts.dt <- do.call(rbind,lapply(fillouts.filenames,function(y){ + sample.name = gsub('.*./|-ORG.*.','',y) + sample.type = unique(sample.sheet[Sample_Barcode == sample.name]$Sample_Type) + + # t_alt_count,t_ref_count,t_depth these columns are useless, have to use duplex/simplex/standard columms + maf.file <- fread(y) %>% + select(-c(t_alt_count,t_ref_count)) %>% + data.table() + + if (nrow(maf.file) == 0) { + + columns <- c( + "Hugo_Symbol", "Tumor_Sample_Barcode", "Chromosome", "Start_Position", + "End_Position", "Variant_Classification", "HGVSp_Short", + "Reference_Allele", "Tumor_Seq_Allele2", "t_var_freq", "ExAC_AF") + df <- data.frame(matrix(ncol = length(columns), nrow = 0)) + colnames(df) <- columns + + return(df) + } + + # fragment counts replacing actual allele counts + if(grepl('SIMPLEX-DUPLEX_genotyped',y)){ + melt.id.vars = colnames(maf.file)[!grepl('fragment',colnames(maf.file))] + + # get rid of simplex duplex aggregate columns + maf.file %>% + select(-c(contains('simplex_duplex'))) %>% + # melting and dcasting columns back but separating duplex and simplex columns + # t_alt_duplex, t_depth_duplex, t_alt_simplex, t_depth_simplex --> t_alt, t_depth + melt.data.table(id.vars = melt.id.vars,variable.name = 'variable',value.name = 'value') %>% + mutate(variable = gsub('fragment','_',variable)) %>% + separate(variable,c('variable','Sample_Type'),sep = '___') %>% + mutate(Tumor_Sample_Barcode = paste0(sample.name,'___',Sample_Type)) %>% + select(-Sample_Type) %>% + data.table() %>% + unique() %>% + dcast.data.table(as.formula(paste0(paste0(melt.id.vars,collapse = ' + '),' ~ variable')),value.var = 'value') -> maf.file + }else{ + maf.file <- maf.file %>% + mutate(Tumor_Sample_Barcode = paste0(sample.name,'___',sample.type)) %>% + # swaping the t_alt_count(etc)_standard for t_alt_count(etc) + mutate(t_alt_count = t_alt_count_standard,t_total_count = t_total_count_standard) + } + + maf.file = maf.file %>% + mutate(t_var_freq = paste0(t_alt_count,'/',t_total_count,'(',round(t_alt_count/t_total_count,4),')')) %>% + transmute(Hugo_Symbol,Tumor_Sample_Barcode,Chromosome = as.character(Chromosome),Start_Position,End_Position,Variant_Classification, + HGVSp_Short=as.character(HGVSp_Short),Reference_Allele,Tumor_Seq_Allele2,t_var_freq,ExAC_AF) %>% + data.table() + + return(maf.file) + })) %>% + unique() %>% + data.table() + + # merging and melting ----------------------------------------------------- + hotspot.maf <- fread(paste0(results.dir,'/',x,'/',x,'_all_unique_calls_hotspots.maf')) %>% + rowwise() %>% + transmute(Hugo_Symbol,Chromosome = as.character(Chromosome),Start_Position,End_Position,Variant_Classification, + # HGVSp_Short, + Reference_Allele,Tumor_Seq_Allele2,Hotspot = ifelse(hotspot_whitelist,'Hotspot',NA)) %>% + data.table() + + dmp.maf <- fread(paste0(results.dir,'/',x,'/',x,'_impact_calls.maf')) %>% + transmute(Hugo_Symbol,Chromosome = as.character(Chromosome),Start_Position,End_Position,Variant_Classification, + # HGVSp_Short, + Reference_Allele,Tumor_Seq_Allele2) %>% + mutate(DMP = 'Signed out') %>% + unique() %>% + data.table() + + if((nrow(dmp.maf) > 0) && (nrow(fillouts.dt) > 0)){ + fillouts.dt <- fillouts.dt %>% + dcast.data.table(Hugo_Symbol + Chromosome + Start_Position + End_Position + Variant_Classification + + HGVSp_Short + Reference_Allele + Tumor_Seq_Allele2 + ExAC_AF ~ Tumor_Sample_Barcode, + value.var = 't_var_freq') %>% + # hotspot information + merge( + hotspot.maf, + by = c('Hugo_Symbol','Chromosome','Start_Position','End_Position','Variant_Classification','Reference_Allele','Tumor_Seq_Allele2'), + all.x = T) %>% + # Identifying signed out calls + merge( + dmp.maf, + by = c('Hugo_Symbol','Chromosome','Start_Position','End_Position','Variant_Classification','Reference_Allele','Tumor_Seq_Allele2'), + all.x = T) %>% + # pooled normal for systemic artifacts + merge( + pooled.normal.mafs, + by = c('Hugo_Symbol','Chromosome','Start_Position','End_Position','Variant_Classification','Reference_Allele','Tumor_Seq_Allele2'), + all.x = T) %>% + data.table() + } else if (nrow(fillouts.dt) > 0){ + fillouts.dt <- fillouts.dt %>% + dcast.data.table( + Hugo_Symbol + Chromosome + Start_Position + End_Position + Variant_Classification + + HGVSp_Short + Reference_Allele + Tumor_Seq_Allele2 + ExAC_AF ~ Tumor_Sample_Barcode, + value.var = 't_var_freq') %>% + # hotspot information + merge( + hotspot.maf, + by = c('Hugo_Symbol','Chromosome','Start_Position','End_Position','Variant_Classification','Reference_Allele','Tumor_Seq_Allele2'), + all.x = T) %>% + # pooled normal for systemic artifacts + merge( + pooled.normal.mafs, + by = c('Hugo_Symbol','Chromosome','Start_Position','End_Position','Variant_Classification','Reference_Allele','Tumor_Seq_Allele2'), + all.x = T) %>% + mutate(DMP = NA) %>% + data.table() + } else { + + print(paste0("Found no tumor or DMP mutations for ", x, ". Writing an empty data.frame to CSV.")) + + # if fillouts.dt has no data, then add the needed columns with no data + fillouts.dt[,c("DMP", "Hotspot", "duplex_support_num", "call_confidence", "CH") := NA] + + fillouts.dt <- fillouts.dt %>% select( + Hugo_Symbol,Chromosome,Start_Position,End_Position, + Variant_Classification,HGVSp_Short,Reference_Allele,Tumor_Seq_Allele2, + ExAC_AF,Hotspot,DMP,CH,duplex_support_num,call_confidence,sort(everything())) + + write.csv( + fillouts.dt, + paste0(results.dir,'/results_',criteria,'/',x,'_SNV_table.csv'), + row.names = F) + + return() + } + + # Interesting cases where DMP signed out calls are artifacets + if(any(!is.na(fillouts.dt$DMP) & !is.na(fillouts.dt$duplex_support_num))){ + print(paste0('Look at ',x,' for DMP signed out plasma artifacts...')) + } + + # germline filtering for matched and unmatched ---------------------------- + plasma.samples <- sample.sheet[Sample_Type %in% c('duplex')]$column.names + normal.samples <- sample.sheet[Sample_Type %in% c('unfilterednormal','normal_DMP')]$column.names + fillouts.dt[,c( + paste0(plasma.samples,'.called') + # paste0(gsub('duplex','simplex',plasma.samples),'.called') + + ) := 'Not Called'] + + # preliminary calling + # tmp.col.name <- plasma.samples[1] + lapply(plasma.samples,function(tmp.col.name){ + # genotyping (signed out stuff) + fillouts.dt[(as.numeric(gsub("/.*.$",'',get(tmp.col.name))) >= 1 | as.numeric(gsub("/.*.$",'',get(paste0(gsub('duplex','simplex',tmp.col.name))))) > 1) & DMP == 'Signed out', + eval(paste0(tmp.col.name,'.called')) := 'Genotyped'] + # c(eval(paste0(tmp.col.name,'.called')),eval(paste0(gsub('duplex','simplex',tmp.col.name),'.called'))) := list('Called','Called')] + # hotspot reads + fillouts.dt[as.numeric(gsub("/.*.$",'',get(tmp.col.name))) >= hotspot.support & Hotspot == 'Hotspot', + eval(paste0(tmp.col.name,'.called')) := 'Called'] + # c(eval(paste0(tmp.col.name,'.called')),eval(paste0(gsub('duplex','simplex',tmp.col.name),'.called'))) := list('Called','Called')] + # non hotspot reads + fillouts.dt[as.numeric(gsub("/.*.$",'',get(tmp.col.name))) >= non.hotspot.support & is.na(Hotspot), + eval(paste0(tmp.col.name,'.called')) := 'Called'] + # c(eval(paste0(tmp.col.name,'.called')),eval(paste0(gsub('duplex','simplex',tmp.col.name),'.called'))) := list('Called','Called')] + # print(table(fillouts.dt[,get(paste0(tmp.col.name,'.called'))])) + }) + + if(all(!c('unfilterednormal','normal_DMP') %in% sample.sheet$Sample_Type)){ + tmp.col.name <- plasma.samples[1] + lapply(plasma.samples,function(tmp.col.name){ + #fillouts.dt[as.numeric(gsub("\\(|\\)",'',str_extract(get(tmp.col.name),"\\(.*.\\)"))) >= 0.3 | ExAC_AF >= 0.0001,eval(paste0(tmp.col.name,'.called')) := 'Not Called'] + fillouts.dt[get(tmp.col.name) == '0/0(NaN)',eval(paste0(tmp.col.name,'.called')) := 'Not Covered'] + }) + }else{ + lapply(plasma.samples,function(tmp.col.name){ + lapply(normal.samples,function(tmp.col.name.normal){ + # duplex tvar/nvar > 5 + fillouts.dt[(as.numeric(gsub("\\(|\\)",'',str_extract(get(tmp.col.name),"\\(.*.\\)")))/as.numeric(gsub("\\(|\\)",'',str_extract(get(tmp.col.name.normal),"\\(.*.\\)"))) < 2) | + # if duplex have no reads, use simplex tvar + (as.numeric(gsub("\\(|\\)",'',str_extract(get(gsub('duplex','simplex',tmp.col.name)),"\\(.*.\\)")))/as.numeric(gsub("\\(|\\)",'',str_extract(get(tmp.col.name.normal),"\\(.*.\\)"))) < 2 & + as.numeric(gsub("/.*.$",'',get(tmp.col.name))) == 0), + eval(paste0(tmp.col.name,'.called')) := 'Not Called'] + fillouts.dt[get(tmp.col.name) == '0/0(NaN)',eval(paste0(tmp.col.name,'.called')) := 'Not Covered'] + }) + }) + } + + # final processing -------------------------------------------------------- + # Save only the useful column + #print(fillouts.dt) + #print("#######") + fillouts.dt <- fillouts.dt[DMP == 'Signed out' | fillouts.dt[,apply(.SD,1,function(x){any(x == 'Called')})]] + #print(fillouts.dt) + # combining duplex and simplex counts + lapply(plasma.samples,function(tmp.col.name){ + # hotspot reads + fillouts.dt[,eval(gsub('duplex','total',tmp.col.name)) := paste0( + as.numeric(gsub("/.*.$",'',get(tmp.col.name)))+as.numeric(gsub("/.*.$",'',get(gsub('duplex','simplex',tmp.col.name)))),'/', + as.numeric(gsub("^.*./|\\(.*.$",'',get(tmp.col.name)))+as.numeric(gsub("^.*./|\\(.*.$",'',get(gsub('duplex','simplex',tmp.col.name)))),'(', + round((as.numeric(gsub("/.*.$",'',get(tmp.col.name)))+as.numeric(gsub("/.*.$",'',get(gsub('duplex','simplex',tmp.col.name)))))/ + (as.numeric(gsub("^.*./|\\(.*.$",'',get(tmp.col.name)))+as.numeric(gsub("^.*./|\\(.*.$",'',get(gsub('duplex','simplex',tmp.col.name))))),4),')' + )] + fillouts.dt[,c(eval(gsub('duplex','simplex',tmp.col.name)),eval(tmp.col.name)):= list(NULL,NULL)] + }) + + fillouts.dt <- fillouts.dt[,order(colnames(fillouts.dt)),with = F] %>% + # filter for artifacts + mutate(call_confidence = case_when( + (Hugo_Symbol == 'TERT' & is.na(Hotspot)) | (Hugo_Symbol == 'ERBB2' & grepl('[A-Z]90[0-9][A-Z]',HGVSp_Short)) | + (Hugo_Symbol == 'BRAF' & grepl('711',HGVSp_Short)) | (Hugo_Symbol == 'NF1' & grepl('[A-Z]106[0-9][A-Z]',HGVSp_Short)) ~ 'Low', + DMP == 'Signed out' ~ 'High', + TRUE ~ '' + )) %>% + merge(CH.calls[,.(Hugo_Symbol = Gene,Chromosome = Chrom,Start_Position = Start,Reference_Allele = Ref,Tumor_Seq_Allele2 = Alt,HGVSp_Short = AAchange,Variant_Classification = VariantClass,CH = 'Yes')], + by = c('Hugo_Symbol','Chromosome','Start_Position','Variant_Classification','HGVSp_Short','Reference_Allele','Tumor_Seq_Allele2'), + all.x = T) %>% + mutate(CH = ifelse(is.na(CH),'No','Yes')) %>% + select(Hugo_Symbol,Chromosome,Start_Position,End_Position,Variant_Classification,HGVSp_Short,Reference_Allele,Tumor_Seq_Allele2, + ExAC_AF,Hotspot,DMP,CH,duplex_support_num,call_confidence,sort(everything())) + + write.csv(fillouts.dt,paste0(results.dir,'/results_',criteria,'/',x,'_SNV_table.csv'),row.names = F) + }) + + if(all(unlist(all.fillout.dim))){ + print('All dimension of fillout mafs for each patient looks correct') + } + +} + +# Executable ----------------------------------------------------------------------------------------------------------- +suppressPackageStartupMessages({ + library(data.table) + library(tidyr) + library(stringr) + library(dplyr) + library(argparse) +}) + +if (!interactive()) { + + parser=ArgumentParser() + parser$add_argument('-m', '--masterref', type='character', help='File path to master reference file') + parser$add_argument('-o', '--resultsdir', type='character', help='Output directory') + parser$add_argument('-ch', '--chlist', type='character', default = '/data1/core006/access/production/resources/dmp_signedout_CH/versions/10Feb2020/signedout_CH.txt', + help='List of signed out CH calls [default]') + parser$add_argument('-c', '--criteria', type='character', default = 'stringent', + help='Calling criteria [default]') + args=parser$parse_args() + + master.ref = args$masterref + results.dir = args$resultsdir + chlist = args$chlist + criteria = args$criteria + + cat(paste0(paste0(c(paste0(rep('-',15),collapse = ''),'Arguments input: ',master.ref,results.dir,chlist,criteria, + paste0(rep('-',15),collapse = '')),collapse = "\n"),'\n')) + + if(!criteria %in% c('stringent','permissive')){ + stop('Criteria argument should be either stringent or permissive') + } + + suppressWarnings(filter_calls(fread(master.ref),results.dir,chlist,criteria)) + +}