diff --git a/R_Iris/CNA_processing.R b/R_Iris/CNA_processing.R
new file mode 100644
index 0000000..c57458b
--- /dev/null
+++ b/R_Iris/CNA_processing.R
@@ -0,0 +1,127 @@
+# library(data.table)
+# library(stringr)
+# library(tidyr)
+# library(dplyr)
+
+#' @export
+CNA_processing = function(
+  master.ref, results.dir, dmp.dir = '/data1/core006/access/production/resources/cbioportal/versions/msk-impact/msk_solid_heme/',
+  gene_set = "v1"
+) {
+  # Define gene sets
+  access.cna.genes = c('AR','BRCA1','BRCA2','CDK4','EGFR','ERBB2','MET','MDM2','MLH1','MSH2','MSH6','MYC')
+  access.v2.cna.genes = c(
+    "ALK", "APC", "AR", "ARID1A", "ASXL1", "ATM", "BAP1", "BRCA1", "BRCA2", "CDH1", "CDK12", "CDK4", "CHEK2",
+    "DNMT3A", "EGFR", "ERBB2", "ERBB3", "ERCC2", "FBXW7", "FGFR2", "FGFR3", "KDM6A", "KIT", "MAP2K1", "MAP2K2",
+    "MDM2", "MET", "MLH1", "MSH2", "MSH6", "MTOR", "MYC", "MYCN", "NF1", "PALB2", "PDGFRA", "PIK3CA", "PMS2",
+    "PTCH1", "RB1", "RET", "SMAD4", "TP53", "TSC1", "TSC2"
+  )
+
+  # Select the appropriate gene set
+  selected_genes = if (gene_set == "v2") access.v2.cna.genes else access.cna.genes
+
+  # DMP CNA calls --------------------------------------------------------
+  DMP.cna <- fread(paste0(dmp.dir, '/data_CNA.txt'))
+
+  dmp.ids <- master.ref$dmp_patient_id[(master.ref$dmp_patient_id != "") & !is.na(master.ref$dmp_patient_id)]
+  dmp.pattern <- paste0(dmp.ids, collapse = '|')
+
+  DMP.cna.edited = DMP.cna[
+    !Hugo_Symbol %in% c('CDKN2Ap14ARF', 'CDKN2Ap16INK4A'),
+    c('Hugo_Symbol', grep(dmp.pattern, colnames(DMP.cna), value = T)),
+    with = F
+  ] %>%
+    melt.data.table(
+      id.vars = 'Hugo_Symbol',
+      variable.name = 'Tumor_Sample_Barcode',
+      value.name = 'fc'
+    ) %>%
+    filter(fc != 0) %>%
+    mutate(
+      CNA_tumor = case_when(fc < 0 ~ 'HOMDEL', fc > 0 ~ 'AMP'),
+      dmp_patient_id = gsub('-T..-IM.', '', Tumor_Sample_Barcode)
+    ) %>%
+    merge(
+      unique(master.ref[!is.na(dmp_patient_id), .(cmo_patient_id, dmp_patient_id)]),
+      by = 'dmp_patient_id',
+      all.x = T
+    ) %>%
+    transmute(Hugo_Symbol, CNA_tumor, cmo_patient_id, dmp_patient_id) %>%
+    unique() %>%
+    data.table()
+
+  # Access CNA calls --------------------------------------------------------
+  access.cna = do.call(rbind, lapply(master.ref$cna_path, function(x) {
+    fread(x) %>%
+      transmute(
+        Tumor_Sample_Barcode = gsub('\\.', '-', gsub('_mean_cvg', '', sample)),
+        Hugo_Symbol = region,
+        p.adj,
+        fc
+      ) %>%
+      filter(p.adj <= 0.05) %>%
+      merge(
+        unique(master.ref[, .(cmo_patient_id, cmo_sample_id_plasma)]),
+        by.x = 'Tumor_Sample_Barcode',
+        by.y = 'cmo_sample_id_plasma',
+        all.x = T
+      ) %>%
+      data.table()
+  }))
+
+  merge(
+    access.cna,
+    DMP.cna.edited,
+    by = c('Hugo_Symbol', 'cmo_patient_id'),
+    all.x = T
+  ) %>%
+    mutate(
+      CNA = case_when(
+        (fc >= 1.5 & Hugo_Symbol %in% selected_genes) ~ 'AMP',
+        (fc <= -1.5 & Hugo_Symbol %in% selected_genes) ~ 'HOMDEL',
+        (fc >= 1.2 & !is.na(CNA_tumor)) ~ 'AMP',
+        (fc <= -1.2 & !is.na(CNA_tumor)) ~ 'HOMDEL',
+        TRUE ~ ''
+      )
+    ) %>%
+    filter(CNA != '') %>%
+    data.table() -> access.call.set
+
+  # Compare tumor and plasma CNAs
+  condition <- any(access.call.set$CNA_tumor != access.call.set$CNA)
+  if (condition || is.na(condition)) warning('Tumor and plasma have conflicting CNA detected!')
+
+  # Save results
+  dir.create(paste0(results.dir, '/CNA_final_call_set'), showWarnings = FALSE)
+  lapply(unique(master.ref$cmo_sample_id_plasma), function(x) {
+    write.table(access.call.set[Tumor_Sample_Barcode == x],
+                paste0(results.dir, '/CNA_final_call_set/', x, '_cna_final_call_set.txt'),
+                sep = '\t', quote = F, row.names = F)
+  })
+}
+# Executable -----------------------------------------------------------------------------------------------------------
+suppressPackageStartupMessages({
+  library(argparse)
+  library(data.table)
+  library(stringr)
+  library(dplyr)
+})
+
+if (!interactive()) {
+
+  parser = ArgumentParser()
+  parser$add_argument('-m', '--masterref', type = 'character', help = 'File path to master reference file')
+  parser$add_argument('-o', '--resultsdir', type = 'character', help = 'Output directory')
+  parser$add_argument('-dmp', '--dmpdir', type = 'character', default = '/data1/core006/access/production/resources/cbioportal/versions/msk-impact/msk_solid_heme/',
+                      help = 'Directory of clinical DMP IMPACT repository [default]')
+  parser$add_argument('-g', '--gene-set', type = 'character', default = 'v1',
+                      help = 'Gene set to use: "v1" for access.cna.genes or "v2" for access.v2.cna.genes [default: v1]')
+  args = parser$parse_args()
+
+  master.ref = args$masterref
+  results.dir = args$resultsdir
+  dmp.dir = args$dmpdir
+  gene_set = args$gene_set
+
+  CNA_processing(fread(master.ref), results.dir, dmp.dir, gene_set)
+}
diff --git a/R_Iris/compile_reads_all.R b/R_Iris/compile_reads_all.R
new file mode 100644
index 0000000..4a8dab6
--- /dev/null
+++ b/R_Iris/compile_reads_all.R
@@ -0,0 +1,766 @@
+#library(data.table)
+#library(tidyr)
+#library(stringr)
+#library(dplyr)
+
+
+#' @export
+compile_reads_all <- function(master.ref,
+                          results.dir,
+                          project.ID,
+                          pooled.bam.dir = "/data1/core006/access/production/resources/msk-access/v2.0/novaseq_curated_duplex_bams_dmp/current/",
+                          fasta.path = "/data1/core006/access/production/resources/reference/versions/hg19_virus_special/hg19_virus.fasta",
+                          genotyper.path = "/data1/core006/access/production/resources/tools/GetBaseCountsMultiSample/current/GetBaseCountsMultiSample",
+                          dmp.dir = "/data1/core006/access/production/resources/cbioportal/versions/msk-impact/msk_solid_heme/",
+                          mirror.bam.dir = "/juno/dmp/share/irb12_245",
+                          mirror.access.bam.dir = "/juno/dmp/share/access_12_245/",
+                          dmp.key.path = "/juno/dmp/request/12-245/key.txt",
+                          access.key.path = "/juno/dmp/request/ACCESS-12-245/key.txtt") {
+  # # test input section -----------------------------------------------------------
+  # master.ref = fread('/juno/data1/core006/bergerm1/bergerlab/zhengy1/access_data_analysis/data/example_master_file.csv')
+  # results.dir = paste0('/juno/data1/core006/bergerm1/MSK-ACCESS/ACCESS-Projects/test_access/access_data_analysis/output_',format(Sys.time(),'%m%d%y'))
+  # pooled.bam.dir = '/ifs/data1/core006/bergerm1/ACCESS-Projects/novaseq_curated_duplex_v2/'
+  # fasta.path = '/data1/core006/access/production/resources/reference/current/Homo_sapiens_assembly19.fasta'
+  # genotyper.path = '/ifs/data1/core006/bergerm1/Innovation/software/maysun/GetBaseCountsMultiSample/GetBaseCountsMultiSample'
+  # dmp.dir = '/ifs/data1/core006/bergerm1/zhengy1/dmp/mskimpact/'
+  # mirror.bam.dir = '/ifs/dmpshare/share/irb12_245/'
+  # dmp.key.path = '/ifs/dmprequest/12-245/key.txt'
+  # setting up directory ----------------------------------------------------
+  dir.create(results.dir)
+  # make tmp directory in output directory
+  dir.create(paste0(results.dir, "/tmp"))
+  # checking virtualenv -----------------------------------------------------
+  geno.bash <- system("which genotype_variants", intern = T)
+  if (length(geno.bash) == 0) {
+    # print(pyclone.path)
+    stop(
+      "needs to run \nsource /home/accessbot/miniconda3/bin/activate && conda activate genotype-variants-0.3.0"
+    )
+  }
+
+  # data from DMP -----------------------------------------------------------
+  DMP.key <- as.data.table(read.csv(dmp.key.path, header = FALSE, sep = ","))
+  if (any(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-IH.|-T..-IM.", "", DMP.key[grepl("IH|IM", V1)]$V1))) {
+    message(paste0(
+      "These DMP IDs are not found in DMP key file: ",
+      paste0(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id[which(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in%
+                                                                             gsub("-T..-IH.|-T..-IM.", "", DMP.key[grepl("IH|IM", V1)]$V1))], collapse = " ,")
+    ))
+  }
+  # data from DMP ACCESS ----------------------------------------------------
+  access.key <-
+    as.data.table(read.csv(access.key.path, header = FALSE, sep = ","))
+  if (any(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in% gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1))) {
+    message(paste0(
+      "These DMP IDs are not found in DMP ACCESS key file: ",
+      paste0(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id[which(!master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id %in%
+                                                                             gsub("-T..-XS.", "", access.key[grepl("XS", V1)]$V1))], collapse = " ,")
+    ))
+  }
+
+  DMP.maf <-
+    fread(paste0(dmp.dir, "/data_mutations_extended.txt")) %>%
+    filter(Mutation_Status != "GERMLINE") %>%
+    data.table()
+  DMP.RET.maf <-
+    DMP.maf[grepl(paste0(unique(master.ref[grepl("^P-", dmp_patient_id)]$dmp_patient_id), collapse = "|"), Tumor_Sample_Barcode), ]
+
+  # Pooled normal samples ---------------------------------------------------
+  pooled.bams <-
+    list.files(pooled.bam.dir, pattern = "\\.bam$", full.names = T)
+
+  # For each patient --------------------------------------------------------
+  x <- unique(master.ref$cmo_patient_id)[1]
+  # x = unique(master.ref$cmo_sample_id_plasma)[16]
+  # x = 'C-YW82CY'
+  print("Compiling reads per patient")
+  all.fillout.id <-
+    lapply(unique(master.ref$cmo_patient_id), function(x) {
+      print(x)
+      dir.create(paste0(results.dir, "/", x))
+      dmp_id <-
+        unique(master.ref[cmo_patient_id == x]$dmp_patient_id)
+      # sample sheet with colummns -- TSB, sample type, bam path, treatm --------
+      # need to get DMP tumor, DMP normal, plasma, plasma normal (if there is any), pooled normal
+      # DMP sample sheet
+      if (is.na(dmp_id) | dmp_id == '') {
+        dmp.sample.sheet <- NULL
+      } else {
+        all.dmp.ids.IM <-
+          DMP.key[grepl(paste0(dmp_id, "-(T|N)..-IM."), V1)]$V1
+        all.dmp.ids.IH <-
+          DMP.key[grepl(paste0(dmp_id, "-(T|N)..-IH."), V1)]$V1
+        all.dmp.ids.XS <-
+          access.key[grepl(paste0(dmp_id, "-T..-XS."), V1)]$V1
+        all.dmp.ids.normal.XS <-
+          access.key[grepl(paste0(dmp_id, "-N..-XS."), V1)]$V1
+        all.dmp.ids <- c(all.dmp.ids.IM, all.dmp.ids.IH)
+        all.dmp.bam.ids.IM <-
+          DMP.key[grepl(paste0(dmp_id, "-(T|N)..-IM."), V1)]$V2
+        all.dmp.bam.ids.IH <-
+          DMP.key[grepl(paste0(dmp_id, "-(T|N)..-IH."), V1)]$V2
+        all.dmp.bam.ids.XS <-
+          gsub("-standard|-unfilter|-simplex|-duplex",
+               "",
+               access.key[grepl(paste0(dmp_id, "-T..-XS."), V1)]$V2)
+        all.dmp.bam.ids.normal.XS <-
+          gsub("-standard|-unfilter|-simplex|-duplex",
+               "",
+               access.key[grepl(paste0(dmp_id, "-N..-XS."), V1)]$V2)
+        all.dmp.bam.ids <-
+          c(all.dmp.bam.ids.IM,
+            all.dmp.bam.ids.IH)
+        if (length(all.dmp.ids) == 0) {
+          dmp.sample.sheet <- NULL
+        } else{
+          bam.sub.dir <-
+            unlist(lapply(strsplit(substr(
+              all.dmp.bam.ids, 1, 2
+            ), ""), function(x) {
+              paste0(x, collapse = "/")
+            }))
+          dmp.sample.sheet <- data.frame(
+            Sample_Barcode = all.dmp.ids,
+            standard_bam = paste0(
+              mirror.bam.dir,
+              "/",
+              bam.sub.dir,
+              "/",
+              all.dmp.bam.ids,
+              ".bam"
+            ),
+            duplex_bam = NA_character_,
+            simplex_bam = NA_character_
+          ) %>%
+            mutate(
+              cmo_patient_id = x,
+              Sample_Type = ifelse(
+                grepl("-T", Sample_Barcode),
+                "DMP_Tumor",
+                "DMP_Normal"
+              ),
+              dmp_patient_id = dmp_id
+            )
+        }
+        if (length(all.dmp.ids.XS) == 0) {
+          access.sample.sheet <- NULL
+        } else{
+          access.bam.sub.dir <-
+            unlist(lapply(strsplit(
+              substr(all.dmp.bam.ids.XS, 1, 2), ""
+            ), function(x) {
+              paste0(x, collapse = "/")
+            }))
+          access.sample.sheet <- unique(
+            data.frame(
+              Sample_Barcode = all.dmp.ids.XS,
+              standard_bam = NA_character_,
+              duplex_bam = paste0(
+                mirror.access.bam.dir,
+                "/",
+                access.bam.sub.dir,
+                "/",
+                all.dmp.bam.ids.XS,
+                "-duplex.bam"
+              ),
+              simplex_bam = paste0(
+                mirror.access.bam.dir,
+                "/",
+                access.bam.sub.dir,
+                "/",
+                all.dmp.bam.ids.XS,
+                "-simplex.bam"
+              )
+            ) %>%
+              mutate(
+                cmo_patient_id = x,
+                Sample_Type = ifelse(
+                  grepl("-T", Sample_Barcode),
+                  "duplex",
+                  "unfilterednormal"
+                ),
+                dmp_patient_id = dmp_id
+              )
+          )
+          access.normal.bam.sub.dir <-
+            unlist(lapply(strsplit(
+              substr(all.dmp.bam.ids.normal.XS, 1, 2), ""
+            ), function(x) {
+              paste0(x, collapse = "/")
+            }))
+          access.normal.sample.sheet <- unique(
+            data.frame(
+              Sample_Barcode = all.dmp.ids.normal.XS,
+              standard_bam = paste0(
+                mirror.access.bam.dir,
+                "/",
+                access.normal.bam.sub.dir,
+                "/",
+                all.dmp.bam.ids.normal.XS,
+                "-unfilter.bam"
+              ),
+              duplex_bam = NA_character_,
+              simplex_bam = NA_character_
+            ) %>%
+              mutate(
+                cmo_patient_id = x,
+                Sample_Type = ifelse(
+                  grepl("-N", Sample_Barcode),
+                  "unfilterednormal",
+                  "duplex"
+                ),
+                dmp_patient_id = dmp_id
+              )
+          )
+          access.sample.sheet = bind_rows(access.sample.sheet, access.normal.sample.sheet)
+        }
+        if (!is.null(dmp.sample.sheet) &
+            !is.null(access.sample.sheet)) {
+          print("DMP IMPACT and DMP ACCESS samples are available")
+          dmp.sample.sheet <-
+            bind_rows(dmp.sample.sheet, access.sample.sheet)
+
+        } else if (is.null(dmp.sample.sheet) &
+                   !is.null(access.sample.sheet)) {
+          print("DMP IMPACT samples are NOT available and DMP ACCESS samples are available")
+          dmp.sample.sheet <- access.sample.sheet
+        } else if (!is.null(dmp.sample.sheet) &
+                   is.null(access.sample.sheet)) {
+          print("DMP IMPACT samples are available and DMP ACCESS samples are NOT available")
+          dmp.sample.sheet <- dmp.sample.sheet
+        } else{
+          print("No DMP IMPACT samples or DMP ACCESS samples are available")
+          dmp.sample.sheet <- NULL
+        }
+      }
+      # total sample sheet
+      sample.sheet <- master.ref[cmo_patient_id == x,
+                                 # plasma bams -- duplex and simplex bam
+                                 .(
+                                   Sample_Barcode = as.character(cmo_sample_id_plasma),
+                                   standard_bam = NA_character_,
+                                   duplex_bam = bam_path_plasma_duplex,
+                                   simplex_bam = bam_path_plasma_simplex,
+                                   cmo_patient_id,
+                                   Sample_Type = "duplex",
+                                   dmp_patient_id
+                                 )] %>%
+        merge(rbind(unique(master.ref[cmo_patient_id == x &
+                                        paired == 'Paired',
+                                      # buffy coat + DMP bams -- standard bam only
+                                      .(
+                                        Sample_Barcode = as.character(cmo_sample_id_normal),
+                                        standard_bam = as.character(bam_path_normal),
+                                        duplex_bam = NA_character_,
+                                        simplex_bam = NA_character_,
+                                        cmo_patient_id,
+                                        Sample_Type = "unfilterednormal",
+                                        dmp_patient_id
+                                      )]),
+                    dmp.sample.sheet), all = T)
+      # catch '' or NA for empty cells for some cmo_sample_id_normal
+      sample.sheet <-
+        sample.sheet[!is.na(Sample_Barcode) |
+                       Sample_Barcode != ""]
+      # Check for existence of bam files and remove rows if files are missing
+      rows_to_remove <- c()
+      for (i in 1:nrow(sample.sheet)) {
+        bam_files <- c(sample.sheet$standard_bam[i], sample.sheet$duplex_bam[i], sample.sheet$simplex_bam[i])
+        bam_files <- bam_files[!is.na(bam_files) & bam_files != ""]  # remove NAs and empty strings
+
+        for (bam_file in bam_files) {
+          if (!file.exists(bam_file)) {
+            message(paste0("Warning: BAM file does not exist: ", bam_file, ". Removing sample ", sample.sheet$Sample_Barcode[i], " from sample sheet."))
+            rows_to_remove <- c(rows_to_remove, i)
+            break # No need to check other bams for this row
+          }
+        }
+      }
+
+      if (length(rows_to_remove) > 0) {
+        sample.sheet <- sample.sheet[-unique(rows_to_remove),]
+      }
+      write.table(
+        sample.sheet,
+        paste0(results.dir, "/", x, "/", x, "_sample_sheet.tsv"),
+        sep = "\t",
+        quote = F,
+        row.names = F
+      )
+      # piece together all unique calls -----------------------------------------
+      # get duplex calls
+      duplex.calls <-
+        do.call(rbind, lapply(master.ref[cmo_patient_id == x]$maf_path, function(x) {
+          # fread(x) %>% filter(as.numeric(D_t_alt_count_fragment) > 0) %>% data.table()
+          selectcolumns <-
+            c(
+              "Hugo_Symbol",
+              "Entrez_Gene_Id",
+              "Center",
+              "NCBI_Build",
+              "Chromosome",
+              "Start_Position",
+              "End_Position",
+              "Strand",
+              "Variant_Classification",
+              "Variant_Type",
+              "Reference_Allele",
+              "Tumor_Seq_Allele1",
+              "Tumor_Seq_Allele2",
+              "dbSNP_RS",
+              "dbSNP_Val_Status",
+              "Tumor_Sample_Barcode",
+              "caller_Norm_Sample_Barcode",
+              "Match_Norm_Seq_Allele1",
+              "Match_Norm_Seq_Allele2",
+              "Tumor_Validation_Allele1",
+              "Tumor_Validation_Allele2",
+              "Match_Norm_Validation_Allele1",
+              "Match_Norm_Validation_Allele2",
+              "Verification_Status",
+              "Validation_Status",
+              "Mutation_Status",
+              "Sequencing_Phase",
+              "Sequence_Source",
+              "Validation_Method",
+              "Score",
+              "BAM_File",
+              "Sequencer",
+              "Tumor_Sample_UUID",
+              "Matched_Norm_Sample_UUID",
+              "HGVSc",
+              "HGVSp",
+              "HGVSp_Short",
+              "Transcript_ID",
+              "Exon_Number",
+              "caller_t_depth",
+              "caller_t_ref_count",
+              "caller_t_alt_count",
+              "caller_n_depth",
+              "caller_n_ref_count",
+              "caller_n_alt_count",
+              "all_effects",
+              "Allele",
+              "Gene",
+              "Feature",
+              "Feature_type",
+              "Consequence",
+              "cDNA_position",
+              "CDS_position",
+              "Protein_position",
+              "Amino_acids",
+              "Codons",
+              "Existing_variation",
+              "ALLELE_NUM",
+              "DISTANCE",
+              "STRAND_VEP",
+              "SYMBOL",
+              "SYMBOL_SOURCE",
+              "HGNC_ID",
+              "BIOTYPE",
+              "CANONICAL",
+              "CCDS",
+              "ENSP",
+              "SWISSPROT",
+              "TREMBL",
+              "UNIPARC",
+              "RefSeq",
+              "SIFT",
+              "PolyPhen",
+              "EXON",
+              "INTRON",
+              "DOMAINS",
+              "AF",
+              "AFR_AF",
+              "AMR_AF",
+              "ASN_AF",
+              "EAS_AF",
+              "EUR_AF",
+              "SAS_AF",
+              "AA_AF",
+              "EA_AF",
+              "CLIN_SIG",
+              "SOMATIC",
+              "PUBMED",
+              "MOTIF_NAME",
+              "MOTIF_POS",
+              "HIGH_INF_POS",
+              "MOTIF_SCORE_CHANGE",
+              "IMPACT",
+              "PICK",
+              "VARIANT_CLASS",
+              "TSL",
+              "HGVS_OFFSET",
+              "PHENO",
+              "MINIMISED",
+              "ExAC_AF",
+              "ExAC_AF_AFR",
+              "ExAC_AF_AMR",
+              "ExAC_AF_EAS",
+              "ExAC_AF_FIN",
+              "ExAC_AF_NFE",
+              "ExAC_AF_OTH",
+              "ExAC_AF_SAS",
+              "GENE_PHENO",
+              "FILTER",
+              "flanking_bps",
+              "variant_id",
+              "variant_qual",
+              "ExAC_AF_Adj",
+              "ExAC_AC_AN_Adj",
+              "ExAC_AC_AN",
+              "ExAC_AC_AN_AFR",
+              "ExAC_AC_AN_AMR",
+              "ExAC_AC_AN_EAS",
+              "ExAC_AC_AN_FIN",
+              "ExAC_AC_AN_NFE",
+              "ExAC_AC_AN_OTH",
+              "ExAC_AC_AN_SAS",
+              "ExAC_FILTER",
+              "gnomAD_AF",
+              "gnomAD_AFR_AF",
+              "gnomAD_AMR_AF",
+              "gnomAD_ASJ_AF",
+              "gnomAD_EAS_AF",
+              "gnomAD_FIN_AF",
+              "gnomAD_NFE_AF",
+              "gnomAD_OTH_AF",
+              "gnomAD_SAS_AF",
+              "CallMethod",
+              "VCF_POS",
+              "VCF_REF",
+              "VCF_ALT",
+              "hotspot_whitelist",
+              "Status",
+              "D_t_alt_count_fragment",
+              "D_t_ref_count_fragment",
+              "D_t_vaf_fragment",
+              "SD_t_alt_count_fragment",
+              "SD_t_ref_count_fragment",
+              "SD_t_vaf_fragment",
+              "Matched_Norm_Sample_Barcode",
+              "Matched_Norm_Bamfile",
+              "n_alt_count_fragment",
+              "n_ref_count_fragment",
+              "n_vaf_fragment"
+            )
+          if ("Status" %in% names(fread(x))) {
+            fread(x) %>% select(one_of(selectcolumns)) %>% subset((Status == "") |
+                                                                    (is.na(Status)))
+          } else {
+            fread(x) %>% select(one_of(selectcolumns))
+          }
+          #      fread(x)
+          # %>%
+          # filter(as.numeric(t_alt_count) > 0) %>%
+          # data.table()
+        }))
+      # get impact calls
+      impact.calls <-
+        DMP.RET.maf[Tumor_Sample_Barcode %in% sample.sheet$Sample_Barcode]
+      write.table(
+        impact.calls[, .(
+          Hugo_Symbol,
+          Chromosome,
+          Start_Position,
+          End_Position,
+          Variant_Classification,
+          HGVSp_Short,
+          Reference_Allele,
+          Tumor_Seq_Allele2
+        )],
+        paste0(results.dir, "/", x, "/", x, "_impact_calls.maf"),
+        sep = "\t",
+        quote = F,
+        row.names = F
+      )
+      # combining plasma and impact calls
+      all.calls <-
+        rbind(duplex.calls[, intersect(colnames(duplex.calls), colnames(DMP.RET.maf)), with = F],
+              impact.calls[, intersect(colnames(duplex.calls), colnames(DMP.RET.maf)), with = F])
+      # getting rid of duplicate calls and take the first occurence of all events
+      all.calls <-
+        all.calls[which(!duplicated(all.calls[, .(
+          Hugo_Symbol,
+          Chromosome,
+          Start_Position,
+          End_Position,
+          Variant_Classification,
+          HGVSp_Short,
+          Reference_Allele,
+          Tumor_Seq_Allele2
+        )])), ] %>%
+        mutate(
+          t_ref_count = 0,
+          t_alt_count = 0,
+          n_ref_count = 0,
+          n_alt_count = 0,
+          Matched_Norm_Sample_Barcode = NA
+        ) %>%
+        filter(
+          Variant_Classification != "Silent" &
+            !grepl("RP11-", Hugo_Symbol) &
+            !grepl("Intron", Variant_Classification)
+        )
+      write.table(
+        all.calls,
+        paste0(results.dir, "/", x, "/", x, "_all_unique_calls.maf"),
+        sep = "\t",
+        quote = F,
+        row.names = F
+      )
+      # tagging hotspots
+      system(
+		paste0(
+  		'sbatch',
+  		" --partition=cmobic_cpu",
+  		' --mem=4G',                                     # Memory (4GB)
+  		' --chdir=', results.dir, "/", x, "/",         # Working directory
+  		' --output=hotspot.o',                          # Standard output
+  		' --error=hotspot.e',                           # Standard error
+  		' --time=00:59:00',                             # Time limit (59 minutes)
+  		' --job-name=tag_hotspot',             # Job name
+  		' --wrap="python /data1/core006/access/production/workflows/access_workflows/v1/pipeline_2.0.0/ACCESS-Pipeline/cwl_tools/hotspots/tag_hotspots.py',
+  		' -m ', results.dir, "/", x, "/", x, '_all_unique_calls.maf',
+  		' -itxt /data1/core006/access/production/resources/msk-access/v2.0/regions_of_interest/versions/v1.0/hotspot-list-union-v1-v2_with_TERT.txt',
+  		' -o ', results.dir, "/", x, "/", x, '_all_unique_calls_hotspots.maf',
+  		' -outdir ', results.dir, "/", x, "/", x, '"'
+		)
+      )
+      # genotype all bams in this patient directory -----------------------------
+      # genotyping plasma samples -- plasma duplex&simplex, plasma normal, pooled plasma normal
+      write.table(
+        sample.sheet[, .(
+          sample_id = Sample_Barcode,
+          maf = paste0(results.dir, "/", x, "/", x, "_all_unique_calls.maf"),
+          standard_bam,
+          duplex_bam,
+          simplex_bam
+        )],
+        paste0(results.dir, "/", x, "/", x, "_genotype_metadata.tsv"),
+        sep = "\t",
+        quote = F,
+        row.names = F
+      )
+      job.ids <- system(
+		paste0(
+			"sbatch",
+			" --partition=cmobic_cpu",
+ 			 " --chdir=", results.dir, "/", x,
+  			" --time=12:00:00",
+ 			 " --mem=8G",
+  			" --output=genotyping.o",
+  			" --error=genotyping.e",
+  			" --job-name=genotype_variants",
+  			' --wrap="genotype_variants small_variants multiple-samples',
+  			" -i ", results.dir, "/", x, "/", x, "_genotype_metadata.tsv",
+  			" -r ", fasta.path,
+  			" -g ", genotyper.path,
+  			' -v DEBUG"'
+		),
+        intern = T
+      )
+      job.ids <- as.numeric(gsub("Job <|> is.*.$", "", job.ids))
+    })
+
+
+  # Get base count multi sample in pooled normal ----------------------------
+  # all all unique calls in entire cohort
+  print("Compiling reads in pooled samples")
+  dir.create(paste0(results.dir, "/pooled"))
+  all.all.unique.mafs <-
+    do.call(rbind, lapply(unique(master.ref$cmo_patient_id), function(x) {
+      fread(list.files(
+        paste0(results.dir, "/", x),
+        pattern = "unique_calls.maf$",
+        full.names = T
+      ))
+    }))
+  all.all.unique.mafs <-
+    all.all.unique.mafs[!duplicated(all.all.unique.mafs[, .(
+      Hugo_Symbol,
+      Chromosome,
+      Start_Position,
+      End_Position,
+      Variant_Classification,
+      HGVSp_Short,
+      Reference_Allele,
+      Tumor_Seq_Allele2
+    )]),]
+  write.table(
+    all.all.unique.mafs,
+    paste0(results.dir, "/pooled/all_all_unique.maf"),
+    sep = "\t",
+    quote = F,
+    row.names = F
+  )
+
+  write.table(
+    data.frame(
+      sample_id = gsub("^.*./|.bam", "", pooled.bams),
+      maf = paste0(results.dir, "/pooled/all_all_unique.maf"),
+      standard_bam = as.character(pooled.bams),
+      duplex_bam = "",
+      simplex_bam = NA_character_
+    ),
+    paste0(results.dir, "/pooled/pooled_metadata.tsv"),
+    sep = "\t",
+    quote = F,
+    row.names = F
+  )
+
+  pooled.sample.job.id <- system(
+    paste0(
+		"sbatch",
+		" --partition=cmobic_cpu",
+  		" --chdir=", results.dir, "/pooled",
+  		" --time=12:00:00",
+  		" --mem=8G",
+  		" --output=genotyping.o",
+  		" --error=genotyping.e",
+  		paste0(unlist(all.fillout.id), collapse = ":"),
+  		" --job-name=pooled_genotype_variants",
+  		' --wrap="genotype_variants small_variants multiple-samples',
+  		" -i ", results.dir, "/pooled/pooled_metadata.tsv",
+  		" -r ", fasta.path,
+  		" -g ", genotyper.path,
+  		' -v DEBUG"'
+    ),
+    intern = T
+  )
+  pooled.sample.job.id <-
+    as.numeric(gsub("Job <|> is.*.$", "", pooled.sample.job.id))
+  while (length(system(paste0("squeue -j ", pooled.sample.job.id, " -h"), intern = TRUE)) > 0) {
+  Sys.sleep(120)
+	}
+  print("Compile reads done!")
+}
+
+# Executable -----------------------------------------------------------------------------------------------------------
+# Minimal columns for input mafs
+#
+# Hugo_Symbol,Chromosome,Start_Position,End_Position,Tumor_Sample_Barcode,Variant_Classification,HGVSp_Short,Reference_Allele,Tumor_Seq_Allele2,D_t_alt_count_fragment
+
+suppressPackageStartupMessages({
+  library(data.table)
+  library(tidyr)
+  library(stringr)
+  library(dplyr)
+  library(argparse)
+})
+
+if (!interactive()) {
+  parser <- ArgumentParser()
+  parser$add_argument("-m", "--masterref", type = "character", help = "File path to master reference file")
+  parser$add_argument("-o", "--resultsdir", type = "character", help = "Output directory")
+  parser$add_argument(
+    "-pid",
+    "--projectid",
+    type = "character",
+    default = "",
+    help = "Project ID for submitted jobs involved in this run"
+  )
+  parser$add_argument(
+    "-pb",
+    "--pooledbamdir",
+    type = "character",
+    default = "/data1/core006/access/production/resources/msk-access/v2.0/novaseq_curated_duplex_bams_dmp/current/",
+    help = "Directory for all pooled bams [default]"
+  )
+  parser$add_argument(
+    "-fa",
+    "--fastapath",
+    type = "character",
+    default = "/data1/core006/access/production/resources/reference/versions/hg19_virus_special/hg19_virus.fasta",
+    help = "Reference fasta path [default]"
+  )
+  parser$add_argument(
+    "-gt",
+    "--genotyperpath",
+    type = "character",
+    default = "/data1/core006/access/production/resources/tools/GetBaseCountsMultiSample/versions/GetBaseCountsMultiSample-1.2.5/GetBaseCountsMultiSample",
+    help = "Genotyper executable path [default]"
+  )
+  parser$add_argument(
+    "-dmp",
+    "--dmpdir",
+    type = "character",
+    default = "/data1/core006/access/production/resources/cbioportal/versions/msk-impact/msk_solid_heme/",
+    help = "Directory of clinical DMP repository [default]"
+  )
+  parser$add_argument(
+    "-mb",
+    "--mirrorbamdir",
+    type = "character",
+    default = "/data1/share001/share/impact_12_245/",
+    help = "Mirror BAM file directory [default]"
+  )
+  parser$add_argument(
+    "-mab",
+    "--mirroraccessbamdir",
+    type = "character",
+    default = "/data1/share001/share/access_12_245/",
+    help = "Mirror BAM file directory for MSK-ACCESS [default]"
+  )
+  parser$add_argument(
+    "-dmpk",
+    "--dmpkeypath",
+    type = "character",
+    default = "/data1/share001/share/impact_12_245.key.txt",
+    help = "DMP mirror BAM key file [default]"
+  )
+  parser$add_argument(
+    "-dmpak",
+    "--dmpaccesskeypath",
+    type = "character",
+    default = "/data1/share001/share/access_12_245.key.txt",
+    help = "DMP mirror BAM key file for MSK-ACCESS [default]"
+  )
+  args <- parser$parse_args()
+
+  master.ref <- args$masterref
+  results.dir <- args$resultsdir
+  project.ID <- args$projectid
+  pooled.bam.dir <- args$pooledbamdir
+  fasta.path <- args$fastapath
+  genotyper.path <- args$genotyperpath
+  dmp.dir <- args$dmpdir
+  mirror.bam.dir <- args$mirrorbamdir
+  mirror.access.bam.dir <- args$mirroraccessbamdir
+  dmp.key.path <- args$dmpkeypath
+  access.key.path <- args$dmpaccesskeypath
+
+
+  if (project.ID == "") {
+    project.ID <-
+      paste0(sample(c(0:9), size = 10, replace = T), collapse = "")
+  }
+
+  print(paste0("Input parameters for run ", project.ID))
+  print(master.ref)
+  print(results.dir)
+  print(pooled.bam.dir)
+  print(fasta.path)
+  print(genotyper.path)
+  print(dmp.dir)
+  print(mirror.bam.dir)
+  print(mirror.access.bam.dir)
+  print(dmp.key.path)
+  print(access.key.path)
+  suppressWarnings(
+    compile_reads_all(
+      fread(master.ref),
+      results.dir,
+      project.ID,
+      pooled.bam.dir,
+      fasta.path,
+      genotyper.path,
+      dmp.dir,
+      mirror.bam.dir,
+      mirror.access.bam.dir,
+      dmp.key.path,
+      access.key.path
+    )
+  )
+  print("compile reads function finished")
+}
\ No newline at end of file
diff --git a/R_Iris/filter_calls.R b/R_Iris/filter_calls.R
new file mode 100644
index 0000000..0c62683
--- /dev/null
+++ b/R_Iris/filter_calls.R
@@ -0,0 +1,319 @@
+# library(data.table)
+# library(tidyr)
+# library(stringr)
+# library(dplyr)
+
+#' @export
+filter_calls = function(
+  master.ref,results.dir,
+  CH.path = '/data1/core006/access/production/resources/dmp_signedout_CH/versions/10Feb2020/signedout_CH.txt',
+  criteria = 'stringent'
+){
+  # # test input section -----------------------------------------------------------
+  # master.ref = fread('/juno/work/bergerm1/bergerlab/zhengy1/access_data_analysis/data/example_master_file.csv')
+  # results.dir = paste0('/juno/work/bergerm1/MSK-ACCESS/ACCESS-Projects/test_access/access_data_analysis/output_042020/')
+  # dmp.key.path = '/ifs/dmprequest/12-245/key.txt'
+  # CH.calls = fread('/ifs/work/bergerm1/zhengy1/RET_all/Original_files/signedout_CH.txt')
+  # # criteria <- 'permissive'
+  # criteria <- 'stringent'
+  #
+  # criteria definition -----------------------------------------------------
+  if(criteria == 'permissive'){
+    hotspot.support <- 1
+    non.hotspot.support <- 3
+  }else{
+    hotspot.support <- 3
+    non.hotspot.support <- 5
+  }
+
+  dir.create(paste0(results.dir,'/results_',criteria))
+
+  # inputs ---------------------------------------------------------------
+  # DMP.key <- fread(dmp.key.path)
+  CH.calls = fread(CH.path)
+  pooled.normal.mafs <-
+    fread(paste0(results.dir,'/pooled/all_all_unique.maf')) %>%
+    mutate(Tumor_Sample_Barcode = paste0(Tumor_Sample_Barcode,'___pooled')) %>%
+    select(Hugo_Symbol,Tumor_Sample_Barcode,Chromosome,Start_Position,End_Position,Variant_Classification,HGVSp_Short,Reference_Allele,Tumor_Seq_Allele2,t_alt_count) %>%
+    group_by(Hugo_Symbol,Chromosome,Start_Position,End_Position,Variant_Classification,Reference_Allele,Tumor_Seq_Allele2) %>%
+    summarise(duplex_support_num = length(which(t_alt_count >= 2))) %>%
+    filter(duplex_support_num > 0,.preserve = T) %>%
+    transmute(Hugo_Symbol, Chromosome=as.character(Chromosome), Start_Position, End_Position, Variant_Classification, Reference_Allele, Tumor_Seq_Allele2, duplex_support_num) %>%
+    data.table()
+
+  # for each patient produce the correct results ----------------------------
+  # x <- unique(master.ref$cmo_patient_id)[1]
+  all.fillout.dim <- lapply(unique(master.ref$cmo_patient_id),function(x){
+    print(paste0('Processing patient ',x))
+
+    # Inputs and sanity checks ------------------------------------------------
+    fillouts.filenames <- list.files(paste0(results.dir,'/',x,'/'),'ORG-STD_genotyped.maf|ORG-SIMPLEX-DUPLEX_genotyped.maf',full.names = T)
+
+    # compiling a sample sheet with duplex, simplex, normal, DMP tumor and DMP normal
+    sample.sheet <- fread(paste0(results.dir,'/',x,'/',x,'_sample_sheet.tsv'))[,.(Sample_Barcode,cmo_patient_id,Sample_Type)]
+    simplex.sample.sheet = sample.sheet[Sample_Type == 'duplex',.(Sample_Barcode,cmo_patient_id,Sample_Type = 'simplex')]
+    sample.sheet = rbind(sample.sheet,simplex.sample.sheet) %>%
+      mutate(column.names = paste0(Sample_Barcode,'___',Sample_Type)) %>%
+      data.table()
+
+    # compiling different genotype files from step 1
+    fillouts.dt <- do.call(rbind,lapply(fillouts.filenames,function(y){
+      sample.name = gsub('.*./|-ORG.*.','',y)
+      sample.type = unique(sample.sheet[Sample_Barcode == sample.name]$Sample_Type)
+
+      # t_alt_count,t_ref_count,t_depth these columns are useless, have to use duplex/simplex/standard columms
+      maf.file <- fread(y) %>%
+        select(-c(t_alt_count,t_ref_count)) %>%
+        data.table()
+
+      if (nrow(maf.file) == 0) {
+
+        columns <- c(
+          "Hugo_Symbol", "Tumor_Sample_Barcode", "Chromosome", "Start_Position",
+          "End_Position", "Variant_Classification", "HGVSp_Short",
+          "Reference_Allele", "Tumor_Seq_Allele2", "t_var_freq", "ExAC_AF")
+        df <- data.frame(matrix(ncol = length(columns), nrow = 0))
+        colnames(df) <- columns
+
+        return(df)
+      }
+
+      # fragment counts replacing actual allele counts
+      if(grepl('SIMPLEX-DUPLEX_genotyped',y)){
+        melt.id.vars = colnames(maf.file)[!grepl('fragment',colnames(maf.file))]
+
+        # get rid of simplex duplex aggregate columns
+        maf.file %>%
+          select(-c(contains('simplex_duplex'))) %>%
+          # melting and dcasting columns back but separating duplex and simplex columns
+          # t_alt_duplex, t_depth_duplex, t_alt_simplex, t_depth_simplex --> t_alt, t_depth
+          melt.data.table(id.vars = melt.id.vars,variable.name = 'variable',value.name = 'value') %>%
+          mutate(variable = gsub('fragment','_',variable)) %>%
+          separate(variable,c('variable','Sample_Type'),sep = '___') %>%
+          mutate(Tumor_Sample_Barcode = paste0(sample.name,'___',Sample_Type)) %>%
+          select(-Sample_Type) %>%
+          data.table() %>%
+          unique() %>%
+          dcast.data.table(as.formula(paste0(paste0(melt.id.vars,collapse = ' + '),' ~ variable')),value.var = 'value') -> maf.file
+      }else{
+        maf.file <- maf.file %>%
+          mutate(Tumor_Sample_Barcode = paste0(sample.name,'___',sample.type)) %>%
+          # swaping the t_alt_count(etc)_standard for t_alt_count(etc)
+          mutate(t_alt_count = t_alt_count_standard,t_total_count = t_total_count_standard)
+      }
+
+      maf.file = maf.file %>%
+        mutate(t_var_freq = paste0(t_alt_count,'/',t_total_count,'(',round(t_alt_count/t_total_count,4),')')) %>%
+        transmute(Hugo_Symbol,Tumor_Sample_Barcode,Chromosome = as.character(Chromosome),Start_Position,End_Position,Variant_Classification,
+                  HGVSp_Short=as.character(HGVSp_Short),Reference_Allele,Tumor_Seq_Allele2,t_var_freq,ExAC_AF) %>%
+        data.table()
+
+      return(maf.file)
+    })) %>%
+      unique() %>%
+      data.table()
+
+    # merging and melting -----------------------------------------------------
+    hotspot.maf <- fread(paste0(results.dir,'/',x,'/',x,'_all_unique_calls_hotspots.maf')) %>%
+      rowwise() %>%
+      transmute(Hugo_Symbol,Chromosome = as.character(Chromosome),Start_Position,End_Position,Variant_Classification,
+                # HGVSp_Short,
+                Reference_Allele,Tumor_Seq_Allele2,Hotspot = ifelse(hotspot_whitelist,'Hotspot',NA)) %>%
+      data.table()
+
+    dmp.maf <- fread(paste0(results.dir,'/',x,'/',x,'_impact_calls.maf')) %>%
+      transmute(Hugo_Symbol,Chromosome = as.character(Chromosome),Start_Position,End_Position,Variant_Classification,
+                # HGVSp_Short,
+                Reference_Allele,Tumor_Seq_Allele2) %>%
+      mutate(DMP = 'Signed out') %>%
+      unique() %>%
+      data.table()
+
+    if((nrow(dmp.maf) > 0) && (nrow(fillouts.dt) > 0)){
+        fillouts.dt <- fillouts.dt %>%
+          dcast.data.table(Hugo_Symbol + Chromosome + Start_Position + End_Position + Variant_Classification +
+                           HGVSp_Short + Reference_Allele + Tumor_Seq_Allele2 + ExAC_AF ~ Tumor_Sample_Barcode,
+                           value.var = 't_var_freq') %>%
+          # hotspot information
+          merge(
+            hotspot.maf,
+            by = c('Hugo_Symbol','Chromosome','Start_Position','End_Position','Variant_Classification','Reference_Allele','Tumor_Seq_Allele2'),
+            all.x = T) %>%
+          # Identifying signed out calls
+          merge(
+            dmp.maf,
+            by = c('Hugo_Symbol','Chromosome','Start_Position','End_Position','Variant_Classification','Reference_Allele','Tumor_Seq_Allele2'),
+            all.x = T) %>%
+          # pooled normal for systemic artifacts
+          merge(
+            pooled.normal.mafs,
+            by = c('Hugo_Symbol','Chromosome','Start_Position','End_Position','Variant_Classification','Reference_Allele','Tumor_Seq_Allele2'),
+            all.x = T) %>%
+          data.table()
+    } else if (nrow(fillouts.dt) > 0){
+      fillouts.dt <- fillouts.dt %>%
+        dcast.data.table(
+          Hugo_Symbol + Chromosome + Start_Position + End_Position + Variant_Classification +
+          HGVSp_Short + Reference_Allele + Tumor_Seq_Allele2 + ExAC_AF ~ Tumor_Sample_Barcode,
+          value.var = 't_var_freq') %>%
+        # hotspot information
+        merge(
+          hotspot.maf,
+          by = c('Hugo_Symbol','Chromosome','Start_Position','End_Position','Variant_Classification','Reference_Allele','Tumor_Seq_Allele2'),
+          all.x = T) %>%
+        # pooled normal for systemic artifacts
+        merge(
+          pooled.normal.mafs,
+          by = c('Hugo_Symbol','Chromosome','Start_Position','End_Position','Variant_Classification','Reference_Allele','Tumor_Seq_Allele2'),
+          all.x = T) %>%
+        mutate(DMP = NA) %>%
+        data.table()
+    } else {
+
+      print(paste0("Found no tumor or DMP mutations for ", x, ". Writing an empty data.frame to CSV."))
+
+      # if fillouts.dt has no data, then add the needed columns with no data
+      fillouts.dt[,c("DMP", "Hotspot", "duplex_support_num", "call_confidence", "CH") := NA]
+
+      fillouts.dt <- fillouts.dt %>% select(
+        Hugo_Symbol,Chromosome,Start_Position,End_Position,
+        Variant_Classification,HGVSp_Short,Reference_Allele,Tumor_Seq_Allele2,
+        ExAC_AF,Hotspot,DMP,CH,duplex_support_num,call_confidence,sort(everything()))
+
+      write.csv(
+        fillouts.dt,
+        paste0(results.dir,'/results_',criteria,'/',x,'_SNV_table.csv'),
+        row.names = F)
+
+      return()
+    }
+
+    # Interesting cases where DMP signed out calls are artifacets
+    if(any(!is.na(fillouts.dt$DMP) & !is.na(fillouts.dt$duplex_support_num))){
+      print(paste0('Look at ',x,' for DMP signed out plasma artifacts...'))
+    }
+
+    # germline filtering for matched and unmatched ----------------------------
+    plasma.samples <- sample.sheet[Sample_Type %in% c('duplex')]$column.names
+    normal.samples <- sample.sheet[Sample_Type %in% c('unfilterednormal','normal_DMP')]$column.names
+    fillouts.dt[,c(
+      paste0(plasma.samples,'.called')
+      # paste0(gsub('duplex','simplex',plasma.samples),'.called')
+
+    ) := 'Not Called']
+
+    # preliminary calling
+    # tmp.col.name <- plasma.samples[1]
+    lapply(plasma.samples,function(tmp.col.name){
+      # genotyping (signed out stuff)
+      fillouts.dt[(as.numeric(gsub("/.*.$",'',get(tmp.col.name))) >= 1 | as.numeric(gsub("/.*.$",'',get(paste0(gsub('duplex','simplex',tmp.col.name))))) > 1) & DMP == 'Signed out',
+                  eval(paste0(tmp.col.name,'.called')) := 'Genotyped']
+      # c(eval(paste0(tmp.col.name,'.called')),eval(paste0(gsub('duplex','simplex',tmp.col.name),'.called'))) := list('Called','Called')]
+      # hotspot reads
+      fillouts.dt[as.numeric(gsub("/.*.$",'',get(tmp.col.name))) >= hotspot.support & Hotspot == 'Hotspot',
+                  eval(paste0(tmp.col.name,'.called')) := 'Called']
+      # c(eval(paste0(tmp.col.name,'.called')),eval(paste0(gsub('duplex','simplex',tmp.col.name),'.called'))) := list('Called','Called')]
+      # non hotspot reads
+      fillouts.dt[as.numeric(gsub("/.*.$",'',get(tmp.col.name))) >= non.hotspot.support & is.na(Hotspot),
+                  eval(paste0(tmp.col.name,'.called')) := 'Called']
+      # c(eval(paste0(tmp.col.name,'.called')),eval(paste0(gsub('duplex','simplex',tmp.col.name),'.called'))) := list('Called','Called')]
+      # print(table(fillouts.dt[,get(paste0(tmp.col.name,'.called'))]))
+    })
+
+    if(all(!c('unfilterednormal','normal_DMP') %in% sample.sheet$Sample_Type)){
+      tmp.col.name <- plasma.samples[1]
+      lapply(plasma.samples,function(tmp.col.name){
+        #fillouts.dt[as.numeric(gsub("\\(|\\)",'',str_extract(get(tmp.col.name),"\\(.*.\\)"))) >= 0.3 | ExAC_AF >= 0.0001,eval(paste0(tmp.col.name,'.called')) := 'Not Called']
+        fillouts.dt[get(tmp.col.name)  == '0/0(NaN)',eval(paste0(tmp.col.name,'.called')) := 'Not Covered']
+      })
+    }else{
+      lapply(plasma.samples,function(tmp.col.name){
+        lapply(normal.samples,function(tmp.col.name.normal){
+          # duplex tvar/nvar > 5
+          fillouts.dt[(as.numeric(gsub("\\(|\\)",'',str_extract(get(tmp.col.name),"\\(.*.\\)")))/as.numeric(gsub("\\(|\\)",'',str_extract(get(tmp.col.name.normal),"\\(.*.\\)"))) < 2) |
+                        # if duplex have no reads, use simplex tvar
+                        (as.numeric(gsub("\\(|\\)",'',str_extract(get(gsub('duplex','simplex',tmp.col.name)),"\\(.*.\\)")))/as.numeric(gsub("\\(|\\)",'',str_extract(get(tmp.col.name.normal),"\\(.*.\\)"))) < 2 &
+                           as.numeric(gsub("/.*.$",'',get(tmp.col.name)))  == 0),
+                      eval(paste0(tmp.col.name,'.called')) := 'Not Called']
+          fillouts.dt[get(tmp.col.name)  == '0/0(NaN)',eval(paste0(tmp.col.name,'.called')) := 'Not Covered']
+        })
+      })
+    }
+
+    # final processing --------------------------------------------------------
+    # Save only the useful column
+    #print(fillouts.dt)
+    #print("#######")
+    fillouts.dt <-  fillouts.dt[DMP == 'Signed out' | fillouts.dt[,apply(.SD,1,function(x){any(x == 'Called')})]]
+    #print(fillouts.dt)
+    # combining duplex and simplex counts
+    lapply(plasma.samples,function(tmp.col.name){
+      # hotspot reads
+      fillouts.dt[,eval(gsub('duplex','total',tmp.col.name)) := paste0(
+        as.numeric(gsub("/.*.$",'',get(tmp.col.name)))+as.numeric(gsub("/.*.$",'',get(gsub('duplex','simplex',tmp.col.name)))),'/',
+        as.numeric(gsub("^.*./|\\(.*.$",'',get(tmp.col.name)))+as.numeric(gsub("^.*./|\\(.*.$",'',get(gsub('duplex','simplex',tmp.col.name)))),'(',
+        round((as.numeric(gsub("/.*.$",'',get(tmp.col.name)))+as.numeric(gsub("/.*.$",'',get(gsub('duplex','simplex',tmp.col.name)))))/
+                (as.numeric(gsub("^.*./|\\(.*.$",'',get(tmp.col.name)))+as.numeric(gsub("^.*./|\\(.*.$",'',get(gsub('duplex','simplex',tmp.col.name))))),4),')'
+      )]
+      fillouts.dt[,c(eval(gsub('duplex','simplex',tmp.col.name)),eval(tmp.col.name)):= list(NULL,NULL)]
+    })
+
+    fillouts.dt <- fillouts.dt[,order(colnames(fillouts.dt)),with = F] %>%
+      # filter for artifacts
+      mutate(call_confidence = case_when(
+        (Hugo_Symbol == 'TERT' & is.na(Hotspot)) | (Hugo_Symbol == 'ERBB2' & grepl('[A-Z]90[0-9][A-Z]',HGVSp_Short)) |
+          (Hugo_Symbol == 'BRAF' & grepl('711',HGVSp_Short)) | (Hugo_Symbol == 'NF1' & grepl('[A-Z]106[0-9][A-Z]',HGVSp_Short)) ~ 'Low',
+        DMP == 'Signed out' ~ 'High',
+        TRUE ~ ''
+      )) %>%
+      merge(CH.calls[,.(Hugo_Symbol = Gene,Chromosome = Chrom,Start_Position = Start,Reference_Allele = Ref,Tumor_Seq_Allele2 = Alt,HGVSp_Short = AAchange,Variant_Classification = VariantClass,CH = 'Yes')],
+            by = c('Hugo_Symbol','Chromosome','Start_Position','Variant_Classification','HGVSp_Short','Reference_Allele','Tumor_Seq_Allele2'),
+            all.x = T) %>%
+      mutate(CH = ifelse(is.na(CH),'No','Yes')) %>%
+      select(Hugo_Symbol,Chromosome,Start_Position,End_Position,Variant_Classification,HGVSp_Short,Reference_Allele,Tumor_Seq_Allele2,
+             ExAC_AF,Hotspot,DMP,CH,duplex_support_num,call_confidence,sort(everything()))
+
+    write.csv(fillouts.dt,paste0(results.dir,'/results_',criteria,'/',x,'_SNV_table.csv'),row.names = F)
+  })
+
+  if(all(unlist(all.fillout.dim))){
+    print('All dimension of  fillout mafs for each patient looks correct')
+  }
+
+}
+
+# Executable -----------------------------------------------------------------------------------------------------------
+suppressPackageStartupMessages({
+  library(data.table)
+  library(tidyr)
+  library(stringr)
+  library(dplyr)
+  library(argparse)
+})
+
+if (!interactive()) {
+
+  parser=ArgumentParser()
+  parser$add_argument('-m', '--masterref', type='character', help='File path to master reference file')
+  parser$add_argument('-o', '--resultsdir', type='character', help='Output directory')
+  parser$add_argument('-ch', '--chlist', type='character', default = '/data1/core006/access/production/resources/dmp_signedout_CH/versions/10Feb2020/signedout_CH.txt',
+                      help='List of signed out CH calls [default]')
+  parser$add_argument('-c', '--criteria', type='character', default = 'stringent',
+                      help='Calling criteria [default]')
+  args=parser$parse_args()
+
+  master.ref = args$masterref
+  results.dir = args$resultsdir
+  chlist = args$chlist
+  criteria = args$criteria
+
+  cat(paste0(paste0(c(paste0(rep('-',15),collapse = ''),'Arguments input: ',master.ref,results.dir,chlist,criteria,
+                      paste0(rep('-',15),collapse = '')),collapse = "\n"),'\n'))
+
+  if(!criteria %in% c('stringent','permissive')){
+    stop('Criteria argument should be either stringent or permissive')
+  }
+
+  suppressWarnings(filter_calls(fread(master.ref),results.dir,chlist,criteria))
+
+}