PMCC-BioinformaticsCore · nbartonicek · Oct 19, 2025 · Oct 19, 2025 · Oct 13, 2025 · Oct 20, 2025
diff --git a/NAMESPACE b/NAMESPACE
@@ -35,6 +35,7 @@ export(plot_qc_metrics_heatmap)
 export(plot_rle)
 export(plot_volcano)
 export(read_metadata)
+export(select_robust_controls)
 export(subsample_genes)
 export(summarise_de)
 export(validate_metadata)
@@ -66,6 +67,9 @@ importFrom(dplyr,reframe)
 importFrom(dplyr,rename)
 importFrom(dplyr,select)
 importFrom(dplyr,where)
+importFrom(edgeR,DGEList)
+importFrom(edgeR,calcNormFactors)
+importFrom(edgeR,cpm)
 importFrom(ggiraph,geom_point_interactive)
 importFrom(ggiraph,girafe)
 importFrom(ggplot2,aes)
@@ -124,6 +128,7 @@ importFrom(stringr,str_trim)
 importFrom(tibble,column_to_rownames)
 importFrom(tibble,rownames_to_column)
 importFrom(tidyr,drop_na)
+importFrom(tidyr,pivot_longer)
 importFrom(tidyr,pivot_wider)
 importFrom(tidyr,unnest)
 importFrom(utils,capture.output)

diff --git a/R/check_zeroinflation.R b/R/check_zeroinflation.R
@@ -157,9 +157,9 @@ check_zeroinflation <- function(data = NULL,
       group            = unique(df$group),
       n_genes          = nrow(df),
       n_wells          = sum(combined_id == unique(df$group)),
-      median_p0_obs    = median(df$p0_obs),
-      median_p0_nb     = median(df$p0_nb),
-      median_ZI        = median(df$ZI),
+      mean_p0_obs    = mean(df$p0_obs),
+      mean_p0_nb     = mean(df$p0_nb),
+      mean_ZI        = mean(df$ZI),
       observed_zeros_num       = sum(df$obs_zeros_num),
       expected_zeros_num  = sum(df$expected_zeros_num) 
     )

diff --git a/R/compute_multi_de.R b/R/compute_multi_de.R
@@ -58,7 +58,7 @@ compute_multi_de <- function(data = NULL,
     method <- if (is.null(method)) "edgeR" else method
     if (!method %in% c("Seurat_wilcox", "DESeq2", "edgeR",
                        "RUVg", "RUVs", "RUVr",
-                       "limma_voom")) {
+                       "limma_voom","limma_trend")) {
       stop("Your normalization method is not available.")
     }
     if (is.null(control_samples)) {

diff --git a/R/compute_normalised_counts.R b/R/compute_normalised_counts.R
@@ -5,7 +5,8 @@
 #' @param data A tidyseurat object merged with metadata. Must contain columns
 #'   "Well_ID", "Row", "Column"
 #' @param method One of "raw", "logNorm", "cpm", "clr", "SCT", "DESeq2",
-#'   "edgeR", "RUVg", "RUVs", "RUVr", "limma_voom", "limma_trend", "zinb"
+#'   "edgeR", "RUVg", "RUVs", "RUVr", "limma_voom", "limma_trend", 
+#'   "limma_voom_zinb", "edgeR_zinb"
 #' @param batch Either empty, a single value, or a vector corresponding to the
 #'   number of samples
 #' @param k Parameter k for RUVSeq and zinb methods
@@ -53,7 +54,8 @@ compute_normalised_counts <- function(data = NULL,
                        "cpm", "clr", "SCT",
                        "DESeq2", "edgeR",
                        "RUVg", "RUVs", "RUVr",
-                       "limma_voom", "limma_trend", "zinb")) {
+                       "limma_voom", "limma_trend", "limma_voom_zinb",
+                       "edgeR_zinb")) {
       stop("Your normalization method is not available.")
     }
     batch <- if (is.null(batch)) "1" else as.character(batch)
@@ -96,32 +98,32 @@ compute_normalised_counts <- function(data = NULL,
 
   normalize_lognorm <- function(data) {
     lognorm <- NormalizeData(data, normalization.method = "LogNormalize", scale.factor = 10000)
-    lognorm@assays$RNA$data
+    return(lognorm@assays$RNA$data)
   }
 
   normalize_cpm <- function(data) {
     cpm <- NormalizeData(data, normalization.method = "RC", scale.factor = 1e6)
-    cpm@assays$RNA$data
+    return(cpm@assays$RNA$data)
   }
 
   normalize_clr <- function(data) {
     clr <- NormalizeData(data, normalization.method = "CLR", margin = 2)
-    clr@assays$RNA$data
+    return(clr@assays$RNA$data)
   }
 
   normalize_sct <- function(data, batch) {
     sct <- SCTransform(data, do.scale = TRUE,
                        return.only.var.genes = FALSE,
                        vars.to.regress = if (length(batch) == 1) NULL else batch, verbose = FALSE)
-    sct@assays$SCT$data
+    return(sct@assays$SCT$data)
   }
 
   normalize_deseq2 <- function(data, batch) {
     coldata <- coldata
     design <- model_matrix
     dds <- DESeqDataSetFromMatrix(countData = data@assays$RNA$counts, colData = coldata, design = design)
     dds <- estimateSizeFactors(dds)
-    counts(dds, normalized = TRUE)
+    return(counts(dds, normalized = TRUE))
   }
 
   normalize_edger <- function(data, batch) {
@@ -135,15 +137,15 @@ compute_normalised_counts <- function(data = NULL,
     dge <- calcNormFactors(dge, methods = "TMM")
     design <- model_matrix
     dge <- estimateDisp(dge, design, BPPARAM = p)
-    cpm(dge, log = FALSE)
+    return(cpm(dge, log = FALSE))
   }
 
   normalize_limma_voom <- function(data, batch) {
     dge <- DGEList(counts = data@assays$RNA$counts, samples = coldata$condition, group = coldata$condition)
     dge <- calcNormFactors(dge, methods = "TMMwsp")
     design <- model_matrix
     dge <- voom(dge, design)
-    dge$E
+    return(dge$E)
   }
 
   normalize_limma_trend <- function(data, batch) {
@@ -154,7 +156,7 @@ compute_normalised_counts <- function(data = NULL,
     fit <- lmFit(logCPM, design)
     fit <- eBayes(fit, trend=TRUE)
     dge <- fit
-    logCPM
+    return(logCPM)
   }
 
   normalize_ruvg <- function(data, batch, spikes, k) {
@@ -180,7 +182,7 @@ compute_normalised_counts <- function(data = NULL,
     }
     if (!all(spikes %in% row.names(data@assays$RNA$counts))) {
       warning("Some or all of your control genes are not present in the dataset.")
-      spikes <- intersect(spikes, rownames(counts(set)))
+      spikes <- intersect(spikes, rownames(counts))
     }
     #k defines number of sources of variation, two have been chosen for row and column
     set <- EDASeq::newSeqExpressionSet(counts = as.matrix(counts),
@@ -236,7 +238,7 @@ compute_normalised_counts <- function(data = NULL,
     EDASeq::normCounts(set)
   }
 
-  normalize_zinb <- function(data, batch) {
+  normalize_limmavoom_zinb <- function(data, batch) {
 
     message("Please allow extra time for zinb mode.")
     if (ncol(data) > 50) {
@@ -247,34 +249,54 @@ compute_normalised_counts <- function(data = NULL,
     cl <- makeCluster(num_cores)
     doParallel::registerDoParallel(cl)
     p <- BiocParallel::DoparParam()
-    system.time(zinb <- zinbwave::zinbwave(filtered_sce, K = k,
+    suppressWarnings(zinb <- zinbwave::zinbwave(filtered_sce, K = k,
                                  epsilon=12000,
                                  BPPARAM = p,
-                                 observationalWeights = TRUE))
+                                 observationalWeights = TRUE, verbose = F))
     counts <- zinb@assays@data$counts
     weights <- zinb@assays@data$weights 
-
-    # approximate denoised counts (downweighting dropouts)
-    #dge <- DGEList(counts = data@assays$RNA$counts, samples = coldata$condition, group = coldata$condition)
-    #dge <- calcNormFactors(dge, methods = "TMM")
-    #design <- model_matrix
-    #dge <- estimateDisp(dge, design, BPPARAM = p)
-    #dge$weights <- assay(zinb, "weights")
-    #fit <- glmQLFit(dge, design, BPPARAM = p)
-    #norm_counts <- fitted(fit)
-
+
     dge <- DGEList(counts = counts(filtered_sce), samples = coldata$condition, group = coldata$condition)
     dge <- calcNormFactors(dge, methods = "TMMwsp")
     design <- model_matrix
     dge <- voom(dge, design, weights = weights)
     normalised_values <- dge$E
-
-    #normalised_values <- zinb@assays@data$normalizedValues
     stopCluster(cl)
     doParallel::registerDoParallel()
     return(normalised_values)
   }
 
+  normalize_edger_zinb <- function(data, batch) {
+
+    message("Please allow extra time for zinb mode.")
+    if (ncol(data) > 50) {
+      message("zinb with over 50 samples takes a long time. Consider reducing the number of samples or genes.")
+    }
+    data_sce <- as.SingleCellExperiment(data)
+    filtered_sce <- subset(data_sce, rowSums(as.data.frame(counts(data_sce))) > 0)
+    cl <- makeCluster(num_cores)
+    doParallel::registerDoParallel(cl)
+    p <- BiocParallel::DoparParam()
+    suppressMessages(zinb <- zinbwave::zinbwave(filtered_sce, K = k,
+                                                epsilon=12000,
+                                                BPPARAM = p,
+                                                observationalWeights = TRUE))
+    counts <- zinb@assays@data$counts
+    weights <- zinb@assays@data$weights 
+
+    # approximate denoised counts (downweighting dropouts)
+    dge <- DGEList(counts = counts, samples = coldata$condition, group = coldata$condition)
+    dge <- calcNormFactors(dge, methods = "TMM")
+    design <- model_matrix
+    dge <- estimateDisp(dge, design, BPPARAM = p)
+    dge$weights <- weights
+    fit <- glmQLFit(dge, design, BPPARAM = p)
+    normalised_values <- fitted(fit)
+    stopCluster(cl)
+    doParallel::registerDoParallel()
+    return(normalised_values)
+  }
+
   # Main function logic
   validated <- validate_inputs(data, method, batch, k, max_counts, num_cores)
   data <- validated$data
@@ -297,7 +319,8 @@ compute_normalised_counts <- function(data = NULL,
     RUVg = normalize_ruvg(data, batch, spikes, k),
     RUVs = normalize_ruvs(data, batch, k),
     RUVr = normalize_ruvr(data, batch, k),
-    zinb = normalize_zinb(data, batch),
+    limma_voom_zinb = normalize_limmavoom_zinb(data, batch),
+    edgeR_zinb = normalize_edger_zinb(data, batch),
     stop("Unsupported normalization method.")
   )
   return(norm_data)

diff --git a/R/compute_single_de.R b/R/compute_single_de.R
@@ -12,6 +12,7 @@ utils::globalVariables(c("model_matrix"))
 #'   number of samples
 #' @param k Parameter k for RUVSeq methods, check RUVSeq tutorial
 #' @param spikes List of genes to use as spike controls
+#' @param num_cores Number of cores for parallel processing
 #' @importFrom limma makeContrasts eBayes contrasts.fit topTable
 #' @importFrom tibble rownames_to_column
 #' @import DESeq2
@@ -34,7 +35,8 @@ compute_single_de <- function(data = NULL,
                                         method = NULL,
                                         batch = 1,
                                         k = 2,
-                                        spikes = NULL) {
+                                        spikes = NULL,
+                                        num_cores = 1) {
   if (!requireNamespace("SummarizedExperiment", quietly = TRUE)) {
     stop(
       "compute_single_de(): the following package is required but not installed: SummarizedExperiment",
@@ -48,7 +50,7 @@ compute_single_de <- function(data = NULL,
     method <- if (is.null(method)) "limma_voom" else method
     if (!method %in% c("Seurat_wilcox", "DESeq2", "edgeR",
                        "RUVg", "RUVs", "RUVr",
-                       "limma_voom", "limma_trend")) {
+                       "limma_voom", "limma_trend","edgeR_zinb", "limma_zinb")) {
       stop("Your normalization method is not available.")
     }
     if (is.null(treatment_samples) || is.null(control_samples)) {
@@ -157,7 +159,8 @@ compute_single_de <- function(data = NULL,
     combined_id <- data$combined_id
     dds <- DESeqDataSetFromMatrix(countData = data@assays$RNA$counts,
                                   colData = pheno_data,
-                                  design = ~ condition)
+                                  # design = ~ condition)
+                                  design = if (length(batch) == 1) ~ condition else ~ condition + batch)
     dds <- DESeq(dds)
     res <- results(dds, contrast = c("condition", treatment_samples, control_samples))
     top_table <- as.data.frame(res) %>%
@@ -178,7 +181,7 @@ compute_single_de <- function(data = NULL,
                              latent.vars = "batch",
                              test.use = "wilcox") %>%
       select("avg_log2FC", "p_val", "p_val_adj") %>%
-      rename("log2FC" = "avg_log2FC", "p_val" = "p_val", "p_value_adj" = "p_val_adj") %>%
+      rename("log2FC" = "avg_log2FC", "p_value" = "p_val", "p_value_adj" = "p_val_adj") %>%
       mutate(metric = 1) %>%
       rownames_to_column("gene")
 
@@ -190,7 +193,7 @@ compute_single_de <- function(data = NULL,
     model_matrix <- if (length(batch) == 1) model.matrix(~0 + combined_id) else
       model.matrix(~0 + combined_id + batch)
     if (length(spikes) == 0) {
-      warning("List of control genes not provided for RUVg, using default.")
+      warning("List of control genes not provided for RUVg, using default human housekeeping genes.")
       spikes <- c(
         "ACTB",
         "GAPDH",
@@ -316,20 +319,25 @@ compute_single_de <- function(data = NULL,
     return(as.data.frame(top_table))
   }
 
-  de_zinb <- function(data, pheno_data, treatment_samples, control_samples, batch, k) {
+  de_edgeR_zinb <- function(data, pheno_data, treatment_samples, control_samples, batch, k) {
 
     data_sce<-as.SingleCellExperiment(data)
-    filtered_sce <- data_sce[rowSums(counts(data_sce)) > 50, ]
-    num_cores <- 8 # Change this based on your system
+    filtered_sce <- data_sce[rowSums(counts(data_sce)) > 0, ]
     cl <- makeCluster(num_cores)
     doParallel::registerDoParallel(cl)
     p <- BiocParallel::DoparParam()
     system.time(zinb <- zinbwave::zinbwave(filtered_sce, K = 2,
-                     epsilon=1000,
+                     epsilon=12000,
                      BPPARAM = p,
                      observationalWeights = TRUE))
 
     weights <- SummarizedExperiment::assay(zinb, "weights")
+
+    #perform_edgeR
+    combined_id <- data$combined_id
+    model_matrix <- if (length(batch) == 1) model.matrix(~0 + combined_id) else
+      model.matrix(~0 + combined_id + batch)
+
     dge <- DGEList(SummarizedExperiment::assay(zinb))
     dge <- calcNormFactors(dge, method = "TMMwsp")
 
@@ -375,7 +383,8 @@ compute_single_de <- function(data = NULL,
     RUVg = de_ruvg(data, pheno_data, treatment_samples, control_samples, batch, spikes, k),
     RUVs = de_ruvs(data, pheno_data, treatment_samples, control_samples, batch, k),
     RUVr = de_ruvr(data, pheno_data, treatment_samples, control_samples, batch, k),
-    zinb = de_zinb(data, pheno_data, treatment_samples, control_samples, batch, k),
+    edgeR_zinb = de_edgeR_zinb(data, pheno_data, treatment_samples, control_samples, batch, k),
+    limma_zinb = limma_zinb(data, pheno_data, treatment_samples, control_samples, batch, k),
     stop("Unsupported DE method.")
   )
   return(de_data)

diff --git a/R/plot_rle.R b/R/plot_rle.R
@@ -15,7 +15,8 @@
 #' @param batch Either empty, a single value, or a vector corresponding to the
 #'   number of samples.
 #' @param normalisation One of "raw", "logNorm", "cpm", "clr", "SCT", "DESeq2",
-#'   "edgeR", "RUVg", "RUVs", "RUVr", "limma_voom", "limma_trend", "zinb". If empty, defaults to raw.
+#'   "edgeR", "RUVg", "RUVs", "RUVr", "limma_voom", "limma_trend", "limma_voom_zinb",
+#'   "edgeR_zinb". If empty, defaults to raw.
 #' @param spikes List of genes to use as spike controls in RUVg
 #' @param num_cores Number of cores for edgeR and zinb calculations
 
@@ -205,7 +206,8 @@ plot_rle <- function(data, barcodes = NULL,
     RUVg = compute_normalised_counts(data, method = "RUVg", batch = batch, spikes = spikes, num_cores = num_cores),
     RUVs = compute_normalised_counts(data, method = "RUVs", batch = batch, num_cores = num_cores),
     RUVr = compute_normalised_counts(data, method = "RUVr", batch = batch, num_cores = num_cores),
-    zinb = compute_normalised_counts(data, method = "zinb", batch = batch, num_cores = num_cores),
+    limma_voom_zinb = compute_normalised_counts(data, method = "limma_voom_zinb", batch = batch, num_cores = num_cores),
+    edgeR_zinb = compute_normalised_counts(data, method = "edgeR_zinb", batch = batch, num_cores = num_cores),
     stop("Unsupported normalization method.")
   )