From eabe4930b2dfae9a05a30d2fd21c375d135c610f Mon Sep 17 00:00:00 2001 From: nbartonicek Date: Sun, 19 Oct 2025 14:15:39 +1100 Subject: [PATCH 01/16] Fix plot_rle --- R/compute_normalised_counts.R | 16 ++++++++-------- R/plot_rle.R | 3 ++- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/R/compute_normalised_counts.R b/R/compute_normalised_counts.R index 238f12b..60ad0ad 100644 --- a/R/compute_normalised_counts.R +++ b/R/compute_normalised_counts.R @@ -96,24 +96,24 @@ compute_normalised_counts <- function(data = NULL, normalize_lognorm <- function(data) { lognorm <- NormalizeData(data, normalization.method = "LogNormalize", scale.factor = 10000) - lognorm@assays$RNA$data + return(lognorm@assays$RNA$data) } normalize_cpm <- function(data) { cpm <- NormalizeData(data, normalization.method = "RC", scale.factor = 1e6) - cpm@assays$RNA$data + return(cpm@assays$RNA$data) } normalize_clr <- function(data) { clr <- NormalizeData(data, normalization.method = "CLR", margin = 2) - clr@assays$RNA$data + return(clr@assays$RNA$data) } normalize_sct <- function(data, batch) { sct <- SCTransform(data, do.scale = TRUE, return.only.var.genes = FALSE, vars.to.regress = if (length(batch) == 1) NULL else batch, verbose = FALSE) - sct@assays$SCT$data + return(sct@assays$SCT$data) } normalize_deseq2 <- function(data, batch) { @@ -121,7 +121,7 @@ compute_normalised_counts <- function(data = NULL, design <- model_matrix dds <- DESeqDataSetFromMatrix(countData = data@assays$RNA$counts, colData = coldata, design = design) dds <- estimateSizeFactors(dds) - counts(dds, normalized = TRUE) + return(counts(dds, normalized = TRUE)) } normalize_edger <- function(data, batch) { @@ -135,7 +135,7 @@ compute_normalised_counts <- function(data = NULL, dge <- calcNormFactors(dge, methods = "TMM") design <- model_matrix dge <- estimateDisp(dge, design, BPPARAM = p) - cpm(dge, log = FALSE) + return(cpm(dge, log = FALSE)) } normalize_limma_voom <- function(data, batch) { @@ -143,7 +143,7 @@ compute_normalised_counts <- function(data = NULL, dge <- calcNormFactors(dge, methods = "TMMwsp") design <- model_matrix dge <- voom(dge, design) - dge$E + return(dge$E) } normalize_limma_trend <- function(data, batch) { @@ -154,7 +154,7 @@ compute_normalised_counts <- function(data = NULL, fit <- lmFit(logCPM, design) fit <- eBayes(fit, trend=TRUE) dge <- fit - logCPM + return(logCPM) } normalize_ruvg <- function(data, batch, spikes, k) { diff --git a/R/plot_rle.R b/R/plot_rle.R index 301a438..1477067 100644 --- a/R/plot_rle.R +++ b/R/plot_rle.R @@ -205,7 +205,8 @@ plot_rle <- function(data, barcodes = NULL, RUVg = compute_normalised_counts(data, method = "RUVg", batch = batch, spikes = spikes, num_cores = num_cores), RUVs = compute_normalised_counts(data, method = "RUVs", batch = batch, num_cores = num_cores), RUVr = compute_normalised_counts(data, method = "RUVr", batch = batch, num_cores = num_cores), - zinb = compute_normalised_counts(data, method = "zinb", batch = batch, num_cores = num_cores), + limma_voom_zinb = compute_normalised_counts(data, method = "zinb", batch = batch, num_cores = num_cores), + edgeR_zinb = compute_normalised_counts(data, method = "zinb", batch = batch, num_cores = num_cores), stop("Unsupported normalization method.") ) From 2106dfe02d07067e533dbfd687ed47d0b8fc18b3 Mon Sep 17 00:00:00 2001 From: nbartonicek Date: Sun, 19 Oct 2025 17:22:48 +1100 Subject: [PATCH 02/16] Fix normalisation and plot_rle --- R/compute_normalised_counts.R | 61 ++++++++++++++++++++++++----------- R/plot_rle.R | 7 ++-- 2 files changed, 46 insertions(+), 22 deletions(-) diff --git a/R/compute_normalised_counts.R b/R/compute_normalised_counts.R index 60ad0ad..813a045 100644 --- a/R/compute_normalised_counts.R +++ b/R/compute_normalised_counts.R @@ -5,7 +5,8 @@ #' @param data A tidyseurat object merged with metadata. Must contain columns #' "Well_ID", "Row", "Column" #' @param method One of "raw", "logNorm", "cpm", "clr", "SCT", "DESeq2", -#' "edgeR", "RUVg", "RUVs", "RUVr", "limma_voom", "limma_trend", "zinb" +#' "edgeR", "RUVg", "RUVs", "RUVr", "limma_voom", "limma_trend", +#' "limma_voom_zinb", "edgeR_zinb" #' @param batch Either empty, a single value, or a vector corresponding to the #' number of samples #' @param k Parameter k for RUVSeq and zinb methods @@ -53,7 +54,8 @@ compute_normalised_counts <- function(data = NULL, "cpm", "clr", "SCT", "DESeq2", "edgeR", "RUVg", "RUVs", "RUVr", - "limma_voom", "limma_trend", "zinb")) { + "limma_voom", "limma_trend", "limma_voom_zinb", + "edgeR_zinb")) { stop("Your normalization method is not available.") } batch <- if (is.null(batch)) "1" else as.character(batch) @@ -180,7 +182,7 @@ compute_normalised_counts <- function(data = NULL, } if (!all(spikes %in% row.names(data@assays$RNA$counts))) { warning("Some or all of your control genes are not present in the dataset.") - spikes <- intersect(spikes, rownames(counts(set))) + spikes <- intersect(spikes, rownames(counts)) } #k defines number of sources of variation, two have been chosen for row and column set <- EDASeq::newSeqExpressionSet(counts = as.matrix(counts), @@ -236,7 +238,7 @@ compute_normalised_counts <- function(data = NULL, EDASeq::normCounts(set) } - normalize_zinb <- function(data, batch) { + normalize_limmavoom_zinb <- function(data, batch) { message("Please allow extra time for zinb mode.") if (ncol(data) > 50) { @@ -247,34 +249,54 @@ compute_normalised_counts <- function(data = NULL, cl <- makeCluster(num_cores) doParallel::registerDoParallel(cl) p <- BiocParallel::DoparParam() - system.time(zinb <- zinbwave::zinbwave(filtered_sce, K = k, + suppressWarnings(zinb <- zinbwave::zinbwave(filtered_sce, K = k, epsilon=12000, BPPARAM = p, - observationalWeights = TRUE)) + observationalWeights = TRUE, verbose = F)) counts <- zinb@assays@data$counts weights <- zinb@assays@data$weights - - # approximate denoised counts (downweighting dropouts) - #dge <- DGEList(counts = data@assays$RNA$counts, samples = coldata$condition, group = coldata$condition) - #dge <- calcNormFactors(dge, methods = "TMM") - #design <- model_matrix - #dge <- estimateDisp(dge, design, BPPARAM = p) - #dge$weights <- assay(zinb, "weights") - #fit <- glmQLFit(dge, design, BPPARAM = p) - #norm_counts <- fitted(fit) - + dge <- DGEList(counts = counts(filtered_sce), samples = coldata$condition, group = coldata$condition) dge <- calcNormFactors(dge, methods = "TMMwsp") design <- model_matrix dge <- voom(dge, design, weights = weights) normalised_values <- dge$E - - #normalised_values <- zinb@assays@data$normalizedValues stopCluster(cl) doParallel::registerDoParallel() return(normalised_values) } + normalize_edger_zinb <- function(data, batch) { + + message("Please allow extra time for zinb mode.") + if (ncol(data) > 50) { + message("zinb with over 50 samples takes a long time. Consider reducing the number of samples or genes.") + } + data_sce <- as.SingleCellExperiment(data) + filtered_sce <- subset(data_sce, rowSums(as.data.frame(counts(data_sce))) > 0) + cl <- makeCluster(num_cores) + doParallel::registerDoParallel(cl) + p <- BiocParallel::DoparParam() + suppressMessages(zinb <- zinbwave::zinbwave(filtered_sce, K = k, + epsilon=12000, + BPPARAM = p, + observationalWeights = TRUE)) + counts <- zinb@assays@data$counts + weights <- zinb@assays@data$weights + + # approximate denoised counts (downweighting dropouts) + dge <- DGEList(counts = counts, samples = coldata$condition, group = coldata$condition) + dge <- calcNormFactors(dge, methods = "TMM") + design <- model_matrix + dge <- estimateDisp(dge, design, BPPARAM = p) + dge$weights <- weights + fit <- glmQLFit(dge, design, BPPARAM = p) + normalised_values <- fitted(fit) + stopCluster(cl) + doParallel::registerDoParallel() + return(normalised_values) + } + # Main function logic validated <- validate_inputs(data, method, batch, k, max_counts, num_cores) data <- validated$data @@ -297,7 +319,8 @@ compute_normalised_counts <- function(data = NULL, RUVg = normalize_ruvg(data, batch, spikes, k), RUVs = normalize_ruvs(data, batch, k), RUVr = normalize_ruvr(data, batch, k), - zinb = normalize_zinb(data, batch), + limma_voom_zinb = normalize_limmavoom_zinb(data, batch), + edgeR_zinb = normalize_edger_zinb(data, batch), stop("Unsupported normalization method.") ) return(norm_data) diff --git a/R/plot_rle.R b/R/plot_rle.R index 1477067..3395532 100644 --- a/R/plot_rle.R +++ b/R/plot_rle.R @@ -15,7 +15,8 @@ #' @param batch Either empty, a single value, or a vector corresponding to the #' number of samples. #' @param normalisation One of "raw", "logNorm", "cpm", "clr", "SCT", "DESeq2", -#' "edgeR", "RUVg", "RUVs", "RUVr", "limma_voom", "limma_trend", "zinb". If empty, defaults to raw. +#' "edgeR", "RUVg", "RUVs", "RUVr", "limma_voom", "limma_trend", "limma_voom_zinb", +#' "edgeR_zinb". If empty, defaults to raw. #' @param spikes List of genes to use as spike controls in RUVg #' @param num_cores Number of cores for edgeR and zinb calculations @@ -205,8 +206,8 @@ plot_rle <- function(data, barcodes = NULL, RUVg = compute_normalised_counts(data, method = "RUVg", batch = batch, spikes = spikes, num_cores = num_cores), RUVs = compute_normalised_counts(data, method = "RUVs", batch = batch, num_cores = num_cores), RUVr = compute_normalised_counts(data, method = "RUVr", batch = batch, num_cores = num_cores), - limma_voom_zinb = compute_normalised_counts(data, method = "zinb", batch = batch, num_cores = num_cores), - edgeR_zinb = compute_normalised_counts(data, method = "zinb", batch = batch, num_cores = num_cores), + limma_voom_zinb = compute_normalised_counts(data, method = "limma_voom_zinb", batch = batch, num_cores = num_cores), + edgeR_zinb = compute_normalised_counts(data, method = "edgeR_zinb", batch = batch, num_cores = num_cores), stop("Unsupported normalization method.") ) From 4e04dc59a9ba5914706c84f155007334061949df Mon Sep 17 00:00:00 2001 From: Xin Liu Date: Mon, 13 Oct 2025 17:04:34 +1100 Subject: [PATCH 03/16] fix DESeq2 batch factor not in the design --- R/compute_single_de.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/compute_single_de.R b/R/compute_single_de.R index ea1f600..3d7043b 100644 --- a/R/compute_single_de.R +++ b/R/compute_single_de.R @@ -157,7 +157,8 @@ compute_single_de <- function(data = NULL, combined_id <- data$combined_id dds <- DESeqDataSetFromMatrix(countData = data@assays$RNA$counts, colData = pheno_data, - design = ~ condition) + # design = ~ condition) + design = if (length(batch) == 1) ~ condition else ~ condition + batch) dds <- DESeq(dds) res <- results(dds, contrast = c("condition", treatment_samples, control_samples)) top_table <- as.data.frame(res) %>% From 7ba5119f770204d989e5b6ba75110146593e4ff3 Mon Sep 17 00:00:00 2001 From: Xin Liu Date: Tue, 21 Oct 2025 10:04:52 +1100 Subject: [PATCH 04/16] add all revision vignettes --- vignettes/DRUGseq_DE.Rmd | 1751 +++++++++ vignettes/DRUGseq_DE.html | 4347 +++++++++++++++++++++++ vignettes/DRUGseq_first500runs.Rmd | 652 ++++ vignettes/DRUGseq_first500runs.html | 2176 ++++++++++++ vignettes/DRUGseq_first500runs.knit.log | 2 + vignettes/benchmark_DMSOruntime.Rmd | 476 +++ vignettes/benchmark_DMSOruntime.html | 2016 +++++++++++ vignettes/benchmark_DMSOselection.Rmd | 422 +++ vignettes/benchmark_DMSOselection.html | 2017 +++++++++++ vignettes/check_zero_inflation.Rmd | 172 +- vignettes/check_zero_inflation.html | 2051 +++++++++++ vignettes/model_selection.Rmd | 177 + 12 files changed, 16257 insertions(+), 2 deletions(-) create mode 100644 vignettes/DRUGseq_DE.Rmd create mode 100644 vignettes/DRUGseq_DE.html create mode 100644 vignettes/DRUGseq_first500runs.Rmd create mode 100644 vignettes/DRUGseq_first500runs.html create mode 100644 vignettes/DRUGseq_first500runs.knit.log create mode 100644 vignettes/benchmark_DMSOruntime.Rmd create mode 100644 vignettes/benchmark_DMSOruntime.html create mode 100644 vignettes/benchmark_DMSOselection.Rmd create mode 100644 vignettes/benchmark_DMSOselection.html create mode 100644 vignettes/check_zero_inflation.html create mode 100644 vignettes/model_selection.Rmd diff --git a/vignettes/DRUGseq_DE.Rmd b/vignettes/DRUGseq_DE.Rmd new file mode 100644 index 0000000..004dc20 --- /dev/null +++ b/vignettes/DRUGseq_DE.Rmd @@ -0,0 +1,1751 @@ +--- +title: "Benchmarking" +output: + rmarkdown::html_document: + theme: flatly + toc: true + toc_float: true + toc_depth: 5 +vignette: > + %\VignetteIndexEntry{benchmarking} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} + %\VignetteBuild{true} +--- + +```{r set_wd, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +dir <- "/Users/liuxin/macpie_Dev/" +devtools::load_all(paste0(dir, "macpie/")) + +``` + + +```{r setup} +library(macpie) +library(tibble) +library(stringr) +library(pheatmap) +library(ggiraph) +library(tidyseurat) +library(purrr) +library(DT) +library(ggrepel) +library(ggalluvial) +library(ggplot2) +library(ggvenn) +library(limma) +``` + + +## DRUGseq results + + +```{r} +# batch24 <- readRDS(paste0(dir,"DRUGseqData/Exp_batch24.Rds")) +``` + +```{r} +# load(paste0(dir,"DRUGseqData/de.RData")) +# batch24_de <- de_res%>%filter(batch_id=="24") +# saveRDS(batch24_de, file = paste0(dir,"DRUGseqData/DE_batch24.Rds")) +batch24_de <- readRDS(paste0(dir,"DRUGseqData/DE_batch24.Rds")) +head(batch24_de) + +``` + +```{r} + +FF86_res <- batch24_de %>% filter(cmpd_sample_id=="FF-86-NH56") +ff86_res_toptable <- FF86_res[,13:18] +ff86_res_toptable <- ff86_res_toptable %>% + separate(gene.ID, into = c("gene", "chrom"), sep = ",") %>% + select(-chrom) %>% mutate(combined_id ="FF_86_NH56_10") %>% + rename(log2FC=logFC, p_value_adj = adj.P.Val) + +``` + + +```{r} +head(ff86_res_toptable) +``` + +```{r, fig.height=6} +plot_volcano(ff86_res_toptable, max.overlaps =5) +``` + +```{r} +ff86_res_toptable %>% filter(p_value_adj <0.05 & log2FC >0) %>% nrow() + +drugseq_deg <- ff86_res_toptable %>% filter(p_value_adj <0.05 & log2FC >0) %>% select(gene) %>% pull() +``` + +```{r} +#show histogram of p values +ggplot(ff86_res_toptable, aes(x = P.Value)) + + geom_histogram(binwidth = 0.01, fill = "blue", color = "black", alpha = 0.7) + + labs(title = "Histogram of Adjusted P-values", x = "Adjusted P-value", y = "Frequency") + + theme_minimal() +``` + + + +## macpie results + +## All DMSO wells + +### Load filered data + + +```{r filter_genes_batch24} +# mac_filtered <- filter_genes_by_expression(as_mac, +# group_by = "combined_id", min_counts = 10, +# min_samples = min_sample_num) + +# saveRDS(mac_filtered, +# file = paste0(dir, "DRUGseqData/macpie_filtered_batch24.Rds")) + + +mac_filtered <- readRDS(paste0(dir, "/DRUGseqData/macpie_filtered_batch24.Rds")) +``` + + + + + +```{r violin_plots_batch24, fig.width=10, fig.height=6} +mac_filtered[["percent.mt"]] <- PercentageFeatureSet(mac_filtered, pattern = "^mt-|^MT-") +mac_filtered[["percent.ribo"]] <- PercentageFeatureSet(mac_filtered, pattern = "^Rp[slp][[:digit:]]|^Rpsa|^RP[SLP][[:digit:]]|^RPSA") +``` + + +```{r} +mac_filtered$combined_id <- str_replace_all(mac_filtered$combined_id, "-","_") +``` + + +#### Correction of batch effect + +According to DRUGseq metadata: + + - Wells with water are labelled as EC-27-RY89 + + - Wells with DMSO are labelled as CB-43-EP73 + + + + + + + + + + + + + + + + + + + + + + +*Note:* instead of discussing which correction methods we should use for this data, we only show the ways we detected and corrected batch effect here. As batch effect adjustment for sequencing data has been implemented in different methods, such as DESeq2, RUVSeq, edgeR. We highly recommend users thoroughly checking any batch effects and exploring different methods. + +In the next part of the vignette, we demonstrate a `batch` parameter has implemented in the differential expression test for batch correction. + + + +```{r} +##Limma trend + +# data <- mac_badDSMOremoved +# treatment_samples <- "FF_86_NH56_10" +# control_samples <- "CB_43_EP73_0" + + limma.trend<- function(data = data, treatment_samples, control_samples){ + data <- data[, grepl(paste0(treatment_samples, "|", control_samples), data$combined_id)] + batch <- data$orig.ident +if (length(unique(data$combined_id)) < 2) { + stop("Insufficient factors for differential expression analysis.") + } +pheno_data <- data.frame(batch = as.factor(batch), condition = as.factor(data$combined_id)) +combined_id <- data$combined_id +model_matrix <- if (length(batch) == 1) model.matrix(~0 + combined_id) else + model.matrix(~0+combined_id + batch) + + dge <- DGEList(counts = data@assays$RNA$counts, + samples = pheno_data$condition, + group = pheno_data$condition) + dge <- calcNormFactors(dge, method="TMMwsp") + # keep <- edgeR::filterByExpr(dge, design = model_matrix) + # dge <- dge[keep, , keep.lib.sizes = FALSE] + # clean.TMM<-log2(edgeR::cpm(dge, normalized.lib.sizes=T,log=F)+1) + clean.TMM <- cpm(dge, log=TRUE, prior.count=3) + myargs <- list(paste0("combined_id", + treatment_samples, "-", + paste0("combined_id", control_samples)), + levels = model_matrix) + contrasts <- do.call(makeContrasts, myargs) + + lmfit <- lmFit(clean.TMM, model_matrix) + lmfit <- contrasts.fit(lmfit, contrasts) + lmfit <- eBayes(lmfit, trend = TRUE) + #Warning: Zero sample variances detected, have been offset away from zero + top_table<- topTable(lmfit, number = nrow(data)) %>% + as.data.frame() %>% + select("logFC", "t", "P.Value", "adj.P.Val") %>% + rename("log2FC" = "logFC", "metric" = "t", "p_value" = "P.Value", "p_value_adj" = "adj.P.Val")%>% + rownames_to_column("gene") + + return(top_table)} + + # top_table%>%filter(p_value_adj < 0.05 & log2FC>0) +``` + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +```{r} +# make a function of alluvial plot with ggalluvial to compare the ranking of DEGs from macpie and DRUGseq +plot_alluvial <- function(macpie_tbl, drugseq_tbl){ + df <- macpie_tbl %>% + transmute(gene, p_value_adj_mac = p_value_adj, log2FC_mac = log2FC) %>% + inner_join( + drugseq_tbl %>% transmute(gene, p_value_adj_drg = p_value_adj, log2FC_drg = log2FC), + by = "gene" + ) + +# Rank within each method by (padj, |log2FC|), tie-safe +rank_by_two <- function(padj, lfc) { + ord <- order(padj, -abs(lfc), na.last = TRUE) + r <- integer(length(padj)) + r[ord] <- seq_along(ord) + r +} +df <- df %>% + mutate( + rank_mac = rank_by_two(p_value_adj_mac, log2FC_mac), + rank_drg = rank_by_two(p_value_adj_drg, log2FC_drg) + ) + +# Define rank bins +cuts <- c(0, 25, 50, 100, 200, 300, 500, Inf) +labs <- c("Top25","26–50","51–100","101–200","201–300", "301-500",">500") + +df <- df %>% + mutate( + mac_bin = cut(rank_mac, breaks = cuts, labels = labs, right = TRUE, include.lowest = TRUE), + drg_bin = cut(rank_drg, breaks = cuts, labels = labs, right = TRUE, include.lowest = TRUE) + ) + +# Ranking movement category (color) +bin_index <- function(x) match(x, labs) # lower index = "more top" +df <- df %>% + mutate( + mac_idx = bin_index(mac_bin), + drg_idx = bin_index(drg_bin), + movement = case_when( + is.na(mac_idx) | is.na(drg_idx) ~ NA_character_, + mac_idx < drg_idx ~ "Higher rank in macpie", # moved up into a more-top bin + mac_idx > drg_idx ~ "Lower rank in macpie", # moved down + TRUE ~ "Same rank" + ), + movement = factor(movement, levels = c("Higher rank in macpie","Same rank","Lower rank in macpie")) + ) %>% + tidyr::drop_na(mac_bin, drg_bin, movement) + +# Aggregate flows for alluvial +flows <- df %>% + count(mac_bin, drg_bin, movement, name = "n") %>% + mutate( + mac_bin = factor(mac_bin, levels = labs), + drg_bin = factor(drg_bin, levels = labs) + ) + +ggplot(flows, aes(axis1 = mac_bin, axis2 = drg_bin, y = n)) + + geom_alluvium(aes(fill = movement), alpha = 0.75, width = 0.14, knot.pos = 0.4) + + geom_stratum(width = 0.14, color = "grey30") + + geom_text(stat = "stratum", aes(label = after_stat(stratum)), size = 3) + + scale_x_discrete(limits = c("Macpie rank", "DRUGseq rank"), expand = c(.05, .05)) + + scale_fill_manual(values = c("Higher rank in macpie" = "orange", + "Same rank" = "#7570b3", + "Lower rank in macpie" = "grey60")) + + labs(x = NULL, y = "Gene count", + fill = "Movement vs DRUGseq", + subtitle = "Only DEGs with padj < 0.05 & log2FC>0 (both methods)") + + theme_classic() + +} +``` + + +#### Differential gene expression + + +In here, you can specify a single condition in the combined_id column and compare with DMSO (i.e.CB_43_EP73_0). By using the plate IDs in the column of orig.ident as the input for batch parameter, `compute_singe_de` function can perform differential expression analysis using the preferred method (limma voom in this example) with batch information. + + + +```{r} +methods <- c("limma_voom", "DESeq2", "edgeR", "limma_trend") + +methods_res <- lapply(methods, function(m){ + + message("\n","Processing method: ", m,"\n") + # m<-"limma_voom" + treatment_samples <- "FF_86_NH56_10" + control_samples <- "CB_43_EP73_0" + subset <- mac_filtered%>%filter(combined_id%in%c(treatment_samples,control_samples)) + + batch <- subset$orig.ident + if (m!="limma_trend"){ + subset <- filter_genes_by_expression(subset, + group_by = "combined_id", min_counts = 5, + min_samples = 1) + top_table <- compute_single_de(subset, treatment_samples, control_samples, method = m, batch = batch) + } else{ + top_table <- limma.trend(data = subset, treatment_samples = treatment_samples, control_samples = control_samples) + } + + # plot(plot_volcano(top_table, max.overlaps = 5)) + alldmso_degs <- top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull() + length(intersect(alldmso_degs, drugseq_deg)) + + top_table <- top_table %>% + arrange(p_value_adj, desc(log2FC)) %>% + mutate(gene = factor(gene, levels = unique(gene))) + # add a column if there are in the intersect(alldmso_degs, drugseq_deg) + top_table <- top_table %>% + mutate(in_drugseq_deg = ifelse(gene %in% intersect(alldmso_degs, drugseq_deg), "yes", "no")) + + plt_ecdf <- ggplot(top_table, aes(x = p_value_adj, color = in_drugseq_deg)) + + stat_ecdf(size = 1) + + scale_x_continuous(trans = "log10", breaks = c(1e-6,1e-4,5e-2)) + + labs(x = "Adjusted p-value (log10 scale)", y = "ECDF", + color = "In DRUGseq DEGs") + + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + + theme_classic() + + + ks_test_results <- ks.test(top_table$p_value_adj[top_table$in_drugseq_deg=="yes"], + top_table$p_value_adj[top_table$in_drugseq_deg=="no"], alternative = "greater") + + # ggplot(top_table, aes(x = log2FC, y = -log10(p_value_adj), color = in_drugseq_deg)) + + # geom_point(alpha = 0.6, size = 1.2) + + # geom_hline(yintercept = -log10(0.05), linetype = "dashed") + + # geom_vline(xintercept = c(-1, 1), linetype = "dashed") + + # scale_color_manual(values = c("no"="#bdbdbd","yes"="#2b8cbe")) + + # labs(x = "log2FC", y = expression(-log[10]("adj p")), color = "In DRUGseq DEGs") + + # theme_classic() + # + # label a few top overlapping genes + lab_genes <- top_table[top_table$in_drugseq_deg=="yes", ] |> + dplyr::arrange(p_value_adj, dplyr::desc(log2FC)) + + volcano_overlap <- ggplot(top_table, aes(x = log2FC, y = -log10(p_value_adj), color = in_drugseq_deg)) + + geom_point(alpha = 0.6, size = 1.2) + + geom_text_repel(data = lab_genes, aes(label = gene), size = 3, max.overlaps = 50) + + scale_color_manual(values = c("no"="#bdbdbd","yes"="#2b8cbe"))+ + theme_classic() + + ks <- c(25,50,100,200,500,1000) + prec_at_k <- sapply(ks, function(k) + mean(top_table$in_drugseq_deg[1:k] == "yes") + ) + # plot(ks, prec_at_k, type = "b", xlab = "k (top-k by macpie)", + # ylab = "Precision@k (fraction in DRUGseq set)") + + ks_plot <- ggplot(data.frame(k=ks, prec=prec_at_k), aes(x=k, y=prec)) + + geom_point() + geom_line() + + labs(x = "k (top-k by macpie)", y = "Precision@k (fraction in DRUGseq set)")+ + theme_classic() + + # alluvial plot + macpie_tbl <- top_table %>% filter(p_value_adj < 0.05 & log2FC>0) + drugseq_tbl <- ff86_res_toptable %>% filter(p_value_adj < 0.05 & log2FC>0) + alluvial_plot <- plot_alluvial(macpie_tbl = macpie_tbl, drugseq_tbl = drugseq_tbl) + + #return + result_list <- list(top_table = top_table, + num_degs_macpie = length(alldmso_degs), + n_overlap = length(intersect(alldmso_degs, drugseq_deg)), + ecdf_plot = plt_ecdf, + ks_test_results = ks_test_results, + volcano_plot = volcano_overlap, + ks_plot = ks_plot, + alluvial_plot = alluvial_plot) + return(result_list) + +}) + +names(methods_res) <- methods + +``` + + +#### Summary table + + +```{r} +#get a table to show number of DEGs and number of overlapping genes with DRUGseq for each method +deg_summary <- map_df(methods_res, function(x) { + data.frame( + num_degs_macpie = x$num_degs_macpie, + n_overlap = x$n_overlap, + num_degs_DRUGseq = length(drugseq_deg) + ) +}, .id = paste0("macpie_methods")) + +deg_summary +``` + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +### Overlapping volcano plot + +```{r} +methods_res$limma_voom$volcano_plot +methods_res$DESeq2$volcano_plot +methods_res$edgeR$volcano_plot +methods_res$limma_trend$volcano_plot +``` + + + + + + + + + + + + + +### Alluvial plot + +```{r} +methods_res$limma_voom$alluvial_plot +methods_res$DESeq2$alluvial_plot +methods_res$edgeR$alluvial_plot +methods_res$limma_trend$alluvial_plot +``` + + +```{r} +#append top_table from each method into a single data frame +all_methods_top_table <- map_df(methods_res, "top_table", .id = "method") +head(all_methods_top_table) + +#if in_drugseq_deg is "no", change method to "DRUGseq" +all_methods_top_table <- all_methods_top_table %>% + mutate(method = ifelse(in_drugseq_deg == "no", "not in DRUGseq", method)) + +#rename methods +all_methods_top_table$method <- recode(all_methods_top_table$method, + "limma_voom" = "macpie:limma_voom", + "DESeq2" = "macpie:DESeq2", + "edgeR" = "macpie:edgeR", + "limma_trend" = "macpie:limma_trend") + + +ggplot(all_methods_top_table, aes(x = p_value_adj, color = method)) + + stat_ecdf(size = 1) + + scale_x_continuous(trans = "log10", breaks = c(1e-6,1e-4,5e-2)) + + labs(x = "Adjusted p-value (log10 scale)", y = "ECDF", + color = "In DRUGseq DEGs") + + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + + theme_classic() + + +``` + + +```{r} +rm(top_table, alldmso_degs, deg_summary) +``` + + + + +## Good DMSO - top 5 from macpie + +From public DRUG-seq analysis pipeline, authors identified two reference controls: all DMSO wells and the ‘good DMSO’ wells. + +We know these good DMSO wells for batch 24 from their published data: + + - VH02012942: I23, M23 + + - VH02012944: D23, H23 + + - VH02012956: I23, J23 + + +```{r} +batch24 <- readRDS(paste0(dir,"DRUGseqData/Exp_batch24.Rds")) +names(batch24) +``` + +```{r metdata} +#make a combined metadata for three plates +batch24_metadata <- batch24 %>% + map_dfr(~ { + .x$Annotation %>% + mutate( + Plate_ID = plate_barcode, + Well_ID = well_id, + Barcode = paste0(plate_barcode, "_", well_index), + Row = LETTERS[row], + Column = as.integer(col), + Species = "human", + Cell_type = "U2OS", + Model_type = "2D_adherent", + Time = as.factor(hours_post_treatment), + Unit = "h", + Treatment_1 = cmpd_sample_id, + Concentration_1 = as.numeric(concentration), + Unit_1 = unit, + Sample_type = if_else(well_type == "SA" & col == 24, + "Positive Control", + well_type) + ) + }) + + +batch24_metadata <- batch24_metadata%>%select(-c(batch_id, plate_barcode,plate_index, well_id, + well_index, col, row, biosample_id, external_biosample_id, + cmpd_sample_id, well_type, cell_line_name, cell_line_ncn, concentration, unit, hours_post_treatment, Sample)) +``` + + + +```{r count_matrix} +# create a combined UMI matrix for 3 plates +batch24_counts <- batch24 %>% + map(~ { + .x$UMI.counts %>% + as.data.frame() %>% + rownames_to_column("gene_id") %>% + separate(col = gene_id, into = c("gene_name", "chrom"), sep = ",") %>% + mutate(gene_name = make.unique(gene_name)) %>% + select(-chrom) %>% + tibble::column_to_rownames(var = "gene_name") %>% + as.matrix() + }) + +binded_counts <- do.call(cbind, batch24_counts) + + + +``` + + + +```{r load_data} +as_mac <- CreateSeuratObject(counts = binded_counts, + min.cells = 1, + min.features = 1) + +as_mac<- as_mac%>% inner_join(batch24_metadata, by = c(".cell"="Barcode")) +``` + + +Filtering steps + +Include only good DMSO wells as controls + +```{r} + +as_mac$combined_id <- paste0(as_mac$Treatment_1,"_", as_mac$Concentration_1) + +badDMSO <- as_mac@meta.data %>% filter(Treatment_1 == "CB-43-EP73") %>% + filter((Plate_ID == "VH02012942" & !(Well_ID %in% c("I23", "M23", "K23", "J23","C23"))) | + (Plate_ID == "VH02012944" & !(Well_ID %in% c("I23", "M23", "J23", "G23", "K23")))| + (Plate_ID == "VH02012956" & ! (Well_ID %in% c("I23", "J23", "O23","M23","K23")))) + + + +keep_wells <- setdiff(rownames(as_mac@meta.data), rownames(badDMSO)) + + +mac_badDSMOremoved <- as_mac[,keep_wells] + +mac_badDSMOremoved$combined_id <- str_replace_all(mac_badDSMOremoved$combined_id, "-","_") + + +``` + + +```{r} +min_sample_num <- min(table(mac_badDSMOremoved$combined_id)) +mac_badDSMOremoved <- filter_genes_by_expression(mac_badDSMOremoved, + group_by = "combined_id", min_counts =10, + min_samples = min_sample_num) + +``` + + +```{r} +mac_badDSMOremoved[["percent.mt"]] <- PercentageFeatureSet(mac_badDSMOremoved, pattern = "^mt-|^MT-") +mac_badDSMOremoved[["percent.ribo"]] <- PercentageFeatureSet(mac_badDSMOremoved, pattern = "^Rp[slp][[:digit:]]|^Rpsa|^RP[SLP][[:digit:]]|^RPSA") + +``` + +```{r} +p <- plot_plate_layout(mac_badDSMOremoved, "nCount_RNA", "combined_id") + facet_wrap(~orig.ident, ncol = 1) + + theme(strip.text = element_text(size=10), + axis.text.x = element_text(size=10), + axis.text.y = element_text(size=8), + legend.title = element_text(size=10), + legend.text = element_text(size=8), + trip.background = element_blank()) +girafe(ggobj = p, + fonts = list(sans = "sans"), + options = list( + opts_hover(css = "stroke:black; stroke-width:1px;") + )) +``` + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +```{r} +methods <- c("limma_voom", "DESeq2", "edgeR", "limma_trend") + +five_dmso_methods_res <- lapply(methods, function(m){ + message("Processing method: ", m) + # m<-"limma_voom" + treatment_samples <- "FF_86_NH56_10" + control_samples <- "CB_43_EP73_0" + subset <- mac_badDSMOremoved%>%filter(combined_id%in%c(treatment_samples,control_samples)) + + batch <- subset$orig.ident + if (m!="limma_trend"){ + # subset <- filter_genes_by_expression(subset, + # group_by = "combined_id", min_counts = 10, + # min_samples = 1) + badDMSO_out_top_table <- compute_single_de(subset, treatment_samples, control_samples, method = m, batch = batch) + } else{ + badDMSO_out_top_table <- limma.trend(data = subset, treatment_samples = treatment_samples, control_samples = control_samples) + } + + # plot(plot_volcano(top_table, max.overlaps = 5)) + badDMSO_out_degs <- badDMSO_out_top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull() + length(intersect(badDMSO_out_degs, drugseq_deg)) + + badDMSO_out_top_table <- badDMSO_out_top_table %>% + arrange(p_value_adj, desc(log2FC)) %>% + mutate(gene = factor(gene, levels = unique(gene))) + # add a column if there are in the intersect(alldmso_degs, drugseq_deg) + badDMSO_out_top_table <- badDMSO_out_top_table %>% + mutate(in_drugseq_deg = ifelse(gene %in% intersect(badDMSO_out_degs, drugseq_deg), "yes", "no")) + + plt_ecdf <- ggplot(badDMSO_out_top_table, aes(x = p_value_adj, color = in_drugseq_deg)) + + stat_ecdf(size = 1) + + scale_x_continuous(trans = "log10", breaks = c(1e-6,1e-4,5e-2)) + + labs(x = "Adjusted p-value (log10 scale)", y = "ECDF", + color = "In DRUGseq DEGs") + + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + + theme_classic() + + + ks_test_results <- ks.test(badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg=="yes"], + badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg=="no"], alternative = "greater") + + # ggplot(top_table, aes(x = log2FC, y = -log10(p_value_adj), color = in_drugseq_deg)) + + # geom_point(alpha = 0.6, size = 1.2) + + # geom_hline(yintercept = -log10(0.05), linetype = "dashed") + + # geom_vline(xintercept = c(-1, 1), linetype = "dashed") + + # scale_color_manual(values = c("no"="#bdbdbd","yes"="#2b8cbe")) + + # labs(x = "log2FC", y = expression(-log[10]("adj p")), color = "In DRUGseq DEGs") + + # theme_classic() + # + # label a few top overlapping genes + lab_genes <- badDMSO_out_top_table[badDMSO_out_top_table$in_drugseq_deg=="yes", ] |> + dplyr::arrange(p_value_adj, dplyr::desc(log2FC)) + + volcano_overlap <- ggplot(badDMSO_out_top_table, aes(x = log2FC, y = -log10(p_value_adj), color = in_drugseq_deg)) + + geom_point(alpha = 0.6, size = 1.2) + + geom_text_repel(data = lab_genes, aes(label = gene), size = 3, max.overlaps = 10) + + scale_color_manual(values = c("no"="#bdbdbd","yes"="#2b8cbe"))+ + theme_classic() + + ks <- c(25,50,100,200,500,1000) + prec_at_k <- sapply(ks, function(k) + mean(badDMSO_out_top_table$in_drugseq_deg[1:k] == "yes") + ) + # plot(ks, prec_at_k, type = "b", xlab = "k (top-k by macpie)", + # ylab = "Precision@k (fraction in DRUGseq set)") + + ks_plot <- ggplot(data.frame(k=ks, prec=prec_at_k), aes(x=k, y=prec)) + + geom_point() + geom_line() + + labs(x = "k (top-k by macpie)", y = "Precision@k (fraction in DRUGseq set)")+ + theme_classic() + + # alluvial plot + macpie_tbl <- badDMSO_out_top_table %>% filter(p_value_adj < 0.05 & log2FC>0) + drugseq_tbl <- ff86_res_toptable %>% filter(p_value_adj < 0.05 & log2FC>0) + alluvial_plot <- plot_alluvial(macpie_tbl = macpie_tbl, drugseq_tbl = drugseq_tbl) + + #return + result_list <- list(top_table = badDMSO_out_top_table, + num_degs_macpie = length(badDMSO_out_degs), + n_overlap = length(intersect(badDMSO_out_degs, drugseq_deg)), + ecdf_plot = plt_ecdf, + ks_test_results = ks_test_results, + volcano_plot = volcano_overlap, + ks_plot = ks_plot, + alluvial_plot = alluvial_plot) + return(result_list) + +}) + +names(five_dmso_methods_res) <- methods + +``` + +### Summary table + +```{r} +#get a table to show number of DEGs and number of overlapping genes with DRUGseq for each method +deg_summary <- map_df(five_dmso_methods_res, function(x) { + data.frame( + num_degs_macpie = x$num_degs_macpie, + n_overlap = x$n_overlap, + num_degs_DRUGseq = length(drugseq_deg) + ) +}, .id = paste0("macpie_methods")) + +deg_summary +``` + + + + +### ECDF plot + +```{r} +#append top_table from each method into a single data frame +good_dmso_top_table <- map_df(five_dmso_methods_res, "top_table", .id = "method") +head(good_dmso_top_table) + +#if in_drugseq_deg is "no", change method to "DRUGseq" +good_dmso_top_table<- good_dmso_top_table %>% + mutate(method = ifelse(in_drugseq_deg == "no", "not_in_DRUGseq", method)) + +#rename methods +good_dmso_top_table$method <- recode(good_dmso_top_table$method, + "limma_voom" = "macpie:limma_voom", + "DESeq2" = "macpie:DESeq2", + "edgeR" = "macpie:edgeR", + "limma_trend" = "macpie:limma_trend") + + +ggplot(good_dmso_top_table, aes(x = p_value_adj, color = method)) + + stat_ecdf(size = 1) + + scale_x_continuous(trans = "log10", breaks = c(1e-6,1e-4,5e-2)) + + labs(x = "Adjusted p-value (log10 scale)", y = "ECDF", + color = "In DRUGseq DEGs") + + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + + theme_classic() + + +``` + + + + + + + + +### Overlapping volcano plot + +```{r} +five_dmso_methods_res$limma_voom$volcano_plot +five_dmso_methods_res$DESeq2$volcano_plot +five_dmso_methods_res$edgeR$volcano_plot +five_dmso_methods_res$limma_trend$volcano_plot +``` + + + + + + + + + + + + + +### Alluvial plot + +```{r} +five_dmso_methods_res$limma_voom$alluvial_plot +five_dmso_methods_res$DESeq2$alluvial_plot +five_dmso_methods_res$edgeR$alluvial_plot +five_dmso_methods_res$limma_trend$alluvial_plot +``` + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +## Good DMSO - DRUGseq + +From public DRUG-seq analysis pipeline, authors identified two reference controls: all DMSO wells and the ‘good DMSO’ wells. + +We know these good DMSO wells for batch 24 from their published data: + + - VH02012942: I23, M23 + + - VH02012944: D23, H23 + + - VH02012956: I23, J23 + + +```{r} +batch24 <- readRDS(paste0(dir,"DRUGseqData/Exp_batch24.Rds")) +names(batch24) +``` + +```{r metdata_for_batch24} +#make a combined metadata for three plates +batch24_metadata <- batch24 %>% + map_dfr(~ { + .x$Annotation %>% + mutate( + Plate_ID = plate_barcode, + Well_ID = well_id, + Barcode = paste0(plate_barcode, "_", well_index), + Row = LETTERS[row], + Column = as.integer(col), + Species = "human", + Cell_type = "U2OS", + Model_type = "2D_adherent", + Time = as.factor(hours_post_treatment), + Unit = "h", + Treatment_1 = cmpd_sample_id, + Concentration_1 = as.numeric(concentration), + Unit_1 = unit, + Sample_type = if_else(well_type == "SA" & col == 24, + "Positive Control", + well_type) + ) + }) + + +batch24_metadata <- batch24_metadata%>%select(-c(batch_id, plate_barcode,plate_index, well_id, + well_index, col, row, biosample_id, external_biosample_id, + cmpd_sample_id, well_type, cell_line_name, cell_line_ncn, concentration, unit, hours_post_treatment, Sample)) +``` + + + +```{r count_matrix_for_batch24} +# create a combined UMI matrix for 3 plates +batch24_counts <- batch24 %>% + map(~ { + .x$UMI.counts %>% + as.data.frame() %>% + rownames_to_column("gene_id") %>% + separate(col = gene_id, into = c("gene_name", "chrom"), sep = ",") %>% + mutate(gene_name = make.unique(gene_name)) %>% + select(-chrom) %>% + tibble::column_to_rownames(var = "gene_name") %>% + as.matrix() + }) + +binded_counts <- do.call(cbind, batch24_counts) + + + +``` + + + +```{r load_data_batch24} +as_mac <- CreateSeuratObject(counts = binded_counts, + min.cells = 1, + min.features = 1) + +as_mac<- as_mac%>% inner_join(batch24_metadata, by = c(".cell"="Barcode")) +``` + + +Filtering steps + +Include only good DMSO wells as controls + +```{r} + +as_mac$combined_id <- paste0(as_mac$Treatment_1,"_", as_mac$Concentration_1) + +badDMSO <- as_mac@meta.data %>% filter(Treatment_1 == "CB-43-EP73") %>% + filter((Plate_ID == "VH02012942" & !(Well_ID %in% c("I23", "M23"))) | + (Plate_ID == "VH02012944" & !(Well_ID %in% c("D23", "H23")))| + (Plate_ID == "VH02012956" & ! (Well_ID %in% c("I23", "J23")))) + + + +keep_wells <- setdiff(rownames(as_mac@meta.data), rownames(badDMSO)) + + +mac_badDSMOremoved <- as_mac[,keep_wells] + +mac_badDSMOremoved$combined_id <- str_replace_all(mac_badDSMOremoved$combined_id, "-","_") + + +``` + + +```{r} +min_sample_num <- min(table(mac_badDSMOremoved$combined_id)) +mac_badDSMOremoved <- filter_genes_by_expression(mac_badDSMOremoved, + group_by = "combined_id", min_counts =10, + min_samples = min_sample_num) + +``` + + +```{r} +mac_badDSMOremoved[["percent.mt"]] <- PercentageFeatureSet(mac_badDSMOremoved, pattern = "^mt-|^MT-") +mac_badDSMOremoved[["percent.ribo"]] <- PercentageFeatureSet(mac_badDSMOremoved, pattern = "^Rp[slp][[:digit:]]|^Rpsa|^RP[SLP][[:digit:]]|^RPSA") + +``` + + + + + + + + +```{r} +methods <- c("limma_voom", "DESeq2", "edgeR", "limma_trend") + +good_dmso_methods_res <- lapply(methods, function(m){ + message("Processing method: ", m) + # m<-"limma_voom" + treatment_samples <- "FF_86_NH56_10" + control_samples <- "CB_43_EP73_0" + subset <- mac_badDSMOremoved%>%filter(combined_id%in%c(treatment_samples,control_samples)) + + batch <- subset$orig.ident + if (m!="limma_trend"){ + # subset <- filter_genes_by_expression(subset, + # group_by = "combined_id", min_counts = 10, + # min_samples = 1) + badDMSO_out_top_table <- compute_single_de(subset, treatment_samples, control_samples, method = m, batch = batch) + } else{ + badDMSO_out_top_table <- limma.trend(data = subset, treatment_samples = treatment_samples, control_samples = control_samples) + } + + # plot(plot_volcano(top_table, max.overlaps = 5)) + badDMSO_out_degs <- badDMSO_out_top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull() + length(intersect(badDMSO_out_degs, drugseq_deg)) + + badDMSO_out_top_table <- badDMSO_out_top_table %>% + arrange(p_value_adj, desc(log2FC)) %>% + mutate(gene = factor(gene, levels = unique(gene))) + # add a column if there are in the intersect(alldmso_degs, drugseq_deg) + badDMSO_out_top_table <- badDMSO_out_top_table %>% + mutate(in_drugseq_deg = ifelse(gene %in% intersect(badDMSO_out_degs, drugseq_deg), "yes", "no")) + + plt_ecdf <- ggplot(badDMSO_out_top_table, aes(x = p_value_adj, color = in_drugseq_deg)) + + stat_ecdf(size = 1) + + scale_x_continuous(trans = "log10", breaks = c(1e-6,1e-4,5e-2)) + + labs(x = "Adjusted p-value (log10 scale)", y = "ECDF", + color = "In DRUGseq DEGs") + + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + + theme_classic() + + + ks_test_results <- ks.test(badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg=="yes"], + badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg=="no"], alternative = "greater") + + # ggplot(top_table, aes(x = log2FC, y = -log10(p_value_adj), color = in_drugseq_deg)) + + # geom_point(alpha = 0.6, size = 1.2) + + # geom_hline(yintercept = -log10(0.05), linetype = "dashed") + + # geom_vline(xintercept = c(-1, 1), linetype = "dashed") + + # scale_color_manual(values = c("no"="#bdbdbd","yes"="#2b8cbe")) + + # labs(x = "log2FC", y = expression(-log[10]("adj p")), color = "In DRUGseq DEGs") + + # theme_classic() + # + # label a few top overlapping genes + lab_genes <- badDMSO_out_top_table[badDMSO_out_top_table$in_drugseq_deg=="yes", ] |> + dplyr::arrange(p_value_adj, dplyr::desc(log2FC)) + + volcano_overlap <- ggplot(badDMSO_out_top_table, aes(x = log2FC, y = -log10(p_value_adj), color = in_drugseq_deg)) + + geom_point(alpha = 0.6, size = 1.2) + + geom_text_repel(data = lab_genes, aes(label = gene), size = 3, max.overlaps = 10) + + scale_color_manual(values = c("no"="#bdbdbd","yes"="#2b8cbe"))+ + theme_classic() + + ks <- c(25,50,100,200,500,1000) + prec_at_k <- sapply(ks, function(k) + mean(badDMSO_out_top_table$in_drugseq_deg[1:k] == "yes") + ) + # plot(ks, prec_at_k, type = "b", xlab = "k (top-k by macpie)", + # ylab = "Precision@k (fraction in DRUGseq set)") + + ks_plot <- ggplot(data.frame(k=ks, prec=prec_at_k), aes(x=k, y=prec)) + + geom_point() + geom_line() + + labs(x = "k (top-k by macpie)", y = "Precision@k (fraction in DRUGseq set)")+ + theme_classic() + + # alluvial plot + macpie_tbl <- badDMSO_out_top_table %>% filter(p_value_adj < 0.05 & log2FC>0) + drugseq_tbl <- ff86_res_toptable %>% filter(p_value_adj < 0.05 & log2FC>0) + alluvial_plot <- plot_alluvial(macpie_tbl = macpie_tbl, drugseq_tbl = drugseq_tbl) + + #return + result_list <- list(top_table = badDMSO_out_top_table, + num_degs_macpie = length(badDMSO_out_degs), + n_overlap = length(intersect(badDMSO_out_degs, drugseq_deg)), + ecdf_plot = plt_ecdf, + ks_test_results = ks_test_results, + volcano_plot = volcano_overlap, + ks_plot = ks_plot, + alluvial_plot = alluvial_plot) + return(result_list) + +}) + +names(good_dmso_methods_res) <- methods + +``` + +### Summary table + +```{r} +#get a table to show number of DEGs and number of overlapping genes with DRUGseq for each method +deg_summary <- map_df(good_dmso_methods_res, function(x) { + data.frame( + num_degs_macpie = x$num_degs_macpie, + n_overlap = x$n_overlap, + num_degs_DRUGseq = length(drugseq_deg) + ) +}, .id = paste0("macpie_methods")) + +deg_summary +``` + + + + +### ECDF plot + +```{r} +#append top_table from each method into a single data frame +good_dmso_top_table <- map_df(good_dmso_methods_res, "top_table", .id = "method") +head(good_dmso_top_table) + +#if in_drugseq_deg is "no", change method to "DRUGseq" +good_dmso_top_table<- good_dmso_top_table %>% + mutate(method = ifelse(in_drugseq_deg == "no", "not_in_DRUGseq", method)) + +#rename methods +good_dmso_top_table$method <- recode(good_dmso_top_table$method, + "limma_voom" = "macpie:limma_voom", + "DESeq2" = "macpie:DESeq2", + "edgeR" = "macpie:edgeR", + "limma_trend" = "macpie:limma_trend") + + +ggplot(good_dmso_top_table, aes(x = p_value_adj, color = method)) + + stat_ecdf(size = 1) + + scale_x_continuous(trans = "log10", breaks = c(1e-6,1e-4,5e-2)) + + labs(x = "Adjusted p-value (log10 scale)", y = "ECDF", + color = "In DRUGseq DEGs") + + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + + theme_classic() + + +``` + + +```{r} +good_dmso_methods_res$limma_voom$ecdf_plot +good_dmso_methods_res$DESeq2$ecdf_plot +good_dmso_methods_res$edgeR$ecdf_plot +good_dmso_methods_res$limma_trend$ecdf_plot +``` +### Overlapping volcano plot + +```{r} +good_dmso_methods_res$limma_voom$volcano_plot +good_dmso_methods_res$DESeq2$volcano_plot +good_dmso_methods_res$edgeR$volcano_plot +good_dmso_methods_res$limma_trend$volcano_plot +``` + + + + + + + + + + + +### Alluvial plot + +```{r} +good_dmso_methods_res$limma_voom$alluvial_plot +good_dmso_methods_res$DESeq2$alluvial_plot +good_dmso_methods_res$edgeR$alluvial_plot +good_dmso_methods_res$limma_trend$alluvial_plot +``` + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +## Comparison + +To compare DEGs with different replicate numbers and different methods + +```{r} +methods <- c("limma_voom", "DESeq2", "edgeR", "limma_trend") + +get_jaccard <- function(deg_set, drugseq_deg){ + intersection <- length(intersect(deg_set, drugseq_deg)) + union <- length(union(deg_set, drugseq_deg)) + jaccard_index <- intersection / union + return(jaccard_index) +} + +jaccard_index <- lapply(methods, function(m){ + # all dmso + degs <- methods_res[[m]]$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull() + jaccard_all <- get_jaccard(degs, drugseq_deg) + # five dmso + degs <- five_dmso_methods_res[[m]]$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull() + jaccard_five <- get_jaccard(degs, drugseq_deg) + # three dmso + degs <- good_dmso_methods_res[[m]]$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull() + jaccard_three <- get_jaccard(degs, drugseq_deg) + jaccard_index <- data.frame( + method = m, + jaccard_all = jaccard_all, + jaccard_five = jaccard_five, + jaccard_three = jaccard_three + ) + return(jaccard_index) +}) + +df <- as.data.frame(do.call(rbind, jaccard_index)) +rownames(df) <- df$method +df <- df %>% select(-method) +colnames(df) <- c("All DMSO", "macpie: 15 DMSO", "DRUGseq: 6 DMSO") +pheatmap::pheatmap(df, + cluster_rows = FALSE, + cluster_cols = FALSE, + display_numbers = TRUE, + main = "Jaccard Index between macpie DEGs and DRUGseq DEGs") + +``` + +```{r} +library(UpSetR) +all_dmso <- list( + limma_voom = methods_res$limma_voom$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), + DESeq2 = methods_res$DESeq2$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), + edgeR = methods_res$edgeR$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), + limma_trend = methods_res$limma_trend$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), + DRUGseq = drugseq_deg +) +upset(fromList(all_dmso), + nsets = 5, + order.by = "freq", + main.bar.color = "black", + sets.bar.color = "gray23", + text.scale = c(2, 2, 2, 1.5, 2, 1.5)) +``` + +```{r} +five_dmso <- list( + limma_voom = five_dmso_methods_res$limma_voom$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), + DESeq2 = five_dmso_methods_res$DESeq2$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), + edgeR = five_dmso_methods_res$edgeR$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), + limma_trend = five_dmso_methods_res$limma_trend$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), + DRUGseq = drugseq_deg +) +upset(fromList(five_dmso), + nsets = 5, + order.by = "freq", + main.bar.color = "black", + sets.bar.color = "gray23", + text.scale = c(2, 2, 2, 1.5, 2, 1.5)) +``` + + +```{r} +good_dmso <- list( + limma_voom = good_dmso_methods_res$limma_voom$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), + DESeq2 = good_dmso_methods_res$DESeq2$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), + edgeR = good_dmso_methods_res$edgeR$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), + limma_trend = good_dmso_methods_res$limma_trend$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), + DRUGseq = drugseq_deg +) +upset(fromList(good_dmso), + nsets = 5, + order.by = "freq", + main.bar.color = "black", + sets.bar.color = "gray23", + text.scale = c(2, 2, 2, 1.5, 2, 1.5)) +``` + diff --git a/vignettes/DRUGseq_DE.html b/vignettes/DRUGseq_DE.html new file mode 100644 index 0000000..a5965dd --- /dev/null +++ b/vignettes/DRUGseq_DE.html @@ -0,0 +1,4347 @@ + + + + + + + + + + + + + +Benchmarking + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+
+
+
+
+ +
+ + + + + + + +
library(macpie)
+library(tibble)
+library(stringr)
+library(pheatmap)
+library(ggiraph)
+library(tidyseurat)
+
## Loading required package: ttservice
+
## 
+## Attaching package: 'ttservice'
+
## The following objects are masked from 'package:macpie':
+## 
+##     bind_cols, bind_rows, plot_ly
+
## Loading required package: SeuratObject
+
## Loading required package: sp
+
## 
+## Attaching package: 'SeuratObject'
+
## The following objects are masked from 'package:base':
+## 
+##     intersect, t
+
## ========================================
+## tidyseurat version 0.8.0
+## If you use TIDYSEURAT in published research, please cite:
+## 
+## Mangiola et al. Interfacing Seurat with the R tidy universe. Bioinformatics 2021.
+## 
+## This message can be suppressed by:
+##   suppressPackageStartupMessages(library(tidyseurat))
+##   
+## To restore the Seurat default display use options("restore_Seurat_show" = TRUE) 
+## ========================================
+
## 
+## Attaching package: 'tidyseurat'
+
## The following object is masked from 'package:ttservice':
+## 
+##     plot_ly
+
library(purrr)
+
## 
+## Attaching package: 'purrr'
+
## The following object is masked from 'package:testthat':
+## 
+##     is_null
+
library(DT)
+
## 
+## Attaching package: 'DT'
+
## The following object is masked from 'package:SeuratObject':
+## 
+##     JS
+
## The following object is masked from 'package:macpie':
+## 
+##     JS
+
library(ggrepel)
+library(ggalluvial) 
+library(ggplot2)
+library(ggvenn)
+
## Loading required package: grid
+
library(limma)
+
## 
+## Attaching package: 'limma'
+
## The following object is masked from 'package:macpie':
+## 
+##     plotMA
+
+

DRUGseq results

+
# batch24 <- readRDS(paste0(dir,"DRUGseqData/Exp_batch24.Rds"))
+
# load(paste0(dir,"DRUGseqData/de.RData")) 
+# batch24_de <- de_res%>%filter(batch_id=="24")
+# saveRDS(batch24_de, file = paste0(dir,"DRUGseqData/DE_batch24.Rds"))
+batch24_de <- readRDS(paste0(dir,"DRUGseqData/DE_batch24.Rds"))
+head(batch24_de)
+
##   investigation_id analysis_id pipeline_run_key pipeline_group_run_key method
+## 1             2384          24               27                     27  limma
+## 2             2384          24               27                     27  limma
+## 3             2384          24               27                     27  limma
+## 4             2384          24               27                     27  limma
+## 5             2384          24               27                     27  limma
+## 6             2384          24               27                     27  limma
+##                 normalization
+## 1 quantile=0.75;TMM;log2(CPM)
+## 2 quantile=0.75;TMM;log2(CPM)
+## 3 quantile=0.75;TMM;log2(CPM)
+## 4 quantile=0.75;TMM;log2(CPM)
+## 5 quantile=0.75;TMM;log2(CPM)
+## 6 quantile=0.75;TMM;log2(CPM)
+##                                                  comparison_name_public
+## 1 SA_UD-20-ER54_10uM_24.0_batchid24 vs RC_CB-43-EP73_0uM_24.0_batchid24
+## 2 SA_UD-20-ER54_10uM_24.0_batchid24 vs RC_CB-43-EP73_0uM_24.0_batchid24
+## 3 SA_UD-20-ER54_10uM_24.0_batchid24 vs RC_CB-43-EP73_0uM_24.0_batchid24
+## 4 SA_UD-20-ER54_10uM_24.0_batchid24 vs RC_CB-43-EP73_0uM_24.0_batchid24
+## 5 SA_UD-20-ER54_10uM_24.0_batchid24 vs RC_CB-43-EP73_0uM_24.0_batchid24
+## 6 SA_UD-20-ER54_10uM_24.0_batchid24 vs RC_CB-43-EP73_0uM_24.0_batchid24
+##   cmpd_sample_id concentration unit hours_post_treatment batch_id
+## 1     UD-20-ER54            10   uM                   24       24
+## 2     UD-20-ER54            10   uM                   24       24
+## 3     UD-20-ER54            10   uM                   24       24
+## 4     UD-20-ER54            10   uM                   24       24
+## 5     UD-20-ER54            10   uM                   24       24
+## 6     UD-20-ER54            10   uM                   24       24
+##            gene.ID     logFC    adj.P.Val      P.Value   AveExpr         t
+## 1   MYL6,grch38_12  1.726500 6.467501e-43 4.413170e-47 11.768734 15.518950
+## 2   APRT,grch38_16  2.334604 2.164408e-24 2.953815e-28  8.730357 11.508262
+## 3  KRT18,grch38_12  1.157740 1.327225e-15 2.716939e-19  9.335040  9.239790
+## 4    SCD,grch38_10  1.199936 1.352286e-13 3.690990e-17  7.732632  8.636743
+## 5  TUBB6,grch38_18 -1.005931 3.723618e-11 1.270426e-14  8.083550 -7.873065
+## 6 TIMM8B,grch38_11  1.109810 3.654986e-10 1.496412e-13  7.957763  7.532087
+
FF86_res <- batch24_de %>% filter(cmpd_sample_id=="FF-86-NH56")
+ff86_res_toptable <- FF86_res[,13:18]
+ff86_res_toptable <- ff86_res_toptable %>% 
+  separate(gene.ID, into = c("gene", "chrom"), sep = ",") %>%
+  select(-chrom) %>% mutate(combined_id ="FF_86_NH56_10") %>%
+  rename(log2FC=logFC, p_value_adj = adj.P.Val)
+
head(ff86_res_toptable)
+
##         gene    log2FC   p_value_adj       P.Value     AveExpr         t
+## 1 AC122713.2  4.862957 1.397952e-191 9.539077e-196  0.04920401  41.95785
+## 2      TRIP6 -3.328079 5.156775e-166 7.037564e-170  9.40624753 -37.18794
+## 3   MIR7-3HG  8.513923 2.799813e-151 5.731449e-155  0.20792310  34.52594
+## 4       TUBB -2.898698 1.317157e-145 3.595107e-149 11.65399547 -33.50503
+## 5     CYP4F2  6.454706 7.730175e-142 2.637385e-145  0.10131583  32.82814
+## 6       HDGF -3.828482 2.967781e-140 1.215059e-143  9.05238910 -32.53773
+##     combined_id
+## 1 FF_86_NH56_10
+## 2 FF_86_NH56_10
+## 3 FF_86_NH56_10
+## 4 FF_86_NH56_10
+## 5 FF_86_NH56_10
+## 6 FF_86_NH56_10
+
plot_volcano(ff86_res_toptable, max.overlaps =5)
+
## Warning: ggrepel: 4342 unlabeled data points (too many overlaps). Consider
+## increasing max.overlaps
+

+
ff86_res_toptable %>% filter(p_value_adj <0.05 & log2FC >0) %>% nrow()
+
## [1] 1423
+
drugseq_deg <- ff86_res_toptable %>% filter(p_value_adj <0.05 & log2FC >0) %>% select(gene) %>% pull()
+
#show histogram of p values
+ggplot(ff86_res_toptable, aes(x = P.Value)) +
+  geom_histogram(binwidth = 0.01, fill = "blue", color = "black", alpha = 0.7) +
+  labs(title = "Histogram of Adjusted P-values", x = "Adjusted P-value", y = "Frequency") +
+  theme_minimal()
+

+
+
+

macpie results

+
+
+

All DMSO wells

+
+

Load filered data

+
# mac_filtered <- filter_genes_by_expression(as_mac,
+#                 group_by = "combined_id", min_counts = 10,
+#                 min_samples = min_sample_num)
+
+# saveRDS(mac_filtered,
+# file = paste0(dir, "DRUGseqData/macpie_filtered_batch24.Rds"))
+
+
+mac_filtered <- readRDS(paste0(dir, "/DRUGseqData/macpie_filtered_batch24.Rds"))
+
mac_filtered[["percent.mt"]] <- PercentageFeatureSet(mac_filtered, pattern = "^mt-|^MT-")
+mac_filtered[["percent.ribo"]] <- PercentageFeatureSet(mac_filtered, pattern = "^Rp[slp][[:digit:]]|^Rpsa|^RP[SLP][[:digit:]]|^RPSA")
+
mac_filtered$combined_id <- str_replace_all(mac_filtered$combined_id, "-","_")
+
+

Correction of batch effect

+

According to DRUGseq metadata:

+
    +
  • Wells with water are labelled as EC-27-RY89

  • +
  • Wells with DMSO are labelled as CB-43-EP73

  • +
+
# mac_filtered_cp <- mac_filtered %>% filter(combined_id %in% c("CB_43_EP73_0","FF_86_NH56_10"))
+mac_filtered_cp <- mac_filtered %>% filter(combined_id %in% c("CB_43_EP73_0"))
+plot_rle(mac_filtered_cp, label_column = "orig.ident", normalisation = "raw")+ scale_x_discrete(drop = FALSE) + 
+  theme(axis.text.x = element_text(angle = 45, hjust = 1))
+
## tidyseurat says: Key columns are missing. A data frame is returned for independent data analysis.
+
## Scale for x is already present.
+## Adding another scale for x, which will replace the existing scale.
+

+
plot_rle(mac_filtered_cp, label_column = "orig.ident", normalisation = "limma_voom")+ scale_x_discrete(drop = FALSE) + 
+  theme(axis.text.x = element_text(angle = 45, hjust = 1))
+
## tidyseurat says: Key columns are missing. A data frame is returned for independent data analysis.
+## Scale for x is already present.
+## Adding another scale for x, which will replace the existing scale.
+

+
plot_rle(mac_filtered_cp, label_column = "orig.ident", normalisation = "DESeq2")+ scale_x_discrete(drop = FALSE) + 
+  theme(axis.text.x = element_text(angle = 45, hjust = 1))
+
## tidyseurat says: Key columns are missing. A data frame is returned for independent data analysis.
+## converting counts to integer mode
+## Scale for x is already present.
+## Adding another scale for x, which will replace the existing scale.
+

+
plot_rle(mac_filtered_cp, label_column = "orig.ident", normalisation = "edgeR")+ scale_x_discrete(drop = FALSE) + 
+  theme(axis.text.x = element_text(angle = 45, hjust = 1))
+
## tidyseurat says: Key columns are missing. A data frame is returned for independent data analysis.
+## Scale for x is already present.
+## Adding another scale for x, which will replace the existing scale.
+

+

Note: instead of discussing which correction methods we +should use for this data, we only show the ways we detected and +corrected batch effect here. As batch effect adjustment for sequencing +data has been implemented in different methods, such as DESeq2, RUVSeq, +edgeR. We highly recommend users thoroughly checking any batch effects +and exploring different methods.

+

In the next part of the vignette, we demonstrate a batch +parameter has implemented in the differential expression test for batch +correction.

+
##Limma trend 
+
+# data <- mac_badDSMOremoved
+# treatment_samples <- "FF_86_NH56_10"
+# control_samples <- "CB_43_EP73_0"
+
+ limma.trend<- function(data = data, treatment_samples, control_samples){
+   data <- data[, grepl(paste0(treatment_samples, "|", control_samples), data$combined_id)]
+   batch <- data$orig.ident
+if (length(unique(data$combined_id)) < 2) {
+      stop("Insufficient factors for differential expression analysis.")
+    }
+pheno_data <- data.frame(batch = as.factor(batch), condition = as.factor(data$combined_id))
+combined_id <- data$combined_id
+model_matrix <- if (length(batch) == 1) model.matrix(~0 + combined_id) else
+      model.matrix(~0+combined_id + batch)
+
+    dge <- DGEList(counts = data@assays$RNA$counts,
+                   samples = pheno_data$condition,
+                   group = pheno_data$condition)
+    dge <- calcNormFactors(dge, method="TMMwsp")
+    # keep <- edgeR::filterByExpr(dge, design = model_matrix)
+    # dge  <- dge[keep, , keep.lib.sizes = FALSE]
+    clean.TMM<-log2(edgeR::cpm(dge,  normalized.lib.sizes=T,log=F)+1) 
+    myargs <- list(paste0("combined_id",
+                          treatment_samples, "-",
+                          paste0("combined_id", control_samples)),
+                   levels = model_matrix)
+    contrasts <- do.call(makeContrasts, myargs)
+    
+    lmfit <- lmFit(clean.TMM, model_matrix)
+    lmfit <- contrasts.fit(lmfit, contrasts)
+    lmfit <- eBayes(lmfit, trend = TRUE)
+    #Warning: Zero sample variances detected, have been offset away from zero
+    top_table<- topTable(lmfit, number = nrow(data)) %>%
+      as.data.frame() %>%
+      select("logFC", "t", "P.Value", "adj.P.Val") %>%
+      rename("log2FC" = "logFC", "metric" = "t", "p_value" = "P.Value", "p_value_adj" = "adj.P.Val")%>%
+      rownames_to_column("gene")
+
+    return(top_table)}
+
+    # top_table%>%filter(p_value_adj < 0.05 & log2FC>0)
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
# make a function of alluvial plot with ggalluvial to compare the ranking of DEGs from macpie and DRUGseq
+plot_alluvial <- function(macpie_tbl, drugseq_tbl){
+  df <- macpie_tbl %>%
+  transmute(gene, p_value_adj_mac = p_value_adj, log2FC_mac = log2FC) %>%
+  inner_join(
+    drugseq_tbl %>% transmute(gene, p_value_adj_drg = p_value_adj, log2FC_drg = log2FC),
+    by = "gene"
+  )
+
+#  Rank within each method by (padj, |log2FC|), tie-safe
+rank_by_two <- function(padj, lfc) {
+  ord <- order(padj, -abs(lfc), na.last = TRUE)
+  r   <- integer(length(padj))
+  r[ord] <- seq_along(ord)
+  r
+}
+df <- df %>%
+  mutate(
+    rank_mac = rank_by_two(p_value_adj_mac, log2FC_mac),
+    rank_drg = rank_by_two(p_value_adj_drg, log2FC_drg)
+  )
+
+# Define rank bins
+cuts <- c(0, 25, 50, 100, 200, 300, 500, Inf)
+labs <- c("Top25","26–50","51–100","101–200","201–300", "301-500",">500")
+
+df <- df %>%
+  mutate(
+    mac_bin = cut(rank_mac, breaks = cuts, labels = labs, right = TRUE, include.lowest = TRUE),
+    drg_bin = cut(rank_drg, breaks = cuts, labels = labs, right = TRUE, include.lowest = TRUE)
+  )
+
+# Ranking movement category (color)
+bin_index <- function(x) match(x, labs)   # lower index = "more top"
+df <- df %>%
+  mutate(
+    mac_idx = bin_index(mac_bin),
+    drg_idx = bin_index(drg_bin),
+    movement = case_when(
+      is.na(mac_idx) | is.na(drg_idx) ~ NA_character_,
+      mac_idx < drg_idx ~ "Higher rank in macpie",   # moved up into a more-top bin
+      mac_idx > drg_idx ~ "Lower rank in macpie",    # moved down
+      TRUE               ~ "Same rank"
+    ),
+    movement = factor(movement, levels = c("Higher rank in macpie","Same rank","Lower rank in macpie"))
+  ) %>%
+  tidyr::drop_na(mac_bin, drg_bin, movement)
+
+#  Aggregate flows for alluvial
+flows <- df %>%
+  count(mac_bin, drg_bin, movement, name = "n") %>%
+  mutate(
+    mac_bin = factor(mac_bin, levels = labs),
+    drg_bin = factor(drg_bin, levels = labs)
+  )
+
+ggplot(flows, aes(axis1 = mac_bin, axis2 = drg_bin, y = n)) +
+  geom_alluvium(aes(fill = movement), alpha = 0.75, width = 0.14, knot.pos = 0.4) +
+  geom_stratum(width = 0.14, color = "grey30") +
+  geom_text(stat = "stratum", aes(label = after_stat(stratum)), size = 3) +
+  scale_x_discrete(limits = c("Macpie rank", "DRUGseq rank"), expand = c(.05, .05)) +
+  scale_fill_manual(values = c("Higher rank in macpie" = "orange",
+                               "Same rank"        = "#7570b3",
+                               "Lower rank in macpie" = "grey60")) +
+  labs(x = NULL, y = "Gene count",
+       fill = "Movement vs DRUGseq",
+       subtitle = "Only DEGs with padj < 0.05 & log2FC>0 (both methods)") +
+  theme_classic()
+  
+}
+
+
+

Differential gene expression

+

In here, you can specify a single condition in the combined_id column +and compare with DMSO (i.e.CB_43_EP73_0). By using the plate IDs in the +column of orig.ident as the input for batch parameter, +compute_singe_de function can perform differential +expression analysis using the preferred method (limma voom in this +example) with batch information.

+
methods <- c("limma_voom", "DESeq2", "edgeR", "limma_trend")
+
+methods_res <- lapply(methods, function(m){
+  
+  message("\n","Processing method: ", m,"\n") 
+  # m<-"limma_voom"
+  treatment_samples <- "FF_86_NH56_10"
+  control_samples <- "CB_43_EP73_0"
+  subset <- mac_filtered%>%filter(combined_id%in%c(treatment_samples,control_samples))
+
+  batch <- subset$orig.ident
+  if (m!="limma_trend"){
+    subset <- filter_genes_by_expression(subset,
+                           group_by = "combined_id", min_counts = 5,
+                           min_samples = 1)
+    top_table <- compute_single_de(subset, treatment_samples, control_samples, method = m, batch = batch)
+  } else{
+    top_table <- limma.trend(data = subset, treatment_samples = treatment_samples, control_samples = control_samples)
+  }
+  
+  # plot(plot_volcano(top_table, max.overlaps = 5))
+  alldmso_degs <- top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull()
+  length(intersect(alldmso_degs, drugseq_deg))
+  
+  top_table <- top_table %>%
+    arrange(p_value_adj, desc(log2FC)) %>%
+    mutate(gene = factor(gene, levels = unique(gene)))
+  # add a column if there are in the intersect(alldmso_degs, drugseq_deg)
+  top_table <- top_table %>%
+    mutate(in_drugseq_deg = ifelse(gene %in% intersect(alldmso_degs, drugseq_deg), "yes", "no"))
+  
+  plt_ecdf <- ggplot(top_table, aes(x = p_value_adj, color = in_drugseq_deg)) +
+    stat_ecdf(size = 1) +
+    scale_x_continuous(trans = "log10", breaks = c(1e-6,1e-4,5e-2)) +
+    labs(x = "Adjusted p-value (log10 scale)", y = "ECDF",
+         color = "In DRUGseq DEGs") +
+    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
+    theme_classic()
+  
+  
+  ks_test_results <- ks.test(top_table$p_value_adj[top_table$in_drugseq_deg=="yes"],
+          top_table$p_value_adj[top_table$in_drugseq_deg=="no"], alternative = "greater")
+  
+  # ggplot(top_table, aes(x = log2FC, y = -log10(p_value_adj), color = in_drugseq_deg)) +
+  #   geom_point(alpha = 0.6, size = 1.2) +
+  #   geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
+  #   geom_vline(xintercept = c(-1, 1), linetype = "dashed") +
+  #   scale_color_manual(values = c("no"="#bdbdbd","yes"="#2b8cbe")) +
+  #   labs(x = "log2FC", y = expression(-log[10]("adj p")), color = "In DRUGseq DEGs") +
+  #   theme_classic()
+  # 
+  # label a few top overlapping genes
+  lab_genes <- top_table[top_table$in_drugseq_deg=="yes", ] |>
+    dplyr::arrange(p_value_adj, dplyr::desc(log2FC)) 
+  
+  volcano_overlap <- ggplot(top_table, aes(x = log2FC, y = -log10(p_value_adj), color = in_drugseq_deg)) +
+    geom_point(alpha = 0.6, size = 1.2) +
+    geom_text_repel(data = lab_genes, aes(label = gene), size = 3, max.overlaps = 50) +
+    scale_color_manual(values = c("no"="#bdbdbd","yes"="#2b8cbe"))+
+    theme_classic()
+  
+  ks <- c(25,50,100,200,500,1000)
+  prec_at_k <- sapply(ks, function(k)
+    mean(top_table$in_drugseq_deg[1:k] == "yes")
+  )
+  # plot(ks, prec_at_k, type = "b", xlab = "k (top-k by macpie)",
+  #      ylab = "Precision@k (fraction in DRUGseq set)")
+  
+  ks_plot <- ggplot(data.frame(k=ks, prec=prec_at_k), aes(x=k, y=prec)) +
+    geom_point() + geom_line() +
+    labs(x = "k (top-k by macpie)", y = "Precision@k (fraction in DRUGseq set)")+
+    theme_classic()
+  
+  # alluvial plot
+  macpie_tbl <- top_table %>% filter(p_value_adj < 0.05 & log2FC>0)
+  drugseq_tbl <- ff86_res_toptable %>% filter(p_value_adj < 0.05 & log2FC>0)
+  alluvial_plot <- plot_alluvial(macpie_tbl = macpie_tbl, drugseq_tbl = drugseq_tbl)
+  
+  #return 
+  result_list <- list(top_table = top_table,
+                      num_degs_macpie = length(alldmso_degs),
+                      n_overlap = length(intersect(alldmso_degs, drugseq_deg)),
+                      ecdf_plot = plt_ecdf,
+                      ks_test_results = ks_test_results,
+                      volcano_plot = volcano_overlap,
+                      ks_plot = ks_plot,
+                      alluvial_plot = alluvial_plot)
+  return(result_list)
+  
+})
+
## 
+## Processing method: limma_voom
+
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
+## ℹ Please use `linewidth` instead.
+## This warning is displayed once every 8 hours.
+## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
+## generated.
+
## Warning in ks.test.default(top_table$p_value_adj[top_table$in_drugseq_deg == :
+## p-value will be approximate in the presence of ties
+
## 
+## Processing method: DESeq2
+
## converting counts to integer mode
+
## estimating size factors
+
## estimating dispersions
+
## gene-wise dispersion estimates
+
## mean-dispersion relationship
+
## final dispersion estimates
+
## fitting model and testing
+
## -- replacing outliers and refitting for 1186 genes
+## -- DESeq argument 'minReplicatesForReplace' = 7 
+## -- original counts are preserved in counts(dds)
+
## estimating dispersions
+
## fitting model and testing
+
## Warning in ks.test.default(top_table$p_value_adj[top_table$in_drugseq_deg == :
+## p-value will be approximate in the presence of ties
+
## 
+## Processing method: edgeR
+
## Warning in ks.test.default(top_table$p_value_adj[top_table$in_drugseq_deg == :
+## p-value will be approximate in the presence of ties
+
## 
+## Processing method: limma_trend
+
## Warning: Zero sample variances detected, have been offset away from zero
+## Warning: p-value will be approximate in the presence of ties
+
names(methods_res) <- methods
+
+
+

Summary table

+
#get a table to show number of DEGs and number of overlapping genes with DRUGseq for each method
+deg_summary <- map_df(methods_res, function(x) {
+  data.frame(
+    num_degs_macpie = x$num_degs_macpie,
+    n_overlap = x$n_overlap,
+    num_degs_DRUGseq = length(drugseq_deg)
+  )
+}, .id = paste0("macpie_methods"))
+
+deg_summary
+
##   macpie_methods num_degs_macpie n_overlap num_degs_DRUGseq
+## 1     limma_voom            4280       778             1423
+## 2         DESeq2            1959       779             1423
+## 3          edgeR            2321       721             1423
+## 4    limma_trend            1574       400             1423
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+

Overlapping volcano plot

+
methods_res$limma_voom$volcano_plot
+
## Warning: ggrepel: 746 unlabeled data points (too many overlaps). Consider
+## increasing max.overlaps
+

+
methods_res$DESeq2$volcano_plot
+
## Warning: Removed 831 rows containing missing values or values outside the scale range
+## (`geom_point()`).
+
## Warning: ggrepel: 738 unlabeled data points (too many overlaps). Consider
+## increasing max.overlaps
+

+
methods_res$edgeR$volcano_plot
+
## Warning: ggrepel: 654 unlabeled data points (too many overlaps). Consider
+## increasing max.overlaps
+

+
methods_res$limma_trend$volcano_plot
+
## Warning: ggrepel: 368 unlabeled data points (too many overlaps). Consider
+## increasing max.overlaps
+

+ + + + + + + +
+
+

Alluvial plot

+
methods_res$limma_voom$alluvial_plot
+
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
+## params$discern): Some strata appear at multiple axes.
+## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
+## params$discern): Some strata appear at multiple axes.
+## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
+## params$discern): Some strata appear at multiple axes.
+

+
methods_res$DESeq2$alluvial_plot
+
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
+## params$discern): Some strata appear at multiple axes.
+## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
+## params$discern): Some strata appear at multiple axes.
+## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
+## params$discern): Some strata appear at multiple axes.
+

+
methods_res$edgeR$alluvial_plot
+
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
+## params$discern): Some strata appear at multiple axes.
+## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
+## params$discern): Some strata appear at multiple axes.
+## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
+## params$discern): Some strata appear at multiple axes.
+

+
methods_res$limma_trend$alluvial_plot
+
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
+## params$discern): Some strata appear at multiple axes.
+## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
+## params$discern): Some strata appear at multiple axes.
+## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
+## params$discern): Some strata appear at multiple axes.
+

+
#append top_table from each method into a single data frame
+all_methods_top_table <- map_df(methods_res, "top_table", .id = "method")
+head(all_methods_top_table)
+
##       method       gene   log2FC   metric      p_value  p_value_adj
+## 1 limma_voom AL031963.2 1.483560 27.66417 4.119762e-36 8.820411e-32
+## 2 limma_voom   Z97192.4 5.026037 26.15530 9.565172e-35 1.023952e-30
+## 3 limma_voom    SLC45A4 1.720577 20.68042 3.573334e-29 2.550170e-25
+## 4 limma_voom AC007066.1 1.844768 18.51969 1.356781e-22 7.262170e-19
+## 5 limma_voom    CYCSP44 1.696008 14.95493 5.065287e-22 1.807463e-18
+## 6 limma_voom AP003068.2 1.679486 17.96392 4.522782e-22 1.807463e-18
+##   in_drugseq_deg
+## 1             no
+## 2            yes
+## 3            yes
+## 4            yes
+## 5             no
+## 6             no
+
#if in_drugseq_deg is "no", change method to "DRUGseq"
+all_methods_top_table <- all_methods_top_table %>%
+  mutate(method = ifelse(in_drugseq_deg == "no", "not in DRUGseq", method))
+
+#rename methods
+all_methods_top_table$method <- recode(all_methods_top_table$method,
+                                        "limma_voom" = "macpie:limma_voom",
+                                        "DESeq2" = "macpie:DESeq2",
+                                        "edgeR" = "macpie:edgeR",
+                                        "limma_trend" = "macpie:limma_trend")
+
+
+ggplot(all_methods_top_table, aes(x = p_value_adj, color = method)) +
+    stat_ecdf(size = 1) +
+    scale_x_continuous(trans = "log10", breaks = c(1e-6,1e-4,5e-2)) +
+    labs(x = "Adjusted p-value (log10 scale)", y = "ECDF",
+         color = "In DRUGseq DEGs") +
+  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
+    theme_classic()
+
## Warning: Removed 831 rows containing non-finite outside the scale range
+## (`stat_ecdf()`).
+

+
rm(top_table, alldmso_degs, deg_summary)
+
## Warning in rm(top_table, alldmso_degs, deg_summary): object 'top_table' not
+## found
+
## Warning in rm(top_table, alldmso_degs, deg_summary): object 'alldmso_degs' not
+## found
+
+
+
+

Good DMSO - top 5 from macpie

+

From public DRUG-seq analysis pipeline, authors identified two +reference controls: all DMSO wells and the ‘good DMSO’ wells.

+

We know these good DMSO wells for batch 24 from their published +data:

+
    +
  • VH02012942: I23, M23

  • +
  • VH02012944: D23, H23

  • +
  • VH02012956: I23, J23

  • +
+
batch24 <- readRDS(paste0(dir,"DRUGseqData/Exp_batch24.Rds"))
+names(batch24)
+
## [1] "VH02012956" "VH02012942" "VH02012944"
+
#make a combined metadata for three plates
+batch24_metadata <- batch24 %>% 
+  map_dfr(~ {
+    .x$Annotation %>%
+      mutate(
+        Plate_ID        = plate_barcode,
+        Well_ID         = well_id,
+        Barcode         = paste0(plate_barcode, "_", well_index),
+        Row             = LETTERS[row],
+        Column          = as.integer(col),
+        Species         = "human",
+        Cell_type       = "U2OS",
+        Model_type      = "2D_adherent",
+        Time            = as.factor(hours_post_treatment),
+        Unit            = "h",
+        Treatment_1     = cmpd_sample_id,
+        Concentration_1 = as.numeric(concentration),
+        Unit_1          = unit,
+        Sample_type     = if_else(well_type == "SA" & col == 24,
+                                  "Positive Control",
+                                  well_type)
+      )
+  })
+
+
+batch24_metadata <- batch24_metadata%>%select(-c(batch_id, plate_barcode,plate_index, well_id,
+                                                 well_index, col, row, biosample_id, external_biosample_id,
+                                                 cmpd_sample_id, well_type, cell_line_name, cell_line_ncn, concentration, unit, hours_post_treatment, Sample))
+
# create a combined UMI matrix for 3 plates
+batch24_counts <- batch24 %>%
+  map(~ {
+    .x$UMI.counts %>%
+      as.data.frame() %>% 
+      rownames_to_column("gene_id") %>%
+      separate(col = gene_id, into = c("gene_name", "chrom"), sep = ",") %>%
+      mutate(gene_name = make.unique(gene_name)) %>%
+      select(-chrom) %>%
+      tibble::column_to_rownames(var = "gene_name") %>%
+      as.matrix()
+  })
+
+binded_counts <- do.call(cbind, batch24_counts)
+
as_mac <- CreateSeuratObject(counts = binded_counts, 
+  min.cells = 1, 
+  min.features = 1)
+
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
+## ('-')
+
## Warning: Data is of class matrix. Coercing to dgCMatrix.
+
as_mac<- as_mac%>% inner_join(batch24_metadata, by = c(".cell"="Barcode"))
+

Filtering steps

+

Include only good DMSO wells as controls

+
as_mac$combined_id <- paste0(as_mac$Treatment_1,"_", as_mac$Concentration_1)
+
+badDMSO <- as_mac@meta.data %>% filter(Treatment_1 == "CB-43-EP73") %>% 
+ filter((Plate_ID == "VH02012942" & !(Well_ID %in% c("I23", "M23", "K23", "J23","C23"))) |
+        (Plate_ID == "VH02012944" & !(Well_ID %in% c("I23", "M23", "J23", "G23", "K23")))|
+        (Plate_ID == "VH02012956" & !  (Well_ID %in% c("I23", "J23", "O23","M23","K23"))))
+
+
+
+keep_wells <- setdiff(rownames(as_mac@meta.data), rownames(badDMSO))
+
+
+mac_badDSMOremoved <- as_mac[,keep_wells]
+
+mac_badDSMOremoved$combined_id <- str_replace_all(mac_badDSMOremoved$combined_id, "-","_")
+
min_sample_num <- min(table(mac_badDSMOremoved$combined_id))
+mac_badDSMOremoved <- filter_genes_by_expression(mac_badDSMOremoved,
+                                           group_by = "combined_id", min_counts =10,
+                                           min_samples = min_sample_num)
+
mac_badDSMOremoved[["percent.mt"]] <- PercentageFeatureSet(mac_badDSMOremoved, pattern = "^mt-|^MT-")
+mac_badDSMOremoved[["percent.ribo"]] <- PercentageFeatureSet(mac_badDSMOremoved, pattern = "^Rp[slp][[:digit:]]|^Rpsa|^RP[SLP][[:digit:]]|^RPSA")
+
p <- plot_plate_layout(mac_badDSMOremoved, "nCount_RNA", "combined_id") + facet_wrap(~orig.ident, ncol = 1) + 
+  theme(strip.text = element_text(size=10),
+        axis.text.x = element_text(size=10), 
+        axis.text.y = element_text(size=8),
+        legend.title = element_text(size=10),
+        legend.text = element_text(size=8),
+        trip.background = element_blank())
+girafe(ggobj = p, 
+  fonts = list(sans = "sans"),
+  options = list(
+    opts_hover(css = "stroke:black; stroke-width:1px;")
+  ))
+
## Warning in plot_theme(plot): The `trip.background` theme element is not defined
+## in the element hierarchy.
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
methods <- c("limma_voom", "DESeq2", "edgeR", "limma_trend")
+
+five_dmso_methods_res <- lapply(methods, function(m){
+  message("Processing method: ", m) 
+  # m<-"limma_voom"
+  treatment_samples <- "FF_86_NH56_10"
+  control_samples <- "CB_43_EP73_0"
+  subset <- mac_badDSMOremoved%>%filter(combined_id%in%c(treatment_samples,control_samples))
+
+  batch <- subset$orig.ident
+  if (m!="limma_trend"){
+      # subset <- filter_genes_by_expression(subset,
+      #                        group_by = "combined_id", min_counts = 10,
+      #                        min_samples = 1)
+     badDMSO_out_top_table <- compute_single_de(subset, treatment_samples, control_samples, method =  m, batch = batch)
+  } else{
+    badDMSO_out_top_table <- limma.trend(data = subset, treatment_samples = treatment_samples, control_samples = control_samples)
+  }
+ 
+  # plot(plot_volcano(top_table, max.overlaps = 5))
+  badDMSO_out_degs <- badDMSO_out_top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull()
+  length(intersect(badDMSO_out_degs, drugseq_deg))
+  
+  badDMSO_out_top_table <- badDMSO_out_top_table %>%
+    arrange(p_value_adj, desc(log2FC)) %>%
+    mutate(gene = factor(gene, levels = unique(gene)))
+  # add a column if there are in the intersect(alldmso_degs, drugseq_deg)
+  badDMSO_out_top_table <- badDMSO_out_top_table %>%
+    mutate(in_drugseq_deg = ifelse(gene %in% intersect(badDMSO_out_degs, drugseq_deg), "yes", "no"))
+  
+  plt_ecdf <- ggplot(badDMSO_out_top_table, aes(x = p_value_adj, color = in_drugseq_deg)) +
+    stat_ecdf(size = 1) +
+    scale_x_continuous(trans = "log10", breaks = c(1e-6,1e-4,5e-2)) +
+    labs(x = "Adjusted p-value (log10 scale)", y = "ECDF",
+         color = "In DRUGseq DEGs") +
+    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
+    theme_classic()
+  
+  
+  ks_test_results <- ks.test(badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg=="yes"],
+          badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg=="no"], alternative = "greater")
+  
+  # ggplot(top_table, aes(x = log2FC, y = -log10(p_value_adj), color = in_drugseq_deg)) +
+  #   geom_point(alpha = 0.6, size = 1.2) +
+  #   geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
+  #   geom_vline(xintercept = c(-1, 1), linetype = "dashed") +
+  #   scale_color_manual(values = c("no"="#bdbdbd","yes"="#2b8cbe")) +
+  #   labs(x = "log2FC", y = expression(-log[10]("adj p")), color = "In DRUGseq DEGs") +
+  #   theme_classic()
+  # 
+  # label a few top overlapping genes
+  lab_genes <- badDMSO_out_top_table[badDMSO_out_top_table$in_drugseq_deg=="yes", ] |>
+    dplyr::arrange(p_value_adj, dplyr::desc(log2FC)) 
+  
+  volcano_overlap <- ggplot(badDMSO_out_top_table, aes(x = log2FC, y = -log10(p_value_adj), color = in_drugseq_deg)) +
+    geom_point(alpha = 0.6, size = 1.2) +
+    geom_text_repel(data = lab_genes, aes(label = gene), size = 3, max.overlaps = 10) +
+    scale_color_manual(values = c("no"="#bdbdbd","yes"="#2b8cbe"))+
+    theme_classic()
+  
+  ks <- c(25,50,100,200,500,1000)
+  prec_at_k <- sapply(ks, function(k)
+    mean(badDMSO_out_top_table$in_drugseq_deg[1:k] == "yes")
+  )
+  # plot(ks, prec_at_k, type = "b", xlab = "k (top-k by macpie)",
+  #      ylab = "Precision@k (fraction in DRUGseq set)")
+  
+  ks_plot <- ggplot(data.frame(k=ks, prec=prec_at_k), aes(x=k, y=prec)) +
+    geom_point() + geom_line() +
+    labs(x = "k (top-k by macpie)", y = "Precision@k (fraction in DRUGseq set)")+
+    theme_classic()
+  
+  # alluvial plot
+  macpie_tbl <- badDMSO_out_top_table %>% filter(p_value_adj < 0.05 & log2FC>0)
+  drugseq_tbl <- ff86_res_toptable %>% filter(p_value_adj < 0.05 & log2FC>0)
+  alluvial_plot <- plot_alluvial(macpie_tbl = macpie_tbl, drugseq_tbl = drugseq_tbl)
+  
+  #return 
+  result_list <- list(top_table = badDMSO_out_top_table,
+                      num_degs_macpie = length(badDMSO_out_degs),
+                      n_overlap = length(intersect(badDMSO_out_degs, drugseq_deg)),
+                      ecdf_plot = plt_ecdf,
+                      ks_test_results = ks_test_results,
+                      volcano_plot = volcano_overlap,
+                      ks_plot = ks_plot,
+                      alluvial_plot = alluvial_plot)
+  return(result_list)
+  
+})
+
## Processing method: limma_voom
+
## Warning in
+## ks.test.default(badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg
+## == : p-value will be approximate in the presence of ties
+
## Processing method: DESeq2
+
## converting counts to integer mode
+
## estimating size factors
+
## estimating dispersions
+
## gene-wise dispersion estimates
+
## mean-dispersion relationship
+
## final dispersion estimates
+
## fitting model and testing
+
## Warning in
+## ks.test.default(badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg
+## == : p-value will be approximate in the presence of ties
+
## Processing method: edgeR
+
## Warning in
+## ks.test.default(badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg
+## == : p-value will be approximate in the presence of ties
+
## Processing method: limma_trend
+
## Warning: Zero sample variances detected, have been offset away from zero
+## Warning: p-value will be approximate in the presence of ties
+
names(five_dmso_methods_res) <- methods
+
+

Summary table

+
#get a table to show number of DEGs and number of overlapping genes with DRUGseq for each method
+deg_summary <- map_df(five_dmso_methods_res, function(x) {
+  data.frame(
+    num_degs_macpie = x$num_degs_macpie,
+    n_overlap = x$n_overlap,
+    num_degs_DRUGseq = length(drugseq_deg)
+  )
+}, .id = paste0("macpie_methods"))
+
+deg_summary
+
##   macpie_methods num_degs_macpie n_overlap num_degs_DRUGseq
+## 1     limma_voom            3456       592             1423
+## 2         DESeq2            1107       549             1423
+## 3          edgeR            2185       690             1423
+## 4    limma_trend               5         2             1423
+
+
+

ECDF plot

+
#append top_table from each method into a single data frame
+good_dmso_top_table <- map_df(five_dmso_methods_res, "top_table", .id = "method")
+head(good_dmso_top_table)
+
##       method       gene   log2FC   metric      p_value  p_value_adj
+## 1 limma_voom AL031963.2 1.477162 34.73174 4.135992e-26 1.048226e-21
+## 2 limma_voom    SLC45A4 1.746060 25.83312 2.602120e-22 3.297406e-18
+## 3 limma_voom   Z97192.4 5.194480 22.09739 2.424226e-20 2.047986e-16
+## 4 limma_voom AP003068.2 1.691204 26.07754 6.263377e-20 3.968475e-16
+## 5 limma_voom AL583807.1 2.808626 22.32748 2.810497e-18 1.187154e-14
+## 6 limma_voom AC007066.1 1.833326 23.05883 2.624228e-18 1.187154e-14
+##   in_drugseq_deg
+## 1             no
+## 2            yes
+## 3            yes
+## 4             no
+## 5             no
+## 6            yes
+
#if in_drugseq_deg is "no", change method to "DRUGseq"
+good_dmso_top_table<- good_dmso_top_table %>%
+  mutate(method = ifelse(in_drugseq_deg == "no", "not_in_DRUGseq", method))
+
+#rename methods
+good_dmso_top_table$method <- recode(good_dmso_top_table$method,
+                                        "limma_voom" = "macpie:limma_voom",
+                                        "DESeq2" = "macpie:DESeq2",
+                                        "edgeR" = "macpie:edgeR",
+                                        "limma_trend" = "macpie:limma_trend")
+
+
+ggplot(good_dmso_top_table, aes(x = p_value_adj, color = method)) +
+    stat_ecdf(size = 1) +
+    scale_x_continuous(trans = "log10", breaks = c(1e-6,1e-4,5e-2)) +
+    labs(x = "Adjusted p-value (log10 scale)", y = "ECDF",
+         color = "In DRUGseq DEGs") +
+  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
+    theme_classic()
+
## Warning: Removed 12331 rows containing non-finite outside the scale range
+## (`stat_ecdf()`).
+

+ + + + + + +
+
+

Overlapping volcano plot

+
five_dmso_methods_res$limma_voom$volcano_plot
+
## Warning: ggrepel: 575 unlabeled data points (too many overlaps). Consider
+## increasing max.overlaps
+

+
five_dmso_methods_res$DESeq2$volcano_plot
+
## Warning: Removed 12331 rows containing missing values or values outside the scale range
+## (`geom_point()`).
+
## Warning: ggrepel: 536 unlabeled data points (too many overlaps). Consider
+## increasing max.overlaps
+

+
five_dmso_methods_res$edgeR$volcano_plot
+
## Warning: ggrepel: 662 unlabeled data points (too many overlaps). Consider
+## increasing max.overlaps
+

+
five_dmso_methods_res$limma_trend$volcano_plot
+

+ + + + + + + +
+
+

Alluvial plot

+
five_dmso_methods_res$limma_voom$alluvial_plot
+
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
+## params$discern): Some strata appear at multiple axes.
+## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
+## params$discern): Some strata appear at multiple axes.
+## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
+## params$discern): Some strata appear at multiple axes.
+

+
five_dmso_methods_res$DESeq2$alluvial_plot
+
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
+## params$discern): Some strata appear at multiple axes.
+## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
+## params$discern): Some strata appear at multiple axes.
+## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
+## params$discern): Some strata appear at multiple axes.
+

+
five_dmso_methods_res$edgeR$alluvial_plot
+
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
+## params$discern): Some strata appear at multiple axes.
+## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
+## params$discern): Some strata appear at multiple axes.
+## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
+## params$discern): Some strata appear at multiple axes.
+

+
five_dmso_methods_res$limma_trend$alluvial_plot
+
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
+## params$discern): Some strata appear at multiple axes.
+## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
+## params$discern): Some strata appear at multiple axes.
+## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
+## params$discern): Some strata appear at multiple axes.
+

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+

Good DMSO - DRUGseq

+

From public DRUG-seq analysis pipeline, authors identified two +reference controls: all DMSO wells and the ‘good DMSO’ wells.

+

We know these good DMSO wells for batch 24 from their published +data:

+
    +
  • VH02012942: I23, M23

  • +
  • VH02012944: D23, H23

  • +
  • VH02012956: I23, J23

  • +
+
batch24 <- readRDS(paste0(dir,"DRUGseqData/Exp_batch24.Rds"))
+names(batch24)
+
## [1] "VH02012956" "VH02012942" "VH02012944"
+
#make a combined metadata for three plates
+batch24_metadata <- batch24 %>% 
+  map_dfr(~ {
+    .x$Annotation %>%
+      mutate(
+        Plate_ID        = plate_barcode,
+        Well_ID         = well_id,
+        Barcode         = paste0(plate_barcode, "_", well_index),
+        Row             = LETTERS[row],
+        Column          = as.integer(col),
+        Species         = "human",
+        Cell_type       = "U2OS",
+        Model_type      = "2D_adherent",
+        Time            = as.factor(hours_post_treatment),
+        Unit            = "h",
+        Treatment_1     = cmpd_sample_id,
+        Concentration_1 = as.numeric(concentration),
+        Unit_1          = unit,
+        Sample_type     = if_else(well_type == "SA" & col == 24,
+                                  "Positive Control",
+                                  well_type)
+      )
+  })
+
+
+batch24_metadata <- batch24_metadata%>%select(-c(batch_id, plate_barcode,plate_index, well_id,
+                                                 well_index, col, row, biosample_id, external_biosample_id,
+                                                 cmpd_sample_id, well_type, cell_line_name, cell_line_ncn, concentration, unit, hours_post_treatment, Sample))
+
# create a combined UMI matrix for 3 plates
+batch24_counts <- batch24 %>%
+  map(~ {
+    .x$UMI.counts %>%
+      as.data.frame() %>% 
+      rownames_to_column("gene_id") %>%
+      separate(col = gene_id, into = c("gene_name", "chrom"), sep = ",") %>%
+      mutate(gene_name = make.unique(gene_name)) %>%
+      select(-chrom) %>%
+      tibble::column_to_rownames(var = "gene_name") %>%
+      as.matrix()
+  })
+
+binded_counts <- do.call(cbind, batch24_counts)
+
as_mac <- CreateSeuratObject(counts = binded_counts, 
+  min.cells = 1, 
+  min.features = 1)
+
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
+## ('-')
+
## Warning: Data is of class matrix. Coercing to dgCMatrix.
+
as_mac<- as_mac%>% inner_join(batch24_metadata, by = c(".cell"="Barcode"))
+

Filtering steps

+

Include only good DMSO wells as controls

+
as_mac$combined_id <- paste0(as_mac$Treatment_1,"_", as_mac$Concentration_1)
+
+badDMSO <- as_mac@meta.data %>% filter(Treatment_1 == "CB-43-EP73") %>% 
+ filter((Plate_ID == "VH02012942" & !(Well_ID %in% c("I23", "M23"))) |
+        (Plate_ID == "VH02012944" & !(Well_ID %in% c("D23", "H23")))|
+        (Plate_ID == "VH02012956" & !  (Well_ID %in% c("I23", "J23"))))
+
+
+
+keep_wells <- setdiff(rownames(as_mac@meta.data), rownames(badDMSO))
+
+
+mac_badDSMOremoved <- as_mac[,keep_wells]
+
+mac_badDSMOremoved$combined_id <- str_replace_all(mac_badDSMOremoved$combined_id, "-","_")
+
min_sample_num <- min(table(mac_badDSMOremoved$combined_id))
+mac_badDSMOremoved <- filter_genes_by_expression(mac_badDSMOremoved,
+                                           group_by = "combined_id", min_counts =10,
+                                           min_samples = min_sample_num)
+
mac_badDSMOremoved[["percent.mt"]] <- PercentageFeatureSet(mac_badDSMOremoved, pattern = "^mt-|^MT-")
+mac_badDSMOremoved[["percent.ribo"]] <- PercentageFeatureSet(mac_badDSMOremoved, pattern = "^Rp[slp][[:digit:]]|^Rpsa|^RP[SLP][[:digit:]]|^RPSA")
+
methods <- c("limma_voom", "DESeq2", "edgeR", "limma_trend")
+
+good_dmso_methods_res <- lapply(methods, function(m){
+  message("Processing method: ", m) 
+  # m<-"limma_voom"
+  treatment_samples <- "FF_86_NH56_10"
+  control_samples <- "CB_43_EP73_0"
+  subset <- mac_badDSMOremoved%>%filter(combined_id%in%c(treatment_samples,control_samples))
+
+  batch <- subset$orig.ident
+  if (m!="limma_trend"){
+      # subset <- filter_genes_by_expression(subset,
+      #                        group_by = "combined_id", min_counts = 10,
+      #                        min_samples = 1)
+     badDMSO_out_top_table <- compute_single_de(subset, treatment_samples, control_samples, method =  m, batch = batch)
+  } else{
+    badDMSO_out_top_table <- limma.trend(data = subset, treatment_samples = treatment_samples, control_samples = control_samples)
+  }
+ 
+  # plot(plot_volcano(top_table, max.overlaps = 5))
+  badDMSO_out_degs <- badDMSO_out_top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull()
+  length(intersect(badDMSO_out_degs, drugseq_deg))
+  
+  badDMSO_out_top_table <- badDMSO_out_top_table %>%
+    arrange(p_value_adj, desc(log2FC)) %>%
+    mutate(gene = factor(gene, levels = unique(gene)))
+  # add a column if there are in the intersect(alldmso_degs, drugseq_deg)
+  badDMSO_out_top_table <- badDMSO_out_top_table %>%
+    mutate(in_drugseq_deg = ifelse(gene %in% intersect(badDMSO_out_degs, drugseq_deg), "yes", "no"))
+  
+  plt_ecdf <- ggplot(badDMSO_out_top_table, aes(x = p_value_adj, color = in_drugseq_deg)) +
+    stat_ecdf(size = 1) +
+    scale_x_continuous(trans = "log10", breaks = c(1e-6,1e-4,5e-2)) +
+    labs(x = "Adjusted p-value (log10 scale)", y = "ECDF",
+         color = "In DRUGseq DEGs") +
+    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
+    theme_classic()
+  
+  
+  ks_test_results <- ks.test(badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg=="yes"],
+          badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg=="no"], alternative = "greater")
+  
+  # ggplot(top_table, aes(x = log2FC, y = -log10(p_value_adj), color = in_drugseq_deg)) +
+  #   geom_point(alpha = 0.6, size = 1.2) +
+  #   geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
+  #   geom_vline(xintercept = c(-1, 1), linetype = "dashed") +
+  #   scale_color_manual(values = c("no"="#bdbdbd","yes"="#2b8cbe")) +
+  #   labs(x = "log2FC", y = expression(-log[10]("adj p")), color = "In DRUGseq DEGs") +
+  #   theme_classic()
+  # 
+  # label a few top overlapping genes
+  lab_genes <- badDMSO_out_top_table[badDMSO_out_top_table$in_drugseq_deg=="yes", ] |>
+    dplyr::arrange(p_value_adj, dplyr::desc(log2FC)) 
+  
+  volcano_overlap <- ggplot(badDMSO_out_top_table, aes(x = log2FC, y = -log10(p_value_adj), color = in_drugseq_deg)) +
+    geom_point(alpha = 0.6, size = 1.2) +
+    geom_text_repel(data = lab_genes, aes(label = gene), size = 3, max.overlaps = 10) +
+    scale_color_manual(values = c("no"="#bdbdbd","yes"="#2b8cbe"))+
+    theme_classic()
+  
+  ks <- c(25,50,100,200,500,1000)
+  prec_at_k <- sapply(ks, function(k)
+    mean(badDMSO_out_top_table$in_drugseq_deg[1:k] == "yes")
+  )
+  # plot(ks, prec_at_k, type = "b", xlab = "k (top-k by macpie)",
+  #      ylab = "Precision@k (fraction in DRUGseq set)")
+  
+  ks_plot <- ggplot(data.frame(k=ks, prec=prec_at_k), aes(x=k, y=prec)) +
+    geom_point() + geom_line() +
+    labs(x = "k (top-k by macpie)", y = "Precision@k (fraction in DRUGseq set)")+
+    theme_classic()
+  
+  # alluvial plot
+  macpie_tbl <- badDMSO_out_top_table %>% filter(p_value_adj < 0.05 & log2FC>0)
+  drugseq_tbl <- ff86_res_toptable %>% filter(p_value_adj < 0.05 & log2FC>0)
+  alluvial_plot <- plot_alluvial(macpie_tbl = macpie_tbl, drugseq_tbl = drugseq_tbl)
+  
+  #return 
+  result_list <- list(top_table = badDMSO_out_top_table,
+                      num_degs_macpie = length(badDMSO_out_degs),
+                      n_overlap = length(intersect(badDMSO_out_degs, drugseq_deg)),
+                      ecdf_plot = plt_ecdf,
+                      ks_test_results = ks_test_results,
+                      volcano_plot = volcano_overlap,
+                      ks_plot = ks_plot,
+                      alluvial_plot = alluvial_plot)
+  return(result_list)
+  
+})
+
## Processing method: limma_voom
+
## Warning in
+## ks.test.default(badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg
+## == : p-value will be approximate in the presence of ties
+
## Processing method: DESeq2
+
## converting counts to integer mode
+
## estimating size factors
+
## estimating dispersions
+
## gene-wise dispersion estimates
+
## mean-dispersion relationship
+
## final dispersion estimates
+
## fitting model and testing
+
## Warning in
+## ks.test.default(badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg
+## == : p-value will be approximate in the presence of ties
+
## Processing method: edgeR
+
## Warning in
+## ks.test.default(badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg
+## == : p-value will be approximate in the presence of ties
+
## Processing method: limma_trend
+
## Warning: Zero sample variances detected, have been offset away from zero
+## Warning: p-value will be approximate in the presence of ties
+
names(good_dmso_methods_res) <- methods
+
+

Summary table

+
#get a table to show number of DEGs and number of overlapping genes with DRUGseq for each method
+deg_summary <- map_df(good_dmso_methods_res, function(x) {
+  data.frame(
+    num_degs_macpie = x$num_degs_macpie,
+    n_overlap = x$n_overlap,
+    num_degs_DRUGseq = length(drugseq_deg)
+  )
+}, .id = paste0("macpie_methods"))
+
+deg_summary
+
##   macpie_methods num_degs_macpie n_overlap num_degs_DRUGseq
+## 1     limma_voom            1352       204             1423
+## 2         DESeq2              10         8             1423
+## 3          edgeR            1757       604             1423
+## 4    limma_trend               2         1             1423
+
+
+

ECDF plot

+
#append top_table from each method into a single data frame
+good_dmso_top_table <- map_df(good_dmso_methods_res, "top_table", .id = "method")
+head(good_dmso_top_table)
+
##       method       gene   log2FC   metric       p_value   p_value_adj
+## 1 limma_voom AL031963.2 1.450009 52.04866  0.000000e+00  0.000000e+00
+## 2 limma_voom  RNU6-633P 1.254947 72.59594  0.000000e+00  0.000000e+00
+## 3 limma_voom    ELOCP10 1.269453 35.96512 2.086923e-279 1.762685e-275
+## 4 limma_voom AP003068.2 1.677015 35.65408 1.072200e-274 6.792120e-271
+## 5 limma_voom AC007066.1 1.821885 32.63913 4.558729e-220 2.310273e-216
+## 6 limma_voom    SLC45A4 1.745313 31.67705 6.854635e-218 2.894826e-214
+##   in_drugseq_deg
+## 1             no
+## 2             no
+## 3             no
+## 4             no
+## 5            yes
+## 6            yes
+
#if in_drugseq_deg is "no", change method to "DRUGseq"
+good_dmso_top_table<- good_dmso_top_table %>%
+  mutate(method = ifelse(in_drugseq_deg == "no", "not_in_DRUGseq", method))
+
+#rename methods
+good_dmso_top_table$method <- recode(good_dmso_top_table$method,
+                                        "limma_voom" = "macpie:limma_voom",
+                                        "DESeq2" = "macpie:DESeq2",
+                                        "edgeR" = "macpie:edgeR",
+                                        "limma_trend" = "macpie:limma_trend")
+
+
+ggplot(good_dmso_top_table, aes(x = p_value_adj, color = method)) +
+    stat_ecdf(size = 1) +
+    scale_x_continuous(trans = "log10", breaks = c(1e-6,1e-4,5e-2)) +
+    labs(x = "Adjusted p-value (log10 scale)", y = "ECDF",
+         color = "In DRUGseq DEGs") +
+  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
+    theme_classic()
+
## Warning in scale_x_continuous(trans = "log10", breaks = c(1e-06, 1e-04, :
+## log-10 transformation introduced infinite values.
+
## Warning: Removed 19707 rows containing non-finite outside the scale range
+## (`stat_ecdf()`).
+

+
good_dmso_methods_res$limma_voom$ecdf_plot
+
## Warning in scale_x_continuous(trans = "log10", breaks = c(1e-06, 1e-04, :
+## log-10 transformation introduced infinite values.
+
## Warning: Removed 2 rows containing non-finite outside the scale range
+## (`stat_ecdf()`).
+

+
good_dmso_methods_res$DESeq2$ecdf_plot
+
## Warning: Removed 19705 rows containing non-finite outside the scale range
+## (`stat_ecdf()`).
+

+
good_dmso_methods_res$edgeR$ecdf_plot
+

+
good_dmso_methods_res$limma_trend$ecdf_plot
+

+### Overlapping volcano plot

+
good_dmso_methods_res$limma_voom$volcano_plot
+
## Warning: ggrepel: 196 unlabeled data points (too many overlaps). Consider
+## increasing max.overlaps
+

+
good_dmso_methods_res$DESeq2$volcano_plot
+
## Warning: Removed 19705 rows containing missing values or values outside the scale range
+## (`geom_point()`).
+

+
good_dmso_methods_res$edgeR$volcano_plot
+
## Warning: ggrepel: 583 unlabeled data points (too many overlaps). Consider
+## increasing max.overlaps
+

+
good_dmso_methods_res$limma_trend$volcano_plot
+

+ + + + + + + +
+
+

Alluvial plot

+
good_dmso_methods_res$limma_voom$alluvial_plot
+
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
+## params$discern): Some strata appear at multiple axes.
+## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
+## params$discern): Some strata appear at multiple axes.
+## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
+## params$discern): Some strata appear at multiple axes.
+

+
good_dmso_methods_res$DESeq2$alluvial_plot
+
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
+## params$discern): Some strata appear at multiple axes.
+## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
+## params$discern): Some strata appear at multiple axes.
+## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
+## params$discern): Some strata appear at multiple axes.
+

+
good_dmso_methods_res$edgeR$alluvial_plot
+
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
+## params$discern): Some strata appear at multiple axes.
+## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
+## params$discern): Some strata appear at multiple axes.
+## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
+## params$discern): Some strata appear at multiple axes.
+

+
good_dmso_methods_res$limma_trend$alluvial_plot
+
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
+## params$discern): Some strata appear at multiple axes.
+## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
+## params$discern): Some strata appear at multiple axes.
+## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
+## params$discern): Some strata appear at multiple axes.
+

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+

Comparison

+

To compare DEGs with different replicate numbers and different +methods

+
methods <- c("limma_voom", "DESeq2", "edgeR", "limma_trend")
+
+get_jaccard <- function(deg_set, drugseq_deg){
+  intersection <- length(intersect(deg_set, drugseq_deg))
+  union <- length(union(deg_set, drugseq_deg))
+  jaccard_index <- intersection / union
+  return(jaccard_index)
+}
+
+jaccard_index <- lapply(methods, function(m){
+  # all dmso
+  degs <- methods_res[[m]]$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull()
+  jaccard_all <- get_jaccard(degs, drugseq_deg)
+  # five dmso
+  degs <- five_dmso_methods_res[[m]]$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull()
+  jaccard_five <- get_jaccard(degs, drugseq_deg)
+  # three dmso
+  degs <- good_dmso_methods_res[[m]]$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull()
+  jaccard_three <- get_jaccard(degs, drugseq_deg)
+  jaccard_index <- data.frame(
+    method = m,
+    jaccard_all = jaccard_all,
+    jaccard_five = jaccard_five,
+    jaccard_three = jaccard_three
+  )
+  return(jaccard_index)
+})
+
+df <- as.data.frame(do.call(rbind, jaccard_index))
+rownames(df) <- df$method
+df <- df %>% select(-method)
+colnames(df) <- c("All DMSO", "macpie: 15 DMSO", "DRUGseq: 6 DMSO")
+pheatmap::pheatmap(df,
+         cluster_rows = FALSE,
+         cluster_cols = FALSE,
+         display_numbers = TRUE,
+         main = "Jaccard Index between macpie DEGs and DRUGseq DEGs")
+

+
library(UpSetR)
+all_dmso <- list(
+  limma_voom = methods_res$limma_voom$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
+  DESeq2 = methods_res$DESeq2$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
+  edgeR = methods_res$edgeR$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
+  limma_trend = methods_res$limma_trend$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
+  DRUGseq = drugseq_deg
+)
+upset(fromList(all_dmso), 
+      nsets = 5, 
+      order.by = "freq",
+      main.bar.color = "black",
+      sets.bar.color = "gray23",
+      text.scale = c(2, 2, 2, 1.5, 2, 1.5))
+

+
five_dmso <- list(
+  limma_voom = five_dmso_methods_res$limma_voom$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
+  DESeq2 = five_dmso_methods_res$DESeq2$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
+  edgeR = five_dmso_methods_res$edgeR$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
+  limma_trend = five_dmso_methods_res$limma_trend$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
+  DRUGseq = drugseq_deg
+)
+upset(fromList(five_dmso), 
+      nsets = 5, 
+      order.by = "freq",
+      main.bar.color = "black",
+      sets.bar.color = "gray23",
+      text.scale = c(2, 2, 2, 1.5, 2, 1.5))
+

+
good_dmso <- list(
+  limma_voom = good_dmso_methods_res$limma_voom$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
+  DESeq2 = good_dmso_methods_res$DESeq2$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
+  edgeR = good_dmso_methods_res$edgeR$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
+  limma_trend = good_dmso_methods_res$limma_trend$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
+  DRUGseq = drugseq_deg
+)
+upset(fromList(good_dmso), 
+      nsets = 5, 
+      order.by = "freq",
+      main.bar.color = "black",
+      sets.bar.color = "gray23",
+      text.scale = c(2, 2, 2, 1.5, 2, 1.5))
+

+
+ + + +
+
+ +
+ + + + + + + + + + + + + + + + diff --git a/vignettes/DRUGseq_first500runs.Rmd b/vignettes/DRUGseq_first500runs.Rmd new file mode 100644 index 0000000..3878bba --- /dev/null +++ b/vignettes/DRUGseq_first500runs.Rmd @@ -0,0 +1,652 @@ +--- +title: "DRUG-seq data analysis pipeline" +output: + html_document: + toc: true + toc_float: true +--- + +```{r} + +suppressPackageStartupMessages({ + library(dplyr); library(tidyr); library(limma); library(edgeR) +}) +``` + + +```{r} +library(stringr) +library(DT) +library(tidyr) +library(dplyr) +library(foreach) +library(limma) +library(edgeR) +library(umap) +library(data.table) +library(ggplot2) +library(GGally) +library(plotly) +library(ggrepel) +library(viridis) +library(RANN) +library(igraph) +library(RColorBrewer) +library(patchwork) +``` + +# DRUGseq data analysis pipeline - DMSO selection + +## data path setup +```{r setup,include=FALSE} +dir <- "/Users/liuxin/macpie_Dev/" +setwd(paste0(dir,"/DRUGseqData/DRUG-seq-main")) +analysis_dir <- getwd() +# dataset <- "flowcell_4000_UMI_decode_VH02001612.RData" +input_data_dir <- file.path(analysis_dir,'data') +knitr::opts_knit$set(root.dir = analysis_dir) +out_data_dir <- file.path(analysis_dir,'out') +if(!dir.exists(out_data_dir)) {dir.create(out_data_dir)} +``` + +**project folder:** +`r analysis_dir` + + +## 1.2 functions +```{r limma toptable} +get_union_toptable <- function(fit3) { + contrast_names <- colnames(coef(fit3)) + n_contrasts <- length(contrast_names) + + tt <- foreach(i=1:n_contrasts, .combine=rbind) %do% + { topTable(fit3, coef = i, + p.value = 1, number = Inf) %>% + mutate(GeneId=rownames(.), contrast=colnames(coef(fit3))[i]) } + + tt$contrast <- factor(tt$contrast, levels=contrast_names) + return(tt) +} +``` + + +```{r DE.limma trend} +DE.limma_trend<-function(Exp=Exp,CTRL,meta,best_DMSO=NULL,random_run=FALSE){ + if(!is.null(best_DMSO)){ + print('Starting DE calculations using best DMSO.') + }else{print('Starting DE calculations.') + } + + de<-lapply(1:length(Exp), function(i){ + b<-names(Exp)[i] + meta_sub<- meta %>% filter(batch==b) %>% mutate(plate_barcode_short = substr(plate_barcode, nchar(plate_barcode)-2, nchar(plate_barcode))) + if(!is.null(best_DMSO)) { + meta_sub <- meta_sub %>% filter(treatment!=CTRL|barcode_well %in% best_DMSO$barcode_well) #only use best DMSO as control + } + + rownames(meta_sub)<-meta_sub$plate_id + UMI_batch<-lapply(1:length(Exp[[i]]), function(j){ + #print(paste0('i ',i,' / j ',j)) + x<-Exp[[i]][[j]] + mat<-x$UMI.counts + mat<-mat[!rownames(mat) %in% grep("ERCC-",rownames(mat),value = T),] + mat<-mat[order(row.names(mat)),] + return(mat) + }) + # active checking that the objects in the list have exactly the same names + unames <- unique(unlist(lapply(UMI_batch, rownames))) + stopifnot( all( sapply(UMI_batch, function(x) identical(rownames(x), unames)) ) ) + UMI_batch<-do.call(cbind,UMI_batch) + + + UMI_batch<-UMI_batch[,meta_sub$plate_id] + clean<-UMI_batch[apply(UMI_batch,1,max)>quantile(apply(UMI_batch,1,max),0.75),] + clean<-clean[,colSums(clean)>0] + + d<-DGEList(counts=clean) + d <- calcNormFactors(d, method="TMM") + clean.TMM<-log2(edgeR::cpm(d, normalized.lib.sizes=T,log=F)+1) + + meta_sub<-meta_sub[colnames(clean.TMM),] + meta_sub$Sample<-sapply(meta_sub$Sample, function(k) gsub("-",".",k)) + + + + design<-model.matrix(~0+factor(meta_sub$Sample)+factor(meta_sub$replicate)) + colnames(design)<-c(levels(factor(meta_sub$Sample)),levels(factor(meta_sub$replicate))[-1]) + rownames(design)<-colnames(clean) + samples<-unique(meta_sub$Sample) + ctrl<-grep(CTRL,samples,value = T) + #print(ctrl) + samples<-samples[!samples %in% ctrl] + + contrasts<-sapply(samples, function(ctrs)paste0(ctrs,"-",ctrl)) + contrast.matrix<-limma::makeContrasts(contrasts=contrasts, levels = design) + + fit = limma::lmFit(clean.TMM, design) + fit2 = limma::contrasts.fit(fit,contrast.matrix) + fit3 = limma::eBayes(fit2, trend = T) + + DE.table <- get_union_toptable(fit3) %>% + separate(contrast,c('contrast','ctrl_arm'),sep="-") %>% + select(-ctrl_arm) %>% + merge(meta_sub %>% + filter(!treatment ==CTRL) %>% + select(plate_barcode_short,treatment,treatment_dose,timepoint,Sample,plate_well,batch) %>% + distinct() %>% + group_by(treatment,treatment_dose,timepoint,Sample,batch) %>% + summarise(plate_well = paste(paste(plate_barcode_short,plate_well,sep = " "), collapse=",")), + sort=FALSE, by.x='contrast',by.y='Sample') %>% mutate(Sample=contrast) # combine metadata with toptable + + }) + if (random_run==TRUE){ + de<-do.call(rbind,de) %>% + filter(Sample=="RC_to_SA") %>% + group_by(batch,plate_well) %>% + filter(abs(logFC)>1,adj.P.Val<0.1) %>% + summarise(count=n()) %>% + mutate(run_no=n,contrast='RC_to_SA') %>% + ungroup() + }else{ + de<-do.call(rbind,de) + } + return(de) +} +``` + +```{r volcano plot} +vpPlot_DElabel <- function(this_contrast_toptabel,batch_select=1,this_arms){ + + pcut<-0.1 + #color= black,red, blue + # 0 1 2 + color_custum<-c(rgb(0,0,0,0.5),rgb(1,0,0,1),rgb(0,0,1,1)) + + this_contrast_toptabel <- this_contrast_toptabel %>% + filter(treatment %in% this_arms,batch==batch_select) %>% + mutate(label=0) + + + + high_concentration<-this_contrast_toptabel %>% select(treatment_dose) %>% max() + + this_contrast_toptabel_plot<-NULL + + for (SampleName_lib_temp in this_arms) { + upregulate_gene <- this_contrast_toptabel %>% + filter(treatment == SampleName_lib_temp) %>% + filter(treatment_dose==high_concentration, adj.P.Val1) %>% + select(GeneId) %>% + .$GeneId + downregulate_gene <-this_contrast_toptabel %>% + filter(treatment == SampleName_lib_temp) %>% + filter(treatment_dose==high_concentration, adj.P.Val% + select(GeneId) %>% + .$GeneId + + this_contrast_toptabel_plot <- rbind(this_contrast_toptabel_plot,this_contrast_toptabel + %>% filter(treatment==SampleName_lib_temp) %>% + mutate(label=ifelse(GeneId %in% upregulate_gene,1,label)) %>% + mutate(label=ifelse(GeneId %in% downregulate_gene,2,label))) + } + + this_contrast_toptabel_plot$label<-as.factor(this_contrast_toptabel_plot$label) + levels(this_contrast_toptabel_plot$label)<-c("not significant ","upregulated genes","downregulated genes") + this_contrast_toptabel_plot$treatment<-factor(this_contrast_toptabel_plot$treatment) + + + + vp<-this_contrast_toptabel_plot %>% arrange(label) %>% + ggplot(aes(x=logFC, y=-log10(adj.P.Val), color=label))+ + scale_color_manual(values = color_custum)+ + geom_point(fill="white",shape=21, size=1, stroke=0.5) + + geom_abline(slope = 0, intercept = -log10(pcut), size=0.3, linetype='dashed') + + geom_vline(xintercept = -1, size=0.3, linetype='dashed') + + geom_vline(xintercept = 1, size=0.3, linetype='dashed') + + xlim(-8,8)+ + theme_bw() + + theme(axis.text.x = element_text(face="bold",size=14), + axis.text.y = element_text(face="bold", size=10), + axis.title=element_text(size=14,face="bold"), + legend.text=element_text(size=12) ) + + facet_grid(treatment~treatment_dose)+ + theme(strip.text = element_text(face="bold", size=12)) + + return(vp) + } +``` + +```{r VennDiagram} +## works up to 3 groups comparison now +VD_plot <- function(this_contrast_toptable,batch_select=1){ + this_contrast_toptable_plot <- this_contrast_toptable %>% + mutate(sig=ifelse(abs(logFC)>1 & adj.P.Val<0.1,-1,0)) %>% + mutate(sig=ifelse(logFC>1 & adj.P.Val<0.1,1,sig)) %>% + mutate(contrast=paste0(treatment_dose,'uM')) %>% + select(contrast,GeneId,sig) %>% + reshape2::dcast(GeneId~contrast,value.var="sig") + + + rownames(this_contrast_toptable_plot) <- this_contrast_toptable_plot$GeneId + return(this_contrast_toptable_plot %>% select(-GeneId) %>% vennDiagram(circle.col=c("turquoise", "salmon","palegreen"),cex=1)) + +} +``` + + +```{r UMAP} +pca_umap <- function(counts=NULL,center_pca=T,metric="euclidean", scale_pca=T,n_neighbors=15,n_pca=20){ + + pca<-prcomp(t(counts), scale = scale_pca, center = center_pca) + rownames(pca$x) <- colnames(counts) + umap <- umap(pca$x[,1:n_pca],method = "umap-learn",metric=metric, n_neighbors=n_neighbors) + rownames(umap$layout) <- colnames(counts) + return(umap) + + +} + +``` + +```{r customized pair plot} +ggpairs_custom<-function(toptable_meta,this_arms,p_cut,corr_size=3,batch_s=1,strip_size=NULL,label_size=NULL,exclusive_gene=NULL){ + + toptable_meta=toptable_meta %>% mutate(contrast=paste(treatment,treatment_dose,timepoint,sep="_")) + this_contrast=toptable_meta %>% + filter(Sample %in% this_arms,batch==batch_s) %>% + arrange(treatment_dose) %>% + select(contrast) %>% + distinct() %>%.$contrast + p_cut <- p_cut + + +if (is.null(exclusive_gene)) { +valid_genes <- toptable_meta %>% + filter(Sample %in% this_arms) %>% + filter(batch==batch_s) %>% + filter(adj.P.Val% + .$GeneId %>% + unique() +} else { + + valid_genes <- toptable_meta %>% + filter(Sample %in% this_arms) %>% + filter(batch==batch_s) %>% + filter(adj.P.Val% + .$GeneId %>% + unique() %>% + setdiff(exclusive_gene) + +} + +table_to_plot1<-toptable_meta %>% + filter(Sample %in% this_arms) %>% + filter(batch==batch_s) %>% + filter(GeneId %in% valid_genes) %>% + dplyr::select(GeneId, contrast, logFC) %>% + setDT %>% + data.table::dcast(GeneId ~ contrast, value.var = 'logFC') %>% + as.data.frame() + +table_to_plot1<-table_to_plot1[,c("GeneId",this_contrast)] +rownames(table_to_plot1)<-table_to_plot1$GeneId + +table_to_plot2<-toptable_meta %>% + filter(Sample %in% this_arms) %>% + filter(batch==batch_s) %>% + filter(adj.P.Val% + dplyr::select(GeneId, contrast, logFC) %>% + setDT %>% + data.table::dcast(GeneId ~ contrast, value.var = 'logFC') %>% + as.data.frame() + +missing_col<-setdiff(colnames(table_to_plot1),colnames(table_to_plot2)) +if (!identical(missing_col, character(0))) { + missing_mat<-data.frame(matrix(0, nrow = nrow(table_to_plot2), ncol =as.numeric( length(missing_col)))) + missing_mat[missing_mat==0]=NA + colnames(missing_mat)<-missing_col + table_to_plot2<-cbind(table_to_plot2,missing_mat) +} + +table_to_plot2<-table_to_plot2[,c("GeneId",this_contrast)] + +data_plot<-inner_join(table_to_plot1,table_to_plot2,by="GeneId") + + + + rownames(data_plot)<-data_plot$GeneId + data_plot<-data_plot %>% select(-"GeneId") + + axis_limt<-max(abs(data_plot),na.rm = TRUE) + + diagfun <- function(data,mapping){ + ggplot(data = data, mapping = mapping)+ + geom_density(color="darkblue", fill="lightblue") + + scale_x_continuous(limits = c(-axis_limt, axis_limt)) + # scale_y_continuous(limits = c(-axis_limt, axis_limt)) + } + + corr_heatmap <- function(data, mapping, method="p", use="pairwise", ...){ + + # grab data + x <- eval_data_col(data, mapping$x) + y <- eval_data_col(data, mapping$y) + + # calculate correlation + corr <- cor(x, y, method=method, use=use) + colFn <- colorRampPalette(c("blue", "white", "red"), interpolate ='spline') + fill <- colFn(100)[findInterval(corr, seq(-1, 1, length=100))] + ggally_cor(data = data, mapping = mapping, ...) + + theme_classic()+ + theme(panel.background = element_rect(fill=fill,colour = fill)) + } + + + pm<-ggpairs(data_plot,c(1:as.numeric(length(colnames(data_plot))/2)),columnLabels=colnames(table_to_plot1)[-1],diag=list(continuous = wrap(diagfun)), + upper = list(continuous = corr_heatmap),progress = F)+theme(axis.text = element_text(size = label_size),strip.placement = "outside", + text = element_text(size = strip_size) ) + + method<-"pearson" + pm2 <- pm + color_custum<-c(rgb(0,0,0,0.5),rgb(0,1,0,0.7),rgb(0,0,1,0.7),rgb(1,0,0,1)) + label_list<-c('not significant','x axis DE genes','y axis DE genes','shared DE genes') + for(i in 2:pm$nrow) { + for(j in 1:(i-1)) { + + x <- GGally::eval_data_col(pm[i,j]$data, pm[i,j]$mapping$x) + y <- GGally::eval_data_col(pm[i,j]$data, pm[i,j]$mapping$y) + corr<- cor(x,y,method=method,use="na.or.complete") + col_name_x_x<-paste0(gsub("\\.[^\\.]*$","",pm[i,j]$labels$x),".x") + col_name_y_x<-paste0(gsub("\\.[^\\.]*$","",pm[i,j]$labels$y),".x") + col_name_x_y<-paste0(gsub("\\.[^\\.]*$","",pm[i,j]$labels$x),".y") + col_name_y_y<-paste0(gsub("\\.[^\\.]*$","",pm[i,j]$labels$y),".y") + col_name<-c(col_name_x_x,col_name_y_x,col_name_x_y,col_name_y_y) + data_sub<-pm$data[,col_name] + + data_sub<-data_sub %>% + mutate(label = ifelse(!is.na(!!sym(col_name_x_y)) & + abs(!!sym(col_name_x_y))>1,1,0)) %>% + mutate(label = ifelse(!is.na(!!sym(col_name_y_y)) & + abs(!!sym(col_name_y_y))>1,2,label)) %>% + mutate(label = ifelse(!is.na(!!sym(col_name_x_y)) & + abs(!!sym(col_name_x_y))>1 & + !is.na(!!sym(col_name_y_y))& + abs(!!sym(col_name_y_y))>1 ,3,label)) %>% + arrange(label) + + unique_label<-sort(unique(data_sub$label)) + data_sub$label<-as.factor(data_sub$label) + levels(data_sub$label)<-label_list[unique_label+1] + + p<-ggplot(data_sub,aes(x=!!sym(pm[i,j]$labels$x),y=!!sym(pm[i,j]$labels$y),color=label))+ + geom_point(size=2,shape=21,stroke=0.5,fill="white")+theme_bw()+scale_color_manual(values = color_custum[unique_label+1])+ scale_shape(solid = TRUE) + + scale_x_continuous(limits = c(-axis_limt, axis_limt)) + + scale_y_continuous(limits = c(-axis_limt, axis_limt)) + + geom_abline(slope = 0, intercept = -1, size=0.3, linetype='dashed') + + geom_abline(slope = 0, intercept = 1, size=0.3, linetype='dashed') + + geom_vline(xintercept = -1, size=0.3, linetype='dashed') + + geom_vline(xintercept = 1, size=0.3, linetype='dashed') + pm2[i,j]=p+geom_label(data = data.frame(xlabel = -axis_limt, + ylabel = max(axis_limt, na.rm = TRUE), + lab = round(corr, digits = 3)), + mapping = ggplot2::aes(x = xlabel, y = ylabel, label = lab), + hjust = 0, vjust = 1,size = corr_size, fontface = "bold", + inherit.aes = FALSE # do not inherit anything from the ... + ) + + + } + } + return(pm2) +} + + + + +``` + + +### UMI count table preparation +```{r count matrix,message = FALSE} +meta <- read.csv(file.path(input_data_dir,'meta.csv'),stringsAsFactors = F) %>% + filter(plate_barcode %in% c("VH02001612", "VH02001614", "VH02001618")) +Exp<-lapply(unique(meta$batch), function(x){ + b<-unique(meta[meta$batch==x,]$plate_barcode) + res<-lapply(b, function(y){ + load(file.path(input_data_dir, paste0("flowcell_4000_UMI_decode_",y,".RData")),verbose=T) + annot<-meta[meta$plate_barcode==y,] + rownames(annot) <- apply( annot[ , c('plate_barcode','well_index') ] , 1 , paste , collapse = "_" ) + UMI_decode<-UMI_decode[!rownames(UMI_decode) %in% grep("ERCC-",rownames(UMI_decode),value = T),] + UMI_annot<-list(UMI_decode,annot) + names(UMI_annot)<-c("UMI.counts", "Annotation") + return(UMI_annot) + }) + names(res)<-b + return(res) +}) +names(Exp)<-unique(meta$batch) + +save(Exp, file=file.path(out_data_dir,"Exp_Init.RData"),compress = TRUE) + +``` +```{r} +check_threads <- function() { + cat("Hardware cores (logical/physical):", + parallel::detectCores(TRUE), "/", parallel::detectCores(FALSE), "\n") + cat("foreach backend/workers: ", + foreach::getDoParName(), "/", foreach::getDoParWorkers(), "\n", sep = "") + if (requireNamespace("future", quietly = TRUE)) { + cat("future plan/workers: ", capture.output(future::plan()), "\n", sep = "") + cat("future nbrOfWorkers: ", future::nbrOfWorkers(), "\n", sep = "") + } + if (requireNamespace("data.table", quietly = TRUE)) { + cat("data.table threads: ", data.table::getDTthreads(), "\n", sep = "") + } + if (requireNamespace("RhpcBLASctl", quietly = TRUE)) { + cat("BLAS threads: ", RhpcBLASctl::blas_get_num_procs(), "\n", sep = "") + cat("OMP max threads: ", RhpcBLASctl::omp_get_max_threads(), "\n", sep = "") + } else { + envs <- Sys.getenv(c("OPENBLAS_NUM_THREADS","MKL_NUM_THREADS","OMP_NUM_THREADS","VECLIB_MAXIMUM_THREADS")) + cat("BLAS/OMP env vars:\n"); print(envs[envs != ""]) + } +} +check_threads() + +``` + + +# 2 Step1 +## 2.1 run first 500 random run +```{r run using foreach, message=FALSE} + +step1_func <- function(n){ + + CTRL="DMSO" + rand_runs=TRUE + + meta <- read.csv(file.path(input_data_dir,'meta.csv'),stringsAsFactors = F) %>% + mutate(replicate=paste0('rep',plate_replicate),plate_id=paste0(plate_barcode,"_",well_index)) %>% + filter(plate_barcode %in% c("VH02001612", "VH02001614", "VH02001618")) + load(file.path(out_data_dir, 'Exp_Init.RData'),verbose = T) + + DMSO_sample <- meta %>% filter(treatment==CTRL) %>% select(batch,plate_barcode,plate_well) %>% + group_by(plate_barcode) %>% + do(sample_n(.,1)) %>% + mutate(Sample="RC_to_SA",treatment="RC_to_SA") + + + meta <- meta %>% left_join(DMSO_sample,by=c('batch','plate_barcode','plate_well'),suffix=c("","_randR")) %>% + mutate(Sample=ifelse(is.na(Sample_randR),Sample,Sample_randR)) %>% + mutate(treatment=ifelse(is.na(treatment_randR),treatment,treatment_randR)) + + DE<-DE.limma_trend(Exp=Exp,CTRL=CTRL,meta=meta,random_run =rand_runs) +} +# DE_count_500 <- foreach(n=1:10,.packages=c("dplyr","limma","tidyr","foreach","edgeR"),.combine=rbind) %do% step1_func(n) + +``` + +```{r, message=FALSE, warning=FALSE,results='hide'} +options(dplyr.summarise.inform = FALSE) +# ---- helper to pretty print seconds ---- +fmt_secs <- function(x) { + if (is.na(x)) return("NA") + if (x < 60) return(sprintf("%.2fs", x)) + if (x < 3600) return(sprintf("%dm %.1fs", floor(x/60), x %% 60)) + sprintf("%dh %dm %.0fs", floor(x/3600), floor((x%%3600)/60), x%%60) +} + +iters <- 1:50 + +# ---- total wall time + per-iteration timings ---- +# total_tm <- system.time({ +# # return a list of {data, iter, seconds} per iteration +# lst <- foreach( +# n = iters, +# .packages = c("dplyr","limma","tidyr","foreach","edgeR"), +# .combine = "c" # concatenate list-of-lists +# ) %do% { +# t0 <- proc.time()[["elapsed"]] +# res <- step1_func(n) +# dt <- proc.time()[["elapsed"]] - t0 +# list(list(data = res, iter = n, seconds = dt)) +# } +# }) + +# extract results and timings +# iter_times <- vapply(lst, function(x) x$seconds, numeric(1)) +# DE_count_500 <- dplyr::bind_rows(lapply(lst, function(x) x$data)) + + + +``` + + + + + + + + + + + + + + + + + + + + + + + + + + + +```{r, message=FALSE, warning=FALSE,results='hide'} + +# ---- RAM usage ---- +# remotes::install_github("tpq/peakRAM") +library(peakRAM) +res1 <- peakRAM({ + DE_count_500 <- foreach(n = iters, + .packages = c("dplyr","limma","tidyr","foreach","edgeR"), + .combine = rbind) %do% step1_func(n) +}) + +res2 <- peakRAM({ + DE_count_500 <- foreach(n = iters, + .packages = c("dplyr","limma","tidyr","foreach","edgeR"), + .combine = rbind) %do% step1_func(n) +}) + +res3 <- peakRAM({ + DE_count_500 <- foreach(n = iters, + .packages = c("dplyr","limma","tidyr","foreach","edgeR"), + .combine = rbind) %do% step1_func(n) +}) +``` + + +```{r} +res1 + +res2 + +res3 +``` + + + + +```{r, message=FALSE, warning=FALSE,results='hide'} +if (!requireNamespace("ps", quietly = TRUE)) install.packages("ps") +get_rss_mb <- function() ps::ps_memory_info(ps::ps_handle())[1] / 1024^2 + +rss_before <- get_rss_mb() +tm1 <- system.time({ + DE_count_500 <- foreach(n = iters, + .packages = c("dplyr","limma","tidyr","edgeR"), + .combine = rbind) %do% step1_func(n) +}) +rss_after <- get_rss_mb() +cat(sprintf("Elapsed: %.1fs | RSS before→after: %.0f→%.0f MiB (Δ %.0f MiB)\n", + tm1["elapsed"], rss_before, rss_after, rss_after - rss_before)) + + +``` + +```{r} +tm1 + +``` + + +```{r, message=FALSE, warning=FALSE,results='hide'} +if (!requireNamespace("ps", quietly = TRUE)) install.packages("ps") +get_rss_mb <- function() ps::ps_memory_info(ps::ps_handle())[1] / 1024^2 + +rss_before <- get_rss_mb() +tm2 <- system.time({ + DE_count_500 <- foreach(n = iters, + .packages = c("dplyr","limma","tidyr","edgeR"), + .combine = rbind) %do% step1_func(n) +}) +rss_after <- get_rss_mb() +cat(sprintf("Elapsed: %.1fs | RSS before→after: %.0f→%.0f MiB (Δ %.0f MiB)\n", + tm2["elapsed"], rss_before, rss_after, rss_after - rss_before)) + + +``` + +```{r} +tm2 + +``` + + +```{r, message=FALSE, warning=FALSE,results='hide'} +if (!requireNamespace("ps", quietly = TRUE)) install.packages("ps") +get_rss_mb <- function() ps::ps_memory_info(ps::ps_handle())[1] / 1024^2 + +rss_before <- get_rss_mb() +tm3 <- system.time({ + DE_count_500 <- foreach(n = iters, + .packages = c("dplyr","limma","tidyr","edgeR"), + .combine = rbind) %do% step1_func(n) +}) +rss_after <- get_rss_mb() +cat(sprintf("Elapsed: %.1fs | RSS before→after: %.0f→%.0f MiB (Δ %.0f MiB)\n", + tm3["elapsed"], rss_before, rss_after, rss_after - rss_before)) + + +``` + +```{r} +tm3 + +``` + diff --git a/vignettes/DRUGseq_first500runs.html b/vignettes/DRUGseq_first500runs.html new file mode 100644 index 0000000..5ad6b4b --- /dev/null +++ b/vignettes/DRUGseq_first500runs.html @@ -0,0 +1,2176 @@ + + + + + + + + + + + + + +DRUG-seq data analysis pipeline + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+
+
+
+
+ +
+ + + + + + + +
suppressPackageStartupMessages({
+  library(dplyr); library(tidyr); library(limma); library(edgeR)
+})
+
library(stringr)
+library(DT)
+library(tidyr)
+library(dplyr)
+library(foreach)
+library(limma)
+library(edgeR)
+library(umap)
+library(data.table)
+
## 
+## Attaching package: 'data.table'
+
## The following objects are masked from 'package:dplyr':
+## 
+##     between, first, last
+
library(ggplot2)
+library(GGally)
+
## Registered S3 method overwritten by 'GGally':
+##   method from   
+##   +.gg   ggplot2
+
library(plotly)
+
## 
+## Attaching package: 'plotly'
+
## The following object is masked from 'package:ggplot2':
+## 
+##     last_plot
+
## The following object is masked from 'package:stats':
+## 
+##     filter
+
## The following object is masked from 'package:graphics':
+## 
+##     layout
+
library(ggrepel)
+library(viridis)
+
## Loading required package: viridisLite
+
library(RANN)
+library(igraph)
+
## 
+## Attaching package: 'igraph'
+
## The following object is masked from 'package:plotly':
+## 
+##     groups
+
## The following object is masked from 'package:tidyr':
+## 
+##     crossing
+
## The following objects are masked from 'package:dplyr':
+## 
+##     as_data_frame, groups, union
+
## The following objects are masked from 'package:stats':
+## 
+##     decompose, spectrum
+
## The following object is masked from 'package:base':
+## 
+##     union
+
library(RColorBrewer)
+library(patchwork)
+
+

DRUGseq data analysis pipeline - DMSO selection

+
+

data path setup

+

project folder:
+/Users/liuxin/macpie_dev/DRUGseqData/DRUG-seq-main

+
+
+

1.2 functions

+
get_union_toptable <- function(fit3) {
+  contrast_names  <-  colnames(coef(fit3))
+  n_contrasts  <-  length(contrast_names)
+  
+  tt <-  foreach(i=1:n_contrasts, .combine=rbind) %do%
+  { topTable(fit3, coef = i, 
+             p.value = 1, number = Inf) %>% 
+      mutate(GeneId=rownames(.), contrast=colnames(coef(fit3))[i]) }
+  
+  tt$contrast  <-  factor(tt$contrast, levels=contrast_names)
+  return(tt)
+}
+
DE.limma_trend<-function(Exp=Exp,CTRL,meta,best_DMSO=NULL,random_run=FALSE){
+  if(!is.null(best_DMSO)){ 
+    print('Starting DE calculations using best DMSO.')
+    }else{print('Starting DE calculations.')
+    }
+  
+  de<-lapply(1:length(Exp), function(i){
+    b<-names(Exp)[i]
+    meta_sub<- meta %>% filter(batch==b) %>% mutate(plate_barcode_short = substr(plate_barcode, nchar(plate_barcode)-2, nchar(plate_barcode)))
+    if(!is.null(best_DMSO)) {
+      meta_sub <- meta_sub %>% filter(treatment!=CTRL|barcode_well %in% best_DMSO$barcode_well) #only use best DMSO as control
+    } 
+
+    rownames(meta_sub)<-meta_sub$plate_id
+    UMI_batch<-lapply(1:length(Exp[[i]]), function(j){
+      #print(paste0('i ',i,' / j ',j))
+      x<-Exp[[i]][[j]]
+      mat<-x$UMI.counts
+      mat<-mat[!rownames(mat) %in% grep("ERCC-",rownames(mat),value = T),]
+      mat<-mat[order(row.names(mat)),]
+      return(mat)
+      })
+    # active checking that the objects in the list have exactly the same names
+    unames <- unique(unlist(lapply(UMI_batch, rownames)))
+    stopifnot( all( sapply(UMI_batch, function(x) identical(rownames(x), unames)) ) )
+    UMI_batch<-do.call(cbind,UMI_batch)
+
+    UMI_batch<-UMI_batch[,meta_sub$plate_id]
+    clean<-UMI_batch[apply(UMI_batch,1,max)>quantile(apply(UMI_batch,1,max),0.75),] 
+    clean<-clean[,colSums(clean)>0] 
+
+    d<-DGEList(counts=clean)
+    d <- calcNormFactors(d, method="TMM")
+    clean.TMM<-log2(edgeR::cpm(d,  normalized.lib.sizes=T,log=F)+1) 
+
+    meta_sub<-meta_sub[colnames(clean.TMM),]
+    meta_sub$Sample<-sapply(meta_sub$Sample, function(k) gsub("-",".",k))
+
+    design<-model.matrix(~0+factor(meta_sub$Sample)+factor(meta_sub$replicate))
+    colnames(design)<-c(levels(factor(meta_sub$Sample)),levels(factor(meta_sub$replicate))[-1])
+    rownames(design)<-colnames(clean)
+    samples<-unique(meta_sub$Sample)
+    ctrl<-grep(CTRL,samples,value = T)
+    #print(ctrl)
+    samples<-samples[!samples %in% ctrl]
+
+    contrasts<-sapply(samples, function(ctrs)paste0(ctrs,"-",ctrl))
+    contrast.matrix<-limma::makeContrasts(contrasts=contrasts, levels = design)
+
+    fit = limma::lmFit(clean.TMM, design)
+    fit2 = limma::contrasts.fit(fit,contrast.matrix)
+    fit3 = limma::eBayes(fit2,  trend = T)
+
+    DE.table <- get_union_toptable(fit3) %>% 
+    separate(contrast,c('contrast','ctrl_arm'),sep="-") %>% 
+    select(-ctrl_arm) %>% 
+    merge(meta_sub %>% 
+            filter(!treatment ==CTRL) %>% 
+            select(plate_barcode_short,treatment,treatment_dose,timepoint,Sample,plate_well,batch) %>% 
+            distinct() %>% 
+            group_by(treatment,treatment_dose,timepoint,Sample,batch) %>% 
+            summarise(plate_well = paste(paste(plate_barcode_short,plate_well,sep = " "), collapse=",")),
+          sort=FALSE, by.x='contrast',by.y='Sample') %>% mutate(Sample=contrast) # combine metadata with toptable
+    
+  })
+  if (random_run==TRUE){
+    de<-do.call(rbind,de) %>% 
+      filter(Sample=="RC_to_SA") %>% 
+      group_by(batch,plate_well) %>%
+      filter(abs(logFC)>1,adj.P.Val<0.1) %>%
+      summarise(count=n()) %>%
+      mutate(run_no=n,contrast='RC_to_SA') %>% 
+      ungroup() 
+    }else{
+    de<-do.call(rbind,de) 
+    }
+  return(de)
+}
+
vpPlot_DElabel <- function(this_contrast_toptabel,batch_select=1,this_arms){
+  
+  pcut<-0.1
+  #color= black,red, blue
+  #        0     1      2   
+  color_custum<-c(rgb(0,0,0,0.5),rgb(1,0,0,1),rgb(0,0,1,1))
+  
+  this_contrast_toptabel <- this_contrast_toptabel %>% 
+    filter(treatment %in% this_arms,batch==batch_select) %>% 
+    mutate(label=0)
+  
+  
+
+  high_concentration<-this_contrast_toptabel %>% select(treatment_dose) %>% max() 
+  
+  this_contrast_toptabel_plot<-NULL
+
+  for (SampleName_lib_temp in this_arms) { 
+    upregulate_gene <- this_contrast_toptabel %>% 
+      filter(treatment == SampleName_lib_temp) %>% 
+      filter(treatment_dose==high_concentration, adj.P.Val<pcut, logFC>1) %>% 
+      select(GeneId) %>% 
+      .$GeneId
+    downregulate_gene <-this_contrast_toptabel %>% 
+    filter(treatment == SampleName_lib_temp) %>%
+    filter(treatment_dose==high_concentration, adj.P.Val<pcut, logFC< -1) %>% 
+    select(GeneId) %>% 
+    .$GeneId
+
+    this_contrast_toptabel_plot <- rbind(this_contrast_toptabel_plot,this_contrast_toptabel 
+                                      %>%  filter(treatment==SampleName_lib_temp) %>% 
+                                        mutate(label=ifelse(GeneId %in% upregulate_gene,1,label)) %>% 
+                                        mutate(label=ifelse(GeneId %in% downregulate_gene,2,label)))
+  }
+  
+  this_contrast_toptabel_plot$label<-as.factor(this_contrast_toptabel_plot$label)
+  levels(this_contrast_toptabel_plot$label)<-c("not significant ","upregulated genes","downregulated genes")
+  this_contrast_toptabel_plot$treatment<-factor(this_contrast_toptabel_plot$treatment)
+
+ 
+  
+  vp<-this_contrast_toptabel_plot %>% arrange(label) %>% 
+    ggplot(aes(x=logFC, y=-log10(adj.P.Val), color=label))+
+    scale_color_manual(values = color_custum)+
+    geom_point(fill="white",shape=21, size=1, stroke=0.5) +
+    geom_abline(slope = 0, intercept = -log10(pcut), size=0.3, linetype='dashed') +
+    geom_vline(xintercept = -1, size=0.3, linetype='dashed') +
+    geom_vline(xintercept = 1, size=0.3, linetype='dashed') +
+    xlim(-8,8)+
+    theme_bw() +
+    theme(axis.text.x = element_text(face="bold",size=14),
+          axis.text.y = element_text(face="bold", size=10),
+          axis.title=element_text(size=14,face="bold"),
+        legend.text=element_text(size=12) ) +
+    facet_grid(treatment~treatment_dose)+
+    theme(strip.text = element_text(face="bold", size=12))
+
+  return(vp)
+  }
+
## works up to 3 groups comparison now
+VD_plot <- function(this_contrast_toptable,batch_select=1){
+  this_contrast_toptable_plot <- this_contrast_toptable %>% 
+  mutate(sig=ifelse(abs(logFC)>1 & adj.P.Val<0.1,-1,0)) %>% 
+    mutate(sig=ifelse(logFC>1 & adj.P.Val<0.1,1,sig)) %>%
+    mutate(contrast=paste0(treatment_dose,'uM')) %>% 
+    select(contrast,GeneId,sig) %>% 
+    reshape2::dcast(GeneId~contrast,value.var="sig")
+  
+  
+  rownames(this_contrast_toptable_plot) <- this_contrast_toptable_plot$GeneId
+  return(this_contrast_toptable_plot %>% select(-GeneId) %>% vennDiagram(circle.col=c("turquoise", "salmon","palegreen"),cex=1))
+
+}
+
pca_umap <- function(counts=NULL,center_pca=T,metric="euclidean", scale_pca=T,n_neighbors=15,n_pca=20){
+  
+  pca<-prcomp(t(counts), scale = scale_pca, center = center_pca)
+  rownames(pca$x)  <-  colnames(counts)
+  umap  <-  umap(pca$x[,1:n_pca],method = "umap-learn",metric=metric, n_neighbors=n_neighbors)
+  rownames(umap$layout)  <-  colnames(counts)
+  return(umap)
+
+  
+}
+
ggpairs_custom<-function(toptable_meta,this_arms,p_cut,corr_size=3,batch_s=1,strip_size=NULL,label_size=NULL,exclusive_gene=NULL){
+  
+  toptable_meta=toptable_meta %>% mutate(contrast=paste(treatment,treatment_dose,timepoint,sep="_"))
+  this_contrast=toptable_meta %>% 
+    filter(Sample %in% this_arms,batch==batch_s) %>% 
+    arrange(treatment_dose) %>% 
+    select(contrast) %>% 
+    distinct() %>%.$contrast
+  p_cut <- p_cut
+
+  
+if (is.null(exclusive_gene)) {
+valid_genes <- toptable_meta %>% 
+  filter(Sample %in% this_arms) %>%
+  filter(batch==batch_s) %>%
+  filter(adj.P.Val<p_cut) %>%
+  .$GeneId %>%
+  unique()
+} else {
+  
+  valid_genes <- toptable_meta %>% 
+    filter(Sample %in% this_arms) %>%
+    filter(batch==batch_s) %>%
+    filter(adj.P.Val<p_cut) %>%
+  .$GeneId %>%
+  unique() %>% 
+    setdiff(exclusive_gene)
+  
+}
+
+table_to_plot1<-toptable_meta %>% 
+  filter(Sample %in% this_arms) %>%
+  filter(batch==batch_s) %>%
+  filter(GeneId %in% valid_genes) %>%
+  dplyr::select(GeneId, contrast, logFC) %>%
+  setDT %>%
+  data.table::dcast(GeneId ~ contrast, value.var = 'logFC') %>%
+  as.data.frame()
+
+table_to_plot1<-table_to_plot1[,c("GeneId",this_contrast)]
+rownames(table_to_plot1)<-table_to_plot1$GeneId
+
+table_to_plot2<-toptable_meta %>% 
+  filter(Sample %in% this_arms) %>%
+  filter(batch==batch_s) %>%
+  filter(adj.P.Val<p_cut) %>%
+  dplyr::select(GeneId, contrast, logFC) %>%
+  setDT %>%
+  data.table::dcast(GeneId ~ contrast, value.var = 'logFC') %>%
+  as.data.frame()
+
+missing_col<-setdiff(colnames(table_to_plot1),colnames(table_to_plot2))
+if (!identical(missing_col, character(0))) {
+  missing_mat<-data.frame(matrix(0, nrow = nrow(table_to_plot2), ncol =as.numeric( length(missing_col))))
+  missing_mat[missing_mat==0]=NA
+  colnames(missing_mat)<-missing_col
+  table_to_plot2<-cbind(table_to_plot2,missing_mat)
+}
+
+table_to_plot2<-table_to_plot2[,c("GeneId",this_contrast)]
+
+data_plot<-inner_join(table_to_plot1,table_to_plot2,by="GeneId")
+  
+  
+
+  rownames(data_plot)<-data_plot$GeneId
+  data_plot<-data_plot %>% select(-"GeneId")
+  
+  axis_limt<-max(abs(data_plot),na.rm = TRUE)
+  
+  diagfun <- function(data,mapping){
+    ggplot(data = data, mapping = mapping)+
+      geom_density(color="darkblue", fill="lightblue") +
+       scale_x_continuous(limits = c(-axis_limt, axis_limt)) 
+      # scale_y_continuous(limits = c(-axis_limt, axis_limt))
+    }
+  
+  corr_heatmap <- function(data, mapping, method="p", use="pairwise", ...){
+
+              # grab data
+              x <- eval_data_col(data, mapping$x)
+              y <- eval_data_col(data, mapping$y)
+
+              # calculate correlation
+              corr <- cor(x, y, method=method, use=use)
+              colFn <- colorRampPalette(c("blue", "white", "red"), interpolate ='spline')
+              fill <- colFn(100)[findInterval(corr, seq(-1, 1, length=100))]
+              ggally_cor(data = data, mapping = mapping, ...) + 
+                theme_classic()+
+                theme(panel.background = element_rect(fill=fill,colour = fill))
+  }
+  
+  
+  pm<-ggpairs(data_plot,c(1:as.numeric(length(colnames(data_plot))/2)),columnLabels=colnames(table_to_plot1)[-1],diag=list(continuous = wrap(diagfun)),
+              upper = list(continuous = corr_heatmap),progress = F)+theme(axis.text = element_text(size = label_size),strip.placement = "outside", 
+                                                                          text = element_text(size = strip_size) )
+  
+  method<-"pearson"
+  pm2 <- pm
+  color_custum<-c(rgb(0,0,0,0.5),rgb(0,1,0,0.7),rgb(0,0,1,0.7),rgb(1,0,0,1))
+  label_list<-c('not significant','x axis DE genes','y axis DE genes','shared DE genes')
+  for(i in 2:pm$nrow) {
+       for(j in 1:(i-1)) {
+         
+         x <- GGally::eval_data_col(pm[i,j]$data, pm[i,j]$mapping$x)
+         y <- GGally::eval_data_col(pm[i,j]$data, pm[i,j]$mapping$y)
+         corr<- cor(x,y,method=method,use="na.or.complete")
+         col_name_x_x<-paste0(gsub("\\.[^\\.]*$","",pm[i,j]$labels$x),".x")
+         col_name_y_x<-paste0(gsub("\\.[^\\.]*$","",pm[i,j]$labels$y),".x")
+         col_name_x_y<-paste0(gsub("\\.[^\\.]*$","",pm[i,j]$labels$x),".y")
+         col_name_y_y<-paste0(gsub("\\.[^\\.]*$","",pm[i,j]$labels$y),".y")
+         col_name<-c(col_name_x_x,col_name_y_x,col_name_x_y,col_name_y_y)
+         data_sub<-pm$data[,col_name]
+         
+         data_sub<-data_sub %>% 
+           mutate(label = ifelse(!is.na(!!sym(col_name_x_y)) & 
+                                   abs(!!sym(col_name_x_y))>1,1,0)) %>%
+          mutate(label = ifelse(!is.na(!!sym(col_name_y_y)) &
+                                  abs(!!sym(col_name_y_y))>1,2,label)) %>%
+          mutate(label = ifelse(!is.na(!!sym(col_name_x_y)) &
+                        abs(!!sym(col_name_x_y))>1 &
+                          !is.na(!!sym(col_name_y_y))&
+                           abs(!!sym(col_name_y_y))>1 ,3,label)) %>% 
+           arrange(label)
+         
+          unique_label<-sort(unique(data_sub$label))
+          data_sub$label<-as.factor(data_sub$label)
+          levels(data_sub$label)<-label_list[unique_label+1]
+          
+          p<-ggplot(data_sub,aes(x=!!sym(pm[i,j]$labels$x),y=!!sym(pm[i,j]$labels$y),color=label))+
+               geom_point(size=2,shape=21,stroke=0.5,fill="white")+theme_bw()+scale_color_manual(values = color_custum[unique_label+1])+ scale_shape(solid = TRUE) +
+            scale_x_continuous(limits = c(-axis_limt, axis_limt)) +
+            scale_y_continuous(limits = c(-axis_limt, axis_limt)) +
+            geom_abline(slope = 0, intercept = -1, size=0.3, linetype='dashed') +
+            geom_abline(slope = 0, intercept =  1, size=0.3, linetype='dashed') +
+            geom_vline(xintercept = -1, size=0.3, linetype='dashed') +
+            geom_vline(xintercept = 1, size=0.3, linetype='dashed')
+          pm2[i,j]=p+geom_label(data = data.frame(xlabel = -axis_limt,
+                                          ylabel = max(axis_limt, na.rm = TRUE),
+                                          lab = round(corr, digits = 3)),
+                                mapping = ggplot2::aes(x = xlabel, y = ylabel, label = lab),
+                                hjust = 0, vjust = 1,size = corr_size, fontface = "bold",
+                          inherit.aes = FALSE # do not inherit anything from the ...
+                          ) 
+          
+          
+       }
+   }
+  return(pm2)
+}
+
+

UMI count table preparation

+
meta <- read.csv(file.path(input_data_dir,'meta.csv'),stringsAsFactors = F) %>% 
+  filter(plate_barcode %in% c("VH02001612", "VH02001614", "VH02001618"))
+Exp<-lapply(unique(meta$batch), function(x){
+  b<-unique(meta[meta$batch==x,]$plate_barcode)
+  res<-lapply(b, function(y){
+    load(file.path(input_data_dir, paste0("flowcell_4000_UMI_decode_",y,".RData")),verbose=T)
+    annot<-meta[meta$plate_barcode==y,]
+    rownames(annot) <- apply( annot[ , c('plate_barcode','well_index') ] , 1 , paste , collapse = "_" )
+    UMI_decode<-UMI_decode[!rownames(UMI_decode) %in% grep("ERCC-",rownames(UMI_decode),value = T),]
+    UMI_annot<-list(UMI_decode,annot)
+    names(UMI_annot)<-c("UMI.counts", "Annotation")
+    return(UMI_annot)
+  })
+  names(res)<-b
+  return(res)
+})
+
## Loading objects:
+##   UMI_decode
+## Loading objects:
+##   UMI_decode
+## Loading objects:
+##   UMI_decode
+
names(Exp)<-unique(meta$batch)
+
+save(Exp, file=file.path(out_data_dir,"Exp_Init.RData"),compress = TRUE)
+
check_threads <- function() {
+  cat("Hardware cores (logical/physical):",
+      parallel::detectCores(TRUE), "/", parallel::detectCores(FALSE), "\n")
+  cat("foreach backend/workers: ",
+      foreach::getDoParName(), "/", foreach::getDoParWorkers(), "\n", sep = "")
+  if (requireNamespace("future", quietly = TRUE)) {
+    cat("future plan/workers: ", capture.output(future::plan()), "\n", sep = "")
+    cat("future nbrOfWorkers: ", future::nbrOfWorkers(), "\n", sep = "")
+  }
+  if (requireNamespace("data.table", quietly = TRUE)) {
+    cat("data.table threads: ", data.table::getDTthreads(), "\n", sep = "")
+  }
+  if (requireNamespace("RhpcBLASctl", quietly = TRUE)) {
+    cat("BLAS threads: ", RhpcBLASctl::blas_get_num_procs(), "\n", sep = "")
+    cat("OMP max threads: ", RhpcBLASctl::omp_get_max_threads(), "\n", sep = "")
+  } else {
+    envs <- Sys.getenv(c("OPENBLAS_NUM_THREADS","MKL_NUM_THREADS","OMP_NUM_THREADS","VECLIB_MAXIMUM_THREADS"))
+    cat("BLAS/OMP env vars:\n"); print(envs[envs != ""])
+  }
+}
+check_threads()
+
## Hardware cores (logical/physical): 12 / 12 
+## foreach backend/workers: /1
+
## Registered S3 method overwritten by 'future':
+##   method               from      
+##   all.equal.connection parallelly
+
## future plan/workers: sequential:- args: function (..., gc = FALSE, earlySignal = FALSE, envir = parent.frame())- tweaked: FALSE- call: plan(sequential)FutureBackend to be launched
+## future nbrOfWorkers: 1
+## data.table threads: 6
+## BLAS threads: 1
+## OMP max threads: NA
+
+
+
+
+

2 Step1

+
+

2.1 run first 500 random run

+
step1_func <- function(n){
+  
+  CTRL="DMSO"
+  rand_runs=TRUE
+  
+  meta <- read.csv(file.path(input_data_dir,'meta.csv'),stringsAsFactors = F) %>%  
+    mutate(replicate=paste0('rep',plate_replicate),plate_id=paste0(plate_barcode,"_",well_index)) %>%
+    filter(plate_barcode %in% c("VH02001612", "VH02001614", "VH02001618"))
+  load(file.path(out_data_dir, 'Exp_Init.RData'),verbose = T)
+  
+  DMSO_sample <- meta %>% filter(treatment==CTRL) %>% select(batch,plate_barcode,plate_well) %>%
+    group_by(plate_barcode) %>% 
+    do(sample_n(.,1)) %>% 
+    mutate(Sample="RC_to_SA",treatment="RC_to_SA")  
+
+
+  meta <- meta %>% left_join(DMSO_sample,by=c('batch','plate_barcode','plate_well'),suffix=c("","_randR")) %>%
+    mutate(Sample=ifelse(is.na(Sample_randR),Sample,Sample_randR)) %>% 
+    mutate(treatment=ifelse(is.na(treatment_randR),treatment,treatment_randR))
+  
+  DE<-DE.limma_trend(Exp=Exp,CTRL=CTRL,meta=meta,random_run =rand_runs)
+}
+# DE_count_500 <- foreach(n=1:10,.packages=c("dplyr","limma","tidyr","foreach","edgeR"),.combine=rbind) %do% step1_func(n)
+
options(dplyr.summarise.inform = FALSE)
+# ---- helper to pretty print seconds ----
+fmt_secs <- function(x) {
+  if (is.na(x)) return("NA")
+  if (x < 60) return(sprintf("%.2fs", x))
+  if (x < 3600) return(sprintf("%dm %.1fs", floor(x/60), x %% 60))
+  sprintf("%dh %dm %.0fs", floor(x/3600), floor((x%%3600)/60), x%%60)
+}
+
+iters <- 1:50
+
+# ---- total wall time + per-iteration timings ----
+# total_tm <- system.time({
+#   # return a list of {data, iter, seconds} per iteration
+#   lst <- foreach(
+#     n = iters,
+#     .packages = c("dplyr","limma","tidyr","foreach","edgeR"),
+#     .combine  = "c"     # concatenate list-of-lists
+#   ) %do% {
+#     t0  <- proc.time()[["elapsed"]]
+#     res <- step1_func(n)
+#     dt  <- proc.time()[["elapsed"]] - t0
+#     list(list(data = res, iter = n, seconds = dt))
+#   }
+# })
+
+# extract results and timings
+# iter_times <- vapply(lst, function(x) x$seconds, numeric(1))
+# DE_count_500 <- dplyr::bind_rows(lapply(lst, function(x) x$data))
+ + + + + + + + + + + + + + + + + + + + + + + + +
# ---- RAM usage ----
+# remotes::install_github("tpq/peakRAM")
+library(peakRAM)
+res1 <- peakRAM({
+  DE_count_500 <- foreach(n = iters,
+                          .packages = c("dplyr","limma","tidyr","foreach","edgeR"),
+                          .combine  = rbind) %do% step1_func(n)
+})
+
+res2 <- peakRAM({
+  DE_count_500 <- foreach(n = iters,
+                          .packages = c("dplyr","limma","tidyr","foreach","edgeR"),
+                          .combine  = rbind) %do% step1_func(n)
+})
+
+res3 <- peakRAM({
+  DE_count_500 <- foreach(n = iters,
+                          .packages = c("dplyr","limma","tidyr","foreach","edgeR"),
+                          .combine  = rbind) %do% step1_func(n)
+})
+
res1
+
##                                                                                                             Function_Call
+## 1 {DE_count_500<-foreach(n=iters,.packages=c("dplyr","limma","tidyr","foreach","edgeR"),.combine=rbind)%do%step1_func(n)}
+##   Elapsed_Time_sec Total_RAM_Used_MiB Peak_RAM_Used_MiB
+## 1         1759.191               33.3            3220.6
+
res2
+
##                                                                                                             Function_Call
+## 1 {DE_count_500<-foreach(n=iters,.packages=c("dplyr","limma","tidyr","foreach","edgeR"),.combine=rbind)%do%step1_func(n)}
+##   Elapsed_Time_sec Total_RAM_Used_MiB Peak_RAM_Used_MiB
+## 1         1709.094                  0            2937.3
+
res3
+
##                                                                                                             Function_Call
+## 1 {DE_count_500<-foreach(n=iters,.packages=c("dplyr","limma","tidyr","foreach","edgeR"),.combine=rbind)%do%step1_func(n)}
+##   Elapsed_Time_sec Total_RAM_Used_MiB Peak_RAM_Used_MiB
+## 1         1695.753                  0            2937.4
+
if (!requireNamespace("ps", quietly = TRUE)) install.packages("ps")
+get_rss_mb <- function() ps::ps_memory_info(ps::ps_handle())[1] / 1024^2
+
+rss_before <- get_rss_mb()
+tm1 <- system.time({
+  DE_count_500 <- foreach(n = iters,
+                          .packages = c("dplyr","limma","tidyr","edgeR"),
+                          .combine  = rbind) %do% step1_func(n)
+})
+rss_after  <- get_rss_mb()
+cat(sprintf("Elapsed: %.1fs | RSS before→after: %.0f→%.0f MiB (Δ %.0f MiB)\n",
+            tm1["elapsed"], rss_before, rss_after, rss_after - rss_before))
+
tm1
+
##     user   system  elapsed 
+## 1539.299  130.819 1686.507
+
if (!requireNamespace("ps", quietly = TRUE)) install.packages("ps")
+get_rss_mb <- function() ps::ps_memory_info(ps::ps_handle())[1] / 1024^2
+
+rss_before <- get_rss_mb()
+tm2 <- system.time({
+  DE_count_500 <- foreach(n = iters,
+                          .packages = c("dplyr","limma","tidyr","edgeR"),
+                          .combine  = rbind) %do% step1_func(n)
+})
+rss_after  <- get_rss_mb()
+cat(sprintf("Elapsed: %.1fs | RSS before→after: %.0f→%.0f MiB (Δ %.0f MiB)\n",
+            tm2["elapsed"], rss_before, rss_after, rss_after - rss_before))
+
tm2
+
##     user   system  elapsed 
+## 1526.717  144.779 1697.510
+
if (!requireNamespace("ps", quietly = TRUE)) install.packages("ps")
+get_rss_mb <- function() ps::ps_memory_info(ps::ps_handle())[1] / 1024^2
+
+rss_before <- get_rss_mb()
+tm3 <- system.time({
+  DE_count_500 <- foreach(n = iters,
+                          .packages = c("dplyr","limma","tidyr","edgeR"),
+                          .combine  = rbind) %do% step1_func(n)
+})
+rss_after  <- get_rss_mb()
+cat(sprintf("Elapsed: %.1fs | RSS before→after: %.0f→%.0f MiB (Δ %.0f MiB)\n",
+            tm3["elapsed"], rss_before, rss_after, rss_after - rss_before))
+
tm3
+
##     user   system  elapsed 
+## 1533.484  131.026 1682.296
+
+
+ + + +
+
+ +
+ + + + + + + + + + + + + + + + diff --git a/vignettes/DRUGseq_first500runs.knit.log b/vignettes/DRUGseq_first500runs.knit.log new file mode 100644 index 0000000..c224f8b --- /dev/null +++ b/vignettes/DRUGseq_first500runs.knit.log @@ -0,0 +1,2 @@ +Error: pandoc version 1.12.3 or higher is required and was not found (see the help page ?rmarkdown::pandoc_available). +Execution halted diff --git a/vignettes/benchmark_DMSOruntime.Rmd b/vignettes/benchmark_DMSOruntime.Rmd new file mode 100644 index 0000000..148e58b --- /dev/null +++ b/vignettes/benchmark_DMSOruntime.Rmd @@ -0,0 +1,476 @@ +--- +title: "Benchmark: runtime and memory of DMSO selection" +output: + rmarkdown::html_document: + theme: flatly + toc: true + toc_float: true + toc_depth: 5 +vignette: > + %\VignetteIndexEntry{benchmarking DMSO runtime and memory} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} + %\VignetteBuild{true} +--- + +```{r set_wd, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +dir <- "/Users/liuxin/macpie_Dev/" +devtools::load_all(paste0(dir, "macpie/")) + +``` + + +```{r setup} + +suppressMessages(library(macpie)) +suppressMessages(library(tibble)) +suppressMessages(library(stringr)) +suppressMessages(library(pheatmap)) +suppressMessages(library(ggiraph)) +suppressMessages(library(tidyseurat)) +suppressMessages(library(purrr)) +suppressMessages(library(ggplot2)) +suppressMessages(library(peakRAM)) +suppressMessages(library(ps)) + +``` + + +```{r} +options(scipen=999, digits=3) +``` + + + +```{r} +input_data_dir <- file.path(paste0(dir,"/DRUGseqData/DRUG-seq-main/data")) +meta <- read.csv(file.path(input_data_dir,'meta.csv'),stringsAsFactors = F) %>% + filter(plate_barcode %in% c("VH02001612", "VH02001614", "VH02001618")) +Exp<-lapply(unique(meta$batch), function(x){ + b<-unique(meta[meta$batch==x,]$plate_barcode) + res<-lapply(b, function(y){ + load(file.path(input_data_dir, paste0("flowcell_4000_UMI_decode_",y,".RData")),verbose=T) + annot<-meta[meta$plate_barcode==y,] + rownames(annot) <- apply( annot[ , c('plate_barcode','well_index') ] , 1 , paste , collapse = "_" ) + UMI_decode<-UMI_decode[!rownames(UMI_decode) %in% grep("ERCC-",rownames(UMI_decode),value = T),] + UMI_annot<-list(UMI_decode,annot) + names(UMI_annot)<-c("UMI.counts", "Annotation") + return(UMI_annot) + }) + names(res)<-b + return(res) +}) +names(Exp)<-unique(meta$batch) +batch3 <- Exp$`3` +``` + +```{r} +metadata <- batch3 %>% + map_dfr(~ { + .x$Annotation %>% + mutate( + Plate_ID = plate_barcode, + Well_ID = plate_well, + Barcode = paste0(plate_barcode, "_", well_index), + Row = substr(plate_well, 1, 1), + Column = as.integer(substr(plate_well, 2, nchar(plate_well))), + Species = "human", + Cell_type = "U2OS", + Model_type = "2D_adherent", + Time = "24", + Unit = "h", + Treatment_1 = treatment, + Concentration_1 = treatment_dose, + Unit_1 = treatment_dose_unit, + Sample_type = ifelse(treatment == "DMSO", "Control", "Treated") + + ) + }) + + +metadata <- metadata%>%select(-c(plate_barcode, index, plate_replicate, plate_well, + step, readout.comment, well_index, cell.line.name, + batch, treatment, treatment_dose, treatment_dose_unit, timepoint)) + + +batch3_counts <- batch3 %>% + map(~ { + .x$UMI.counts %>% + as.data.frame() %>% + rownames_to_column("gene_id") %>% + separate(col = gene_id, into = c("gene_name", "chrom"), sep = ",") %>% + mutate(gene_name = make.unique(gene_name)) %>% + select(-chrom) %>% + tibble::column_to_rownames(var = "gene_name") %>% + as.matrix() + }) + +binded_counts <- do.call(cbind, batch3_counts) + +as_mac <- CreateSeuratObject(counts = binded_counts, + min.cells = 1, + min.features = 1) + +as_mac<- as_mac%>% inner_join(metadata, by = c(".cell"="Barcode")) +as_mac$combined_id <- paste0(as_mac$Treatment_1, "_", as_mac$Concentration_1) +batch3_mac_filtered <- filter_genes_by_expression(as_mac, + group_by = "combined_id", + min_counts = 5, + min_samples = 1) + +``` + +```{r} +select_top_replicates_tmmcpm <- function( + data, + combined_id, # e.g. "CB_43_EP73_0" + orig_ident, # e.g. "VH02012942" + cpm_filter = 1, # CPM threshold for gene filtering + min_samps = 16, # number of samples a gene must be expressed in + corr_method = c("spearman","pearson"), + top_n = 5, + make_plots = TRUE +){ + + corr_method <- match.arg(corr_method) + + # Subset to the plate/condition of interest + subgroup <- subset(data, subset = combined_id == !!combined_id & orig.ident == !!orig_ident) + + # Counts and human-friendly column names + counts_d <- GetAssayData(subgroup, assay = "RNA", layer = "counts") + well_colnames <- paste0(subgroup$orig.ident, "_", subgroup$Well_ID) + names(well_colnames) <- rownames(subgroup@meta.data) + colnames(counts_d) <- well_colnames[colnames(counts_d)] + + # edgeR container + gene filtering + y <- edgeR::DGEList(counts_d, group = subgroup$orig.ident) + keep <- rowSums(edgeR::cpm(y) > cpm_filter) >= min_samps + y <- y[keep, , keep.lib.sizes = FALSE] + + # TMMwsp → log2-CPM + y <- edgeR::calcNormFactors(y, method = "TMMwsp") + log_cpm_tmm <- edgeR::cpm(y, log = TRUE, normalized.lib.sizes = TRUE) + + # Long data for boxplot + df_long <- as.data.frame(log_cpm_tmm) |> + tibble::rownames_to_column(var = "gene") |> + tidyr::pivot_longer(-gene, names_to = "sample", values_to = "log_cpm") + + if (make_plots) { + # Boxplot + print( + ggplot2::ggplot(df_long, ggplot2::aes(x = sample, y = log_cpm)) + + ggplot2::geom_boxplot(outlier.size = 0.5) + + ggplot2::labs(x = "Sample", y = "log2 CPM", + title = "Boxplot of log2-CPM (TMMwsp)") + + ggplot2::theme_classic() + + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)) + ) + } + + # Correlation matrices + cors_pear <- stats::cor(log_cpm_tmm, method = "pearson") + cors_spear <- stats::cor(log_cpm_tmm, method = "spearman") + + if (make_plots) { + pheatmap::pheatmap(cors_pear, main = "Sample–sample correlation (Pearson, log2-CPM)") + pheatmap::pheatmap(cors_spear, main = "Sample–sample correlation (Spearman, log2-CPM)") + } + + # Ranking by mean Fisher-z correlation to all *other* samples + R <- stats::cor(log_cpm_tmm, method = corr_method, use = "pairwise.complete.obs") + diag(R) <- NA_real_ + + # clip, Fisher z, average, back-transform + Z <- atanh(pmin(pmax(R, -0.999999), 0.999999)) + score_z <- rowMeans(Z, na.rm = TRUE) + score_r <- tanh(score_z) + + # top-N names and scores + ord <- order(score_r, decreasing = TRUE, na.last = NA) + srt <- score_r[ord] + cutoff <- srt[top_n] # N-th best score + keep <- srt >= cutoff # keep all tied at the cutoff + topN <- srt[keep] + + # return everything useful + list( + subset_obj = subgroup, + dge = y, + log_cpm_tmm = log_cpm_tmm, + boxplot_df = df_long, + cor_pearson = cors_pear, + cor_spearman = cors_spear, + ranking_method = corr_method, + scores_mean_to_others = sort(score_r, decreasing = TRUE), + topN = topN + ) +} + + +``` + + +```{r} +res1 <- peakRAM({ +dmso_VH02001612 <- select_top_replicates_tmmcpm( + batch3_mac_filtered, + combined_id = "DMSO_0", + orig_ident = "VH02001612", + cpm_filter = 5, + min_samps = 8, + corr_method = "spearman", + top_n = 5, + make_plots = FALSE + ) + dmso_VH02001614 <- select_top_replicates_tmmcpm( + batch3_mac_filtered, + combined_id = "DMSO_0", + orig_ident = "VH02001614", + cpm_filter = 5, + min_samps = 8, + corr_method = "spearman", + top_n = 5, + make_plots = FALSE + ) + + dmso_VH02001618 <- select_top_replicates_tmmcpm( + batch3_mac_filtered, + combined_id = "DMSO_0", + orig_ident = "VH02001618", + cpm_filter = 5, + min_samps = 8, + corr_method = "spearman", + top_n = 5, + make_plots = FALSE + ) + } +) + +res2 <- peakRAM({ +dmso_VH02001612 <- select_top_replicates_tmmcpm( + batch3_mac_filtered, + combined_id = "DMSO_0", + orig_ident = "VH02001612", + cpm_filter = 5, + min_samps = 8, + corr_method = "spearman", + top_n = 5, + make_plots = FALSE + ) + dmso_VH02001614 <- select_top_replicates_tmmcpm( + batch3_mac_filtered, + combined_id = "DMSO_0", + orig_ident = "VH02001614", + cpm_filter = 5, + min_samps = 8, + corr_method = "spearman", + top_n = 5, + make_plots = FALSE + ) + + dmso_VH02001618 <- select_top_replicates_tmmcpm( + batch3_mac_filtered, + combined_id = "DMSO_0", + orig_ident = "VH02001618", + cpm_filter = 5, + min_samps = 8, + corr_method = "spearman", + top_n = 5, + make_plots = FALSE + ) + } +) + +res3 <- peakRAM({ +dmso_VH02001612 <- select_top_replicates_tmmcpm( + batch3_mac_filtered, + combined_id = "DMSO_0", + orig_ident = "VH02001612", + cpm_filter = 5, + min_samps = 8, + corr_method = "spearman", + top_n = 5, + make_plots = FALSE + ) + dmso_VH02001614 <- select_top_replicates_tmmcpm( + batch3_mac_filtered, + combined_id = "DMSO_0", + orig_ident = "VH02001614", + cpm_filter = 5, + min_samps = 8, + corr_method = "spearman", + top_n = 5, + make_plots = FALSE + ) + + dmso_VH02001618 <- select_top_replicates_tmmcpm( + batch3_mac_filtered, + combined_id = "DMSO_0", + orig_ident = "VH02001618", + cpm_filter = 5, + min_samps = 8, + corr_method = "spearman", + top_n = 5, + make_plots = FALSE + ) + } +) + + +``` + + +```{r} +res1 + +res2 + +res3 +``` + +```{r} + +if (!requireNamespace("ps", quietly = TRUE)) install.packages("ps") +get_rss_mb <- function() ps::ps_memory_info(ps::ps_handle())[1] / 1024^2 + +rss_before <- get_rss_mb() +tm1 <- system.time({ + dmso_VH02001612 <- select_top_replicates_tmmcpm( + batch3_mac_filtered, + combined_id = "DMSO_0", + orig_ident = "VH02001612", + cpm_filter = 5, + min_samps = 8, + corr_method = "spearman", + top_n = 5, + make_plots = FALSE + ) + dmso_VH02001614 <- select_top_replicates_tmmcpm( + batch3_mac_filtered, + combined_id = "DMSO_0", + orig_ident = "VH02001614", + cpm_filter = 5, + min_samps = 8, + corr_method = "spearman", + top_n = 5, + make_plots = FALSE + ) + + dmso_VH02001618 <- select_top_replicates_tmmcpm( + batch3_mac_filtered, + combined_id = "DMSO_0", + orig_ident = "VH02001618", + cpm_filter = 5, + min_samps = 8, + corr_method = "spearman", + top_n = 5, + make_plots = FALSE + ) +}) +rss_after <- get_rss_mb() +cat(sprintf("Elapsed: %.1fs | RSS before→after: %.0f→%.0f MiB (Δ %.0f MiB)\n", + tm1["elapsed"], rss_before, rss_after, rss_after - rss_before)) + + +get_rss_mb <- function() ps::ps_memory_info(ps::ps_handle())[1] / 1024^2 + +rss_before <- get_rss_mb() +tm2 <- system.time({ + dmso_VH02001612 <- select_top_replicates_tmmcpm( + batch3_mac_filtered, + combined_id = "DMSO_0", + orig_ident = "VH02001612", + cpm_filter = 5, + min_samps = 8, + corr_method = "spearman", + top_n = 5, + make_plots = FALSE + ) + dmso_VH02001614 <- select_top_replicates_tmmcpm( + batch3_mac_filtered, + combined_id = "DMSO_0", + orig_ident = "VH02001614", + cpm_filter = 5, + min_samps = 8, + corr_method = "spearman", + top_n = 5, + make_plots = FALSE + ) + + dmso_VH02001618 <- select_top_replicates_tmmcpm( + batch3_mac_filtered, + combined_id = "DMSO_0", + orig_ident = "VH02001618", + cpm_filter = 5, + min_samps = 8, + corr_method = "spearman", + top_n = 5, + make_plots = FALSE + ) +}) +rss_after <- get_rss_mb() +cat(sprintf("Elapsed: %.1fs | RSS before→after: %.0f→%.0f MiB (Δ %.0f MiB)\n", + tm2["elapsed"], rss_before, rss_after, rss_after - rss_before)) + + +get_rss_mb <- function() ps::ps_memory_info(ps::ps_handle())[1] / 1024^2 + +rss_before <- get_rss_mb() +tm3 <- system.time({ + dmso_VH02001612 <- select_top_replicates_tmmcpm( + batch3_mac_filtered, + combined_id = "DMSO_0", + orig_ident = "VH02001612", + cpm_filter = 5, + min_samps = 8, + corr_method = "spearman", + top_n = 5, + make_plots = FALSE + ) + dmso_VH02001614 <- select_top_replicates_tmmcpm( + batch3_mac_filtered, + combined_id = "DMSO_0", + orig_ident = "VH02001614", + cpm_filter = 5, + min_samps = 8, + corr_method = "spearman", + top_n = 5, + make_plots = FALSE + ) + + dmso_VH02001618 <- select_top_replicates_tmmcpm( + batch3_mac_filtered, + combined_id = "DMSO_0", + orig_ident = "VH02001618", + cpm_filter = 5, + min_samps = 8, + corr_method = "spearman", + top_n = 5, + make_plots = FALSE + ) +}) +rss_after <- get_rss_mb() +cat(sprintf("Elapsed: %.1fs | RSS before→after: %.0f→%.0f MiB (Δ %.0f MiB)\n", + tm3["elapsed"], rss_before, rss_after, rss_after - rss_before)) + + + + +``` + + +```{r} +tm1 + +tm2 + +tm3 +``` + + +### Summary of runtime and memory usage + +The `select_top_replicates_tmmcpm` function runs on same plates (VH02001612, VH02001614, VH02001618) in approximately 2.463 seconds for three plates, with a peak memory usage of around 2403 Mib. This demonstrates that the function is efficient and can be used effectively for selecting DMSO controls in a reasonable timeframe without excessive memory consumption. \ No newline at end of file diff --git a/vignettes/benchmark_DMSOruntime.html b/vignettes/benchmark_DMSOruntime.html new file mode 100644 index 0000000..cff298d --- /dev/null +++ b/vignettes/benchmark_DMSOruntime.html @@ -0,0 +1,2016 @@ + + + + + + + + + + + + + +Benchmark: runtime and memory of DMSO selection + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+
+
+
+
+ +
+ + + + + + + +
suppressMessages(library(macpie))
+suppressMessages(library(tibble))
+suppressMessages(library(stringr))
+suppressMessages(library(pheatmap))
+suppressMessages(library(ggiraph))
+suppressMessages(library(tidyseurat))
+suppressMessages(library(purrr))
+suppressMessages(library(ggplot2))
+suppressMessages(library(peakRAM))
+suppressMessages(library(ps))
+
options(scipen=999, digits=3)
+
input_data_dir <- file.path(paste0(dir,"/DRUGseqData/DRUG-seq-main/data"))
+meta <- read.csv(file.path(input_data_dir,'meta.csv'),stringsAsFactors = F) %>% 
+  filter(plate_barcode %in% c("VH02001612", "VH02001614", "VH02001618"))
+Exp<-lapply(unique(meta$batch), function(x){
+  b<-unique(meta[meta$batch==x,]$plate_barcode)
+  res<-lapply(b, function(y){
+    load(file.path(input_data_dir, paste0("flowcell_4000_UMI_decode_",y,".RData")),verbose=T)
+    annot<-meta[meta$plate_barcode==y,]
+    rownames(annot) <- apply( annot[ , c('plate_barcode','well_index') ] , 1 , paste , collapse = "_" )
+    UMI_decode<-UMI_decode[!rownames(UMI_decode) %in% grep("ERCC-",rownames(UMI_decode),value = T),]
+    UMI_annot<-list(UMI_decode,annot)
+    names(UMI_annot)<-c("UMI.counts", "Annotation")
+    return(UMI_annot)
+  })
+  names(res)<-b
+  return(res)
+})
+
## Loading objects:
+##   UMI_decode
+## Loading objects:
+##   UMI_decode
+## Loading objects:
+##   UMI_decode
+
names(Exp)<-unique(meta$batch)
+batch3 <- Exp$`3`
+
metadata <- batch3 %>% 
+  map_dfr(~ {
+    .x$Annotation %>%
+      mutate(
+        Plate_ID        = plate_barcode,
+        Well_ID         = plate_well,
+        Barcode         = paste0(plate_barcode, "_", well_index),
+        Row             = substr(plate_well, 1, 1),
+        Column          = as.integer(substr(plate_well, 2, nchar(plate_well))),
+        Species         = "human",
+        Cell_type       = "U2OS",
+        Model_type      = "2D_adherent",
+        Time            = "24",
+        Unit            = "h",
+        Treatment_1     = treatment,
+        Concentration_1 = treatment_dose,
+        Unit_1          = treatment_dose_unit,
+        Sample_type         = ifelse(treatment == "DMSO", "Control", "Treated")
+
+      )
+  })
+
+
+metadata <- metadata%>%select(-c(plate_barcode, index, plate_replicate, plate_well,
+                                 step, readout.comment, well_index, cell.line.name, 
+                                 batch, treatment, treatment_dose, treatment_dose_unit, timepoint))
+
+
+batch3_counts <- batch3 %>%
+  map(~ {
+    .x$UMI.counts %>%
+      as.data.frame() %>% 
+      rownames_to_column("gene_id") %>%
+      separate(col = gene_id, into = c("gene_name", "chrom"), sep = ",") %>%
+      mutate(gene_name = make.unique(gene_name)) %>%
+      select(-chrom) %>%
+      tibble::column_to_rownames(var = "gene_name") %>%
+      as.matrix()
+  })
+
+binded_counts <- do.call(cbind, batch3_counts)
+
+as_mac <- CreateSeuratObject(counts = binded_counts, 
+  min.cells = 1, 
+  min.features = 1)
+
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
+## ('-')
+
## Warning: Data is of class matrix. Coercing to dgCMatrix.
+
as_mac<- as_mac%>% inner_join(metadata, by = c(".cell"="Barcode"))
+as_mac$combined_id <- paste0(as_mac$Treatment_1, "_", as_mac$Concentration_1)
+batch3_mac_filtered <- filter_genes_by_expression(as_mac,
+                                     group_by = "combined_id",
+                                     min_counts = 5,
+                                     min_samples = 1)
+
select_top_replicates_tmmcpm <- function(
+  data,
+  combined_id,                 # e.g. "CB_43_EP73_0"
+  orig_ident,                  # e.g. "VH02012942"
+  cpm_filter    = 1,           # CPM threshold for gene filtering
+  min_samps     = 16,          # number of samples a gene must be expressed in
+  corr_method   = c("spearman","pearson"),
+  top_n         = 5,
+  make_plots    = TRUE
+){
+  
+  corr_method <- match.arg(corr_method)
+
+  # Subset to the plate/condition of interest
+  subgroup <- subset(data, subset = combined_id == !!combined_id & orig.ident == !!orig_ident)
+
+  # Counts and human-friendly column names
+  counts_d <- GetAssayData(subgroup, assay = "RNA", layer = "counts")
+  well_colnames <- paste0(subgroup$orig.ident, "_", subgroup$Well_ID)
+  names(well_colnames) <- rownames(subgroup@meta.data)
+  colnames(counts_d) <- well_colnames[colnames(counts_d)]
+
+  # edgeR container + gene filtering
+  y <- edgeR::DGEList(counts_d, group = subgroup$orig.ident)
+  keep <- rowSums(edgeR::cpm(y) > cpm_filter) >= min_samps
+  y <- y[keep, , keep.lib.sizes = FALSE]
+
+  # TMMwsp → log2-CPM
+  y <- edgeR::calcNormFactors(y, method = "TMMwsp")
+  log_cpm_tmm <- edgeR::cpm(y, log = TRUE, normalized.lib.sizes = TRUE)
+
+  #  Long data for boxplot
+  df_long <- as.data.frame(log_cpm_tmm) |>
+    tibble::rownames_to_column(var = "gene") |>
+    tidyr::pivot_longer(-gene, names_to = "sample", values_to = "log_cpm")
+
+  if (make_plots) {
+    # Boxplot
+    print(
+      ggplot2::ggplot(df_long, ggplot2::aes(x = sample, y = log_cpm)) +
+        ggplot2::geom_boxplot(outlier.size = 0.5) +
+        ggplot2::labs(x = "Sample", y = "log2 CPM",
+                      title = "Boxplot of log2-CPM (TMMwsp)") +
+        ggplot2::theme_classic() +
+        ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))
+    )
+  }
+
+  # Correlation matrices
+  cors_pear  <- stats::cor(log_cpm_tmm, method = "pearson")
+  cors_spear <- stats::cor(log_cpm_tmm, method = "spearman")
+
+  if (make_plots) {
+    pheatmap::pheatmap(cors_pear,  main = "Sample–sample correlation (Pearson, log2-CPM)")
+    pheatmap::pheatmap(cors_spear, main = "Sample–sample correlation (Spearman, log2-CPM)")
+  }
+
+  # Ranking by mean Fisher-z correlation to all *other* samples
+  R <- stats::cor(log_cpm_tmm, method = corr_method, use = "pairwise.complete.obs")
+  diag(R) <- NA_real_
+
+  # clip, Fisher z, average, back-transform
+  Z <- atanh(pmin(pmax(R, -0.999999), 0.999999))
+  score_z <- rowMeans(Z, na.rm = TRUE)
+  score_r <- tanh(score_z)
+
+  # top-N names and scores
+  ord  <- order(score_r, decreasing = TRUE, na.last = NA)
+  srt <- score_r[ord]
+  cutoff <- srt[top_n]              # N-th best score
+  keep   <- srt >= cutoff    # keep all tied at the cutoff
+  topN <- srt[keep]
+
+  # return everything useful
+  list(
+    subset_obj   = subgroup,
+    dge          = y,
+    log_cpm_tmm  = log_cpm_tmm,
+    boxplot_df   = df_long,
+    cor_pearson  = cors_pear,
+    cor_spearman = cors_spear,
+    ranking_method = corr_method,
+    scores_mean_to_others = sort(score_r, decreasing = TRUE),
+    topN = topN
+  )
+}
+
res1 <- peakRAM({
+dmso_VH02001612 <- select_top_replicates_tmmcpm(
+    batch3_mac_filtered,
+    combined_id = "DMSO_0",
+    orig_ident  = "VH02001612",
+    cpm_filter  = 5,
+    min_samps   = 8,
+    corr_method = "spearman",
+    top_n       = 5,
+    make_plots  = FALSE
+  )
+  dmso_VH02001614 <- select_top_replicates_tmmcpm(
+    batch3_mac_filtered,
+    combined_id = "DMSO_0",
+    orig_ident  = "VH02001614",
+    cpm_filter  = 5,
+    min_samps   = 8,
+    corr_method = "spearman",
+    top_n       = 5,
+    make_plots  = FALSE
+  )
+  
+  dmso_VH02001618 <- select_top_replicates_tmmcpm(
+    batch3_mac_filtered,
+    combined_id = "DMSO_0",
+    orig_ident  = "VH02001618",
+    cpm_filter  = 5,
+    min_samps   = 8,
+    corr_method = "spearman",
+    top_n       = 5,
+    make_plots  = FALSE
+  )
+  }
+)
+
+res2 <- peakRAM({
+dmso_VH02001612 <- select_top_replicates_tmmcpm(
+    batch3_mac_filtered,
+    combined_id = "DMSO_0",
+    orig_ident  = "VH02001612",
+    cpm_filter  = 5,
+    min_samps   = 8,
+    corr_method = "spearman",
+    top_n       = 5,
+    make_plots  = FALSE
+  )
+  dmso_VH02001614 <- select_top_replicates_tmmcpm(
+    batch3_mac_filtered,
+    combined_id = "DMSO_0",
+    orig_ident  = "VH02001614",
+    cpm_filter  = 5,
+    min_samps   = 8,
+    corr_method = "spearman",
+    top_n       = 5,
+    make_plots  = FALSE
+  )
+  
+  dmso_VH02001618 <- select_top_replicates_tmmcpm(
+    batch3_mac_filtered,
+    combined_id = "DMSO_0",
+    orig_ident  = "VH02001618",
+    cpm_filter  = 5,
+    min_samps   = 8,
+    corr_method = "spearman",
+    top_n       = 5,
+    make_plots  = FALSE
+  )
+  }
+)
+
+res3 <- peakRAM({
+dmso_VH02001612 <- select_top_replicates_tmmcpm(
+    batch3_mac_filtered,
+    combined_id = "DMSO_0",
+    orig_ident  = "VH02001612",
+    cpm_filter  = 5,
+    min_samps   = 8,
+    corr_method = "spearman",
+    top_n       = 5,
+    make_plots  = FALSE
+  )
+  dmso_VH02001614 <- select_top_replicates_tmmcpm(
+    batch3_mac_filtered,
+    combined_id = "DMSO_0",
+    orig_ident  = "VH02001614",
+    cpm_filter  = 5,
+    min_samps   = 8,
+    corr_method = "spearman",
+    top_n       = 5,
+    make_plots  = FALSE
+  )
+  
+  dmso_VH02001618 <- select_top_replicates_tmmcpm(
+    batch3_mac_filtered,
+    combined_id = "DMSO_0",
+    orig_ident  = "VH02001618",
+    cpm_filter  = 5,
+    min_samps   = 8,
+    corr_method = "spearman",
+    top_n       = 5,
+    make_plots  = FALSE
+  )
+  }
+)
+
res1
+
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                Function_Call
+## 1 {dmso_VH02001612<-select_top_replicates_tmmcpm(batch3_mac_filtered,combined_id="DMSO_0",orig_ident="VH02001612",cpm_filter=5,min_samps=8,corr_method="spearman",top_n=5,make_plots=FALSE)dmso_VH02001614<-select_top_replicates_tmmcpm(batch3_mac_filtered,combined_id="DMSO_0",orig_ident="VH02001614",cpm_filter=5,min_samps=8,corr_method="spearman",top_n=5,make_plots=FALSE)dmso_VH02001618<-select_top_replicates_tmmcpm(batch3_mac_filtered,combined_id="DMSO_0",orig_ident="VH02001618",cpm_filter=5,min_samps=8,corr_method="spearman",top_n=5,make_plots=FALSE)}
+##   Elapsed_Time_sec Total_RAM_Used_MiB Peak_RAM_Used_MiB
+## 1             2.52               37.3              2406
+
res2
+
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                Function_Call
+## 1 {dmso_VH02001612<-select_top_replicates_tmmcpm(batch3_mac_filtered,combined_id="DMSO_0",orig_ident="VH02001612",cpm_filter=5,min_samps=8,corr_method="spearman",top_n=5,make_plots=FALSE)dmso_VH02001614<-select_top_replicates_tmmcpm(batch3_mac_filtered,combined_id="DMSO_0",orig_ident="VH02001614",cpm_filter=5,min_samps=8,corr_method="spearman",top_n=5,make_plots=FALSE)dmso_VH02001618<-select_top_replicates_tmmcpm(batch3_mac_filtered,combined_id="DMSO_0",orig_ident="VH02001618",cpm_filter=5,min_samps=8,corr_method="spearman",top_n=5,make_plots=FALSE)}
+##   Elapsed_Time_sec Total_RAM_Used_MiB Peak_RAM_Used_MiB
+## 1              2.3                  0              2403
+
res3
+
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                Function_Call
+## 1 {dmso_VH02001612<-select_top_replicates_tmmcpm(batch3_mac_filtered,combined_id="DMSO_0",orig_ident="VH02001612",cpm_filter=5,min_samps=8,corr_method="spearman",top_n=5,make_plots=FALSE)dmso_VH02001614<-select_top_replicates_tmmcpm(batch3_mac_filtered,combined_id="DMSO_0",orig_ident="VH02001614",cpm_filter=5,min_samps=8,corr_method="spearman",top_n=5,make_plots=FALSE)dmso_VH02001618<-select_top_replicates_tmmcpm(batch3_mac_filtered,combined_id="DMSO_0",orig_ident="VH02001618",cpm_filter=5,min_samps=8,corr_method="spearman",top_n=5,make_plots=FALSE)}
+##   Elapsed_Time_sec Total_RAM_Used_MiB Peak_RAM_Used_MiB
+## 1              2.3                  0              2403
+
if (!requireNamespace("ps", quietly = TRUE)) install.packages("ps")
+get_rss_mb <- function() ps::ps_memory_info(ps::ps_handle())[1] / 1024^2
+
+rss_before <- get_rss_mb()
+tm1 <- system.time({
+  dmso_VH02001612 <- select_top_replicates_tmmcpm(
+    batch3_mac_filtered,
+    combined_id = "DMSO_0",
+    orig_ident  = "VH02001612",
+    cpm_filter  = 5,
+    min_samps   = 8,
+    corr_method = "spearman",
+    top_n       = 5,
+    make_plots  = FALSE
+  )
+  dmso_VH02001614 <- select_top_replicates_tmmcpm(
+    batch3_mac_filtered,
+    combined_id = "DMSO_0",
+    orig_ident  = "VH02001614",
+    cpm_filter  = 5,
+    min_samps   = 8,
+    corr_method = "spearman",
+    top_n       = 5,
+    make_plots  = FALSE
+  )
+  
+  dmso_VH02001618 <- select_top_replicates_tmmcpm(
+    batch3_mac_filtered,
+    combined_id = "DMSO_0",
+    orig_ident  = "VH02001618",
+    cpm_filter  = 5,
+    min_samps   = 8,
+    corr_method = "spearman",
+    top_n       = 5,
+    make_plots  = FALSE
+  )
+})
+rss_after  <- get_rss_mb()
+cat(sprintf("Elapsed: %.1fs | RSS before→after: %.0f→%.0f MiB (Δ %.0f MiB)\n",
+            tm1["elapsed"], rss_before, rss_after, rss_after - rss_before))
+
## Elapsed: 2.3s | RSS before→after: 3168→4695 MiB (Δ 1528 MiB)
+
get_rss_mb <- function() ps::ps_memory_info(ps::ps_handle())[1] / 1024^2
+
+rss_before <- get_rss_mb()
+tm2 <- system.time({
+  dmso_VH02001612 <- select_top_replicates_tmmcpm(
+    batch3_mac_filtered,
+    combined_id = "DMSO_0",
+    orig_ident  = "VH02001612",
+    cpm_filter  = 5,
+    min_samps   = 8,
+    corr_method = "spearman",
+    top_n       = 5,
+    make_plots  = FALSE
+  )
+  dmso_VH02001614 <- select_top_replicates_tmmcpm(
+    batch3_mac_filtered,
+    combined_id = "DMSO_0",
+    orig_ident  = "VH02001614",
+    cpm_filter  = 5,
+    min_samps   = 8,
+    corr_method = "spearman",
+    top_n       = 5,
+    make_plots  = FALSE
+  )
+  
+  dmso_VH02001618 <- select_top_replicates_tmmcpm(
+    batch3_mac_filtered,
+    combined_id = "DMSO_0",
+    orig_ident  = "VH02001618",
+    cpm_filter  = 5,
+    min_samps   = 8,
+    corr_method = "spearman",
+    top_n       = 5,
+    make_plots  = FALSE
+  )
+})
+rss_after  <- get_rss_mb()
+cat(sprintf("Elapsed: %.1fs | RSS before→after: %.0f→%.0f MiB (Δ %.0f MiB)\n",
+            tm2["elapsed"], rss_before, rss_after, rss_after - rss_before))
+
## Elapsed: 2.3s | RSS before→after: 4694→4756 MiB (Δ 62 MiB)
+
get_rss_mb <- function() ps::ps_memory_info(ps::ps_handle())[1] / 1024^2
+
+rss_before <- get_rss_mb()
+tm3 <- system.time({
+  dmso_VH02001612 <- select_top_replicates_tmmcpm(
+    batch3_mac_filtered,
+    combined_id = "DMSO_0",
+    orig_ident  = "VH02001612",
+    cpm_filter  = 5,
+    min_samps   = 8,
+    corr_method = "spearman",
+    top_n       = 5,
+    make_plots  = FALSE
+  )
+  dmso_VH02001614 <- select_top_replicates_tmmcpm(
+    batch3_mac_filtered,
+    combined_id = "DMSO_0",
+    orig_ident  = "VH02001614",
+    cpm_filter  = 5,
+    min_samps   = 8,
+    corr_method = "spearman",
+    top_n       = 5,
+    make_plots  = FALSE
+  )
+  
+  dmso_VH02001618 <- select_top_replicates_tmmcpm(
+    batch3_mac_filtered,
+    combined_id = "DMSO_0",
+    orig_ident  = "VH02001618",
+    cpm_filter  = 5,
+    min_samps   = 8,
+    corr_method = "spearman",
+    top_n       = 5,
+    make_plots  = FALSE
+  )
+})
+rss_after  <- get_rss_mb()
+cat(sprintf("Elapsed: %.1fs | RSS before→after: %.0f→%.0f MiB (Δ %.0f MiB)\n",
+            tm3["elapsed"], rss_before, rss_after, rss_after - rss_before))
+
## Elapsed: 2.3s | RSS before→after: 4756→4514 MiB (Δ -242 MiB)
+
tm1
+
##    user  system elapsed 
+##   2.208   0.119   2.328
+
tm2
+
##    user  system elapsed 
+##   2.211   0.129   2.341
+
tm3
+
##    user  system elapsed 
+##   2.199   0.123   2.346
+
+

Summary of runtime and memory usage

+

The select_top_replicates_tmmcpm function runs on same +plates (VH02001612, VH02001614, VH02001618) in approximately 2.463 +seconds for three plates, with a peak memory usage of around 2403 Mib. +This demonstrates that the function is efficient and can be used +effectively for selecting DMSO controls in a reasonable timeframe +without excessive memory consumption.

+
+ + + +
+
+ +
+ + + + + + + + + + + + + + + + diff --git a/vignettes/benchmark_DMSOselection.Rmd b/vignettes/benchmark_DMSOselection.Rmd new file mode 100644 index 0000000..287874c --- /dev/null +++ b/vignettes/benchmark_DMSOselection.Rmd @@ -0,0 +1,422 @@ +--- +title: "Benchmark: DMSO control selection" +output: + rmarkdown::html_document: + theme: flatly + toc: true + toc_float: true + toc_depth: 5 +vignette: > + %\VignetteIndexEntry{benchmarking DMSO control selection} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} + %\VignetteBuild{true} +--- + +```{r set_wd, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +dir <- "/Users/liuxin/macpie_Dev/" +devtools::load_all(paste0(dir, "macpie/")) + +``` + + +```{r setup} + +suppressMessages(library(macpie)) +suppressMessages(library(tibble)) +suppressMessages(library(stringr)) +suppressMessages(library(pheatmap)) +suppressMessages(library(ggiraph)) +suppressMessages(library(tidyseurat)) +suppressMessages(library(purrr)) +suppressMessages(library(ggplot2)) + +``` + +```{r} +options(scipen=999, digits=3) +``` + + + +## DRUGseq DMSO control selection + + +First we load the file of good DMSO control wells identified by the DRUGseq authors in their github repo. + +Here, we show the good DMSO wells for batch 24, which contains the three plates we are interested in (VH02012942, VH02012944, VH02012956). + +```{r} +good_DMSO_DRUGseq <- read.csv(paste0(dir, "DRUGseqData/robust_RC_ReferenceControl_DMSO_wells.txt"), sep="") +good_DMSO_DRUGseq %>% filter(batch_id==24) %>% select(batch_id, plate_barcode, well_id) +``` + +As from their results of good DMSO controls, these are the good DMSO wells for batch 24: + + VH02012942: I23, M23 + + VH02012944: D23, H23 + + VH02012956: I23, J23 + + +*** +
+ + + +## macpie DMSO control selection + + +Now we will use a function `select_top_replicates_tmmcpm` to identify good DMSO controls from batch 24. + +This function: + + - applies CPM filtering, + + - performs TMMwsp normalisation and computes log-CPM, + + - ranks wells by their mean Fisher-z–transformed sample–sample correlation to all other wells (Pearson or Spearman) + + - selects the top n wells (user-defined) as good controls. + +```{r} +select_top_replicates_tmmcpm <- function( + data, + combined_id, # e.g. "CB_43_EP73_0" + orig_ident, # e.g. "VH02012942" + cpm_filter = 1, # CPM threshold for gene filtering + min_samps = 16, # number of samples a gene must be expressed in + corr_method = c("spearman","pearson"), + top_n = 5, + make_plots = TRUE +){ + + corr_method <- match.arg(corr_method) + + # Subset to the plate/condition of interest + subgroup <- subset(data, subset = combined_id == !!combined_id & orig.ident == !!orig_ident) + + # Counts and human-friendly column names + counts_d <- GetAssayData(subgroup, assay = "RNA", layer = "counts") + well_colnames <- paste0(subgroup$orig.ident, "_", subgroup$Well_ID) + names(well_colnames) <- rownames(subgroup@meta.data) + colnames(counts_d) <- well_colnames[colnames(counts_d)] + + # edgeR container + gene filtering + y <- edgeR::DGEList(counts_d, group = subgroup$orig.ident) + keep <- rowSums(edgeR::cpm(y) > cpm_filter) >= min_samps + y <- y[keep, , keep.lib.sizes = FALSE] + + # TMMwsp → log2-CPM + y <- edgeR::calcNormFactors(y, method = "TMMwsp") + log_cpm_tmm <- edgeR::cpm(y, log = TRUE, normalized.lib.sizes = TRUE) + + # Long data for boxplot + df_long <- as.data.frame(log_cpm_tmm) |> + tibble::rownames_to_column(var = "gene") |> + tidyr::pivot_longer(-gene, names_to = "sample", values_to = "log_cpm") + + if (make_plots) { + # Boxplot + print( + ggplot2::ggplot(df_long, ggplot2::aes(x = sample, y = log_cpm)) + + ggplot2::geom_boxplot(outlier.size = 0.5) + + ggplot2::labs(x = "Sample", y = "log2 CPM", + title = "Boxplot of log2-CPM (TMMwsp)") + + ggplot2::theme_classic() + + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)) + ) + } + + # Correlation matrices + cors_pear <- stats::cor(log_cpm_tmm, method = "pearson") + cors_spear <- stats::cor(log_cpm_tmm, method = "spearman") + + if (make_plots) { + pheatmap::pheatmap(cors_pear, main = "Sample–sample correlation (Pearson, log2-CPM)") + pheatmap::pheatmap(cors_spear, main = "Sample–sample correlation (Spearman, log2-CPM)") + } + + # Ranking by mean Fisher-z correlation to all *other* samples + R <- stats::cor(log_cpm_tmm, method = corr_method, use = "pairwise.complete.obs") + diag(R) <- NA_real_ + + # clip, Fisher z, average, back-transform + Z <- atanh(pmin(pmax(R, -0.999999), 0.999999)) + score_z <- rowMeans(Z, na.rm = TRUE) + score_r <- tanh(score_z) + + # top-N names and scores + ord <- order(score_r, decreasing = TRUE, na.last = NA) + srt <- score_r[ord] + cutoff <- srt[top_n] # N-th best score + keep <- srt >= cutoff # keep all tied at the cutoff + topN <- srt[keep] + + # return everything useful + list( + subset_obj = subgroup, + dge = y, + log_cpm_tmm = log_cpm_tmm, + boxplot_df = df_long, + cor_pearson = cors_pear, + cor_spearman = cors_spear, + ranking_method = corr_method, + scores_mean_to_others = sort(score_r, decreasing = TRUE), + topN = topN + ) +} + + +``` + +```{r filter_genes_batch24} +# mac_filtered <- filter_genes_by_expression(as_mac, +# group_by = "combined_id", min_counts = 10, +# min_samples = min_sample_num) + +# saveRDS(mac_filtered, +# file = paste0(dir, "DRUGseqData/macpie_filtered_batch24.Rds")) + + +mac_filtered <- readRDS(paste0(dir, "/DRUGseqData/macpie_filtered_batch24.Rds")) +``` + + + +```{r violin_plots_batch24, fig.width=10, fig.height=6} +mac_filtered[["percent.mt"]] <- PercentageFeatureSet(mac_filtered, pattern = "^mt-|^MT-") +mac_filtered[["percent.ribo"]] <- PercentageFeatureSet(mac_filtered, pattern = "^Rp[slp][[:digit:]]|^Rpsa|^RP[SLP][[:digit:]]|^RPSA") +``` + + +```{r} +mac_filtered$combined_id <- str_replace_all(mac_filtered$combined_id, "-","_") +``` + + +### RLE plot + +RLE plot to quickly visualise raw counts of DMSO wells in batch 24 plates + +```{r rle_plot_DMSO_batch24, fig.width=10, fig.height=6} +# mac_filtered_cp <- mac_filtered %>% filter(combined_id %in% c("CB_43_EP73_0","FF_86_NH56_10")) +mac_filtered_cp <- mac_filtered %>% filter(combined_id %in% c("CB_43_EP73_0")) +plot_rle(subset(mac_filtered_cp, subset=orig.ident=="VH02012942"), label_column = "Well_ID", normalisation = "raw") +plot_rle(subset(mac_filtered_cp, subset=orig.ident=="VH02012944"), label_column = "Well_ID", normalisation = "raw") +plot_rle(subset(mac_filtered_cp, subset=orig.ident=="VH02012956"), label_column = "Well_ID", normalisation = "raw") + +``` + + +From the RLE plots above, we can see that there are some level of variation among DMSO wells in each plate. It's important to select the best DMSO wells as controls for downstream analysis. + +### Select good DMSO controls using macpie + +Now we will apply the `select_top_replicates_tmmcpm` function to each of the three plates in batch 24 to identify good DMSO controls. + +This function generates three plots: + + - Boxplot of log2-CPM (TMMwsp) + + - Sample–sample correlation (Pearson, log2-CPM) + + - Sample–sample correlation (Spearman, log2-CPM) + + These plots help to visualize the distribution of gene expression and the correlation between samples, aiding in the assessment of DMSO control quality. + + +#### Plate VH02012942 + +```{r} +dmso_VH02012942 <- select_top_replicates_tmmcpm( + mac_filtered, + combined_id = "CB_43_EP73_0", + orig_ident = "VH02012942", + cpm_filter = 1, + min_samps = 8, + corr_method = "spearman", + top_n = 5, + make_plots = TRUE +) +``` + +Apart from the boxplot and correlation heatmaps, the function also returns a ranking of wells by their mean correlation to all other wells. + +```{r} +dmso_VH02012942$scores_mean_to_others +``` + +Finally, it returns the top N wells as good DMSO controls. + +```{r} +dmso_VH02012942$topN +``` + +Now we can see for plate VH02012942, the 2 of top 5 DMSO wells selected by macpie are I23 and J23, which are exactly the same as the good DMSO wells identified by the DRUGseq authors. + + +Let's repeat the same process for the other two plates in batch 24. + +#### Plate VH02012944 + + +```{r} +dmso_VH02012944 <- select_top_replicates_tmmcpm( + mac_filtered, + combined_id = "CB_43_EP73_0", + orig_ident = "VH02012944", + cpm_filter = 1, + min_samps = 8, + corr_method = "spearman", + top_n = 5, + make_plots = TRUE +) +``` + +```{r} +dmso_VH02012944$scores_mean_to_others +``` +```{r} +dmso_VH02012944$topN +``` + +For plate VH02012944, the DRUGseq selected D23 and H23 DMSO wells are not in our top 5. However, H23 is ranked 6th by macpie, which is very close to the top 5. + + +#### Plate VH02012956 + + +```{r} +dmso_VH02012956 <- select_top_replicates_tmmcpm( + mac_filtered, + combined_id = "CB_43_EP73_0", + orig_ident = "VH02012956", + cpm_filter = 1, + min_samps = 8, + corr_method = "spearman", + top_n = 5, + make_plots = TRUE +) +``` + +```{r} +dmso_VH02012956$scores_mean_to_others +``` + +```{r} +dmso_VH02012956$topN +``` + +For plate VH02012956, the DRUGseq selected I23 and J23 DMSO wells are exactly the same as our top DMSO wells selected by macpie. + + + +```{r} +library(peakRAM) +library(ps) +``` + +### Runtime and Memory + +Now we will measure the runtime and memory usage of the `select_top_replicates_tmmcpm` function using the `peakRAM` package. + + +```{r} +res <- peakRAM({ + dmso_VH02012942 <- select_top_replicates_tmmcpm( + mac_filtered, + combined_id = "CB_43_EP73_0", + orig_ident = "VH02012942", + cpm_filter = 1, + min_samps = 8, + corr_method = "spearman", + top_n = 5, + make_plots = FALSE + ) + dmso_VH02012944 <- select_top_replicates_tmmcpm( + mac_filtered, + combined_id = "CB_43_EP73_0", + orig_ident = "VH02012944", + cpm_filter = 1, + min_samps = 8, + corr_method = "spearman", + top_n = 5, + make_plots = FALSE + ) + + dmso_VH02012956 <- select_top_replicates_tmmcpm( + mac_filtered, + combined_id = "CB_43_EP73_0", + orig_ident = "VH02012956", + cpm_filter = 1, + min_samps = 8, + corr_method = "spearman", + top_n = 5, + make_plots = FALSE + ) + } +) + +res +``` + + +```{r} +if (!requireNamespace("ps", quietly = TRUE)) install.packages("ps") +get_rss_mb <- function() ps::ps_memory_info(ps::ps_handle())[1] / 1024^2 + +rss_before <- get_rss_mb() +tm <- system.time({ + dmso_VH02012942 <- select_top_replicates_tmmcpm( + mac_filtered, + combined_id = "CB_43_EP73_0", + orig_ident = "VH02012942", + cpm_filter = 1, + min_samps = 8, + corr_method = "spearman", + top_n = 5, + make_plots = FALSE + ) + dmso_VH02012944 <- select_top_replicates_tmmcpm( + mac_filtered, + combined_id = "CB_43_EP73_0", + orig_ident = "VH02012944", + cpm_filter = 1, + min_samps = 8, + corr_method = "spearman", + top_n = 5, + make_plots = FALSE + ) + + dmso_VH02012956 <- select_top_replicates_tmmcpm( + mac_filtered, + combined_id = "CB_43_EP73_0", + orig_ident = "VH02012956", + cpm_filter = 1, + min_samps = 8, + corr_method = "spearman", + top_n = 5, + make_plots = FALSE + ) +}) +rss_after <- get_rss_mb() +cat(sprintf("Elapsed: %.1fs | RSS before→after: %.0f→%.0f MiB (Δ %.0f MiB)\n", + tm["elapsed"], rss_before, rss_after, rss_after - rss_before)) + +``` + + +### Summary + +In summary, for the three plates in batch 24, macpie successfully identified 4 out of 6 good DMSO wells that were also selected by the DRUGseq authors. Only for plate VH02012944, one of the DRUGseq selected DMSO wells (D23) was not in our top 5, but the other well (H23) was ranked 6th by macpie, which is very close to the top 5. This demonstrates that macpie is effective in selecting high-quality DMSO controls for downstream analysis without running permutation tests, making it a computationally efficient choice. + +This function runs for each plate, it does not take into account any batches or plates. If it's a cross-plates design, it is recommended to either compute within-plate or remove plate effects (e.g. using ComBat, limma removeBatchEffect functions) first. + +The select_top_replicates_tmmcpm function runs in approximately 1.38 seconds for three plates, with a peak memory usage of around 313 Mib. This demonstrates that the function is efficient and can be used effectively for selecting DMSO controls in a reasonable timeframe without excessive memory consumption. + + + + diff --git a/vignettes/benchmark_DMSOselection.html b/vignettes/benchmark_DMSOselection.html new file mode 100644 index 0000000..9064b5d --- /dev/null +++ b/vignettes/benchmark_DMSOselection.html @@ -0,0 +1,2017 @@ + + + + + + + + + + + + + +Benchmark: DMSO control selection + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+
+
+
+
+ +
+ + + + + + + +
suppressMessages(library(macpie))
+suppressMessages(library(tibble))
+suppressMessages(library(stringr))
+suppressMessages(library(pheatmap))
+suppressMessages(library(ggiraph))
+suppressMessages(library(tidyseurat))
+suppressMessages(library(purrr))
+suppressMessages(library(ggplot2))
+
options(scipen=999, digits=3)
+
+

DRUGseq DMSO control selection

+

First we load the file of good DMSO control wells identified by the +DRUGseq authors in their github repo.

+

Here, we show the good DMSO wells for batch 24, which contains the +three plates we are interested in (VH02012942, VH02012944, +VH02012956).

+
good_DMSO_DRUGseq <- read.csv(paste0(dir, "DRUGseqData/robust_RC_ReferenceControl_DMSO_wells.txt"), sep="")
+good_DMSO_DRUGseq %>% filter(batch_id==24) %>% select(batch_id, plate_barcode, well_id)
+
##   batch_id plate_barcode well_id
+## 1       24    VH02012942     I23
+## 2       24    VH02012942     M23
+## 3       24    VH02012944     D23
+## 4       24    VH02012944     H23
+## 5       24    VH02012956     I23
+## 6       24    VH02012956     J23
+

As from their results of good DMSO controls, these are the good DMSO +wells for batch 24:

+

VH02012942: I23, M23

+

VH02012944: D23, H23

+

VH02012956: I23, J23

+
+


+
+
+

macpie DMSO control selection

+

Now we will use a function select_top_replicates_tmmcpm +to identify good DMSO controls from batch 24.

+

This function:

+
    +
  • applies CPM filtering,

  • +
  • performs TMMwsp normalisation and computes log-CPM,

  • +
  • ranks wells by their mean Fisher-z–transformed sample–sample +correlation to all other wells (Pearson or Spearman)

  • +
  • selects the top n wells (user-defined) as good controls.

  • +
+
select_top_replicates_tmmcpm <- function(
+  data,
+  combined_id,                 # e.g. "CB_43_EP73_0"
+  orig_ident,                  # e.g. "VH02012942"
+  cpm_filter    = 1,           # CPM threshold for gene filtering
+  min_samps     = 16,          # number of samples a gene must be expressed in
+  corr_method   = c("spearman","pearson"),
+  top_n         = 5,
+  make_plots    = TRUE
+){
+  
+  corr_method <- match.arg(corr_method)
+
+  # Subset to the plate/condition of interest
+  subgroup <- subset(data, subset = combined_id == !!combined_id & orig.ident == !!orig_ident)
+
+  # Counts and human-friendly column names
+  counts_d <- GetAssayData(subgroup, assay = "RNA", layer = "counts")
+  well_colnames <- paste0(subgroup$orig.ident, "_", subgroup$Well_ID)
+  names(well_colnames) <- rownames(subgroup@meta.data)
+  colnames(counts_d) <- well_colnames[colnames(counts_d)]
+
+  # edgeR container + gene filtering
+  y <- edgeR::DGEList(counts_d, group = subgroup$orig.ident)
+  keep <- rowSums(edgeR::cpm(y) > cpm_filter) >= min_samps
+  y <- y[keep, , keep.lib.sizes = FALSE]
+
+  # TMMwsp → log2-CPM
+  y <- edgeR::calcNormFactors(y, method = "TMMwsp")
+  log_cpm_tmm <- edgeR::cpm(y, log = TRUE, normalized.lib.sizes = TRUE)
+
+  #  Long data for boxplot
+  df_long <- as.data.frame(log_cpm_tmm) |>
+    tibble::rownames_to_column(var = "gene") |>
+    tidyr::pivot_longer(-gene, names_to = "sample", values_to = "log_cpm")
+
+  if (make_plots) {
+    # Boxplot
+    print(
+      ggplot2::ggplot(df_long, ggplot2::aes(x = sample, y = log_cpm)) +
+        ggplot2::geom_boxplot(outlier.size = 0.5) +
+        ggplot2::labs(x = "Sample", y = "log2 CPM",
+                      title = "Boxplot of log2-CPM (TMMwsp)") +
+        ggplot2::theme_classic() +
+        ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))
+    )
+  }
+
+  # Correlation matrices
+  cors_pear  <- stats::cor(log_cpm_tmm, method = "pearson")
+  cors_spear <- stats::cor(log_cpm_tmm, method = "spearman")
+
+  if (make_plots) {
+    pheatmap::pheatmap(cors_pear,  main = "Sample–sample correlation (Pearson, log2-CPM)")
+    pheatmap::pheatmap(cors_spear, main = "Sample–sample correlation (Spearman, log2-CPM)")
+  }
+
+  # Ranking by mean Fisher-z correlation to all *other* samples
+  R <- stats::cor(log_cpm_tmm, method = corr_method, use = "pairwise.complete.obs")
+  diag(R) <- NA_real_
+
+  # clip, Fisher z, average, back-transform
+  Z <- atanh(pmin(pmax(R, -0.999999), 0.999999))
+  score_z <- rowMeans(Z, na.rm = TRUE)
+  score_r <- tanh(score_z)
+
+  # top-N names and scores
+  ord  <- order(score_r, decreasing = TRUE, na.last = NA)
+  srt <- score_r[ord]
+  cutoff <- srt[top_n]              # N-th best score
+  keep   <- srt >= cutoff    # keep all tied at the cutoff
+  topN <- srt[keep]
+
+  # return everything useful
+  list(
+    subset_obj   = subgroup,
+    dge          = y,
+    log_cpm_tmm  = log_cpm_tmm,
+    boxplot_df   = df_long,
+    cor_pearson  = cors_pear,
+    cor_spearman = cors_spear,
+    ranking_method = corr_method,
+    scores_mean_to_others = sort(score_r, decreasing = TRUE),
+    topN = topN
+  )
+}
+
# mac_filtered <- filter_genes_by_expression(as_mac,
+#                 group_by = "combined_id", min_counts = 10,
+#                 min_samples = min_sample_num)
+
+# saveRDS(mac_filtered,
+# file = paste0(dir, "DRUGseqData/macpie_filtered_batch24.Rds"))
+
+
+mac_filtered <- readRDS(paste0(dir, "/DRUGseqData/macpie_filtered_batch24.Rds"))
+
mac_filtered[["percent.mt"]] <- PercentageFeatureSet(mac_filtered, pattern = "^mt-|^MT-")
+mac_filtered[["percent.ribo"]] <- PercentageFeatureSet(mac_filtered, pattern = "^Rp[slp][[:digit:]]|^Rpsa|^RP[SLP][[:digit:]]|^RPSA")
+
mac_filtered$combined_id <- str_replace_all(mac_filtered$combined_id, "-","_")
+
+

RLE plot

+

RLE plot to quickly visualise raw counts of DMSO wells in batch 24 +plates

+
# mac_filtered_cp <- mac_filtered %>% filter(combined_id %in% c("CB_43_EP73_0","FF_86_NH56_10"))
+mac_filtered_cp <- mac_filtered %>% filter(combined_id %in% c("CB_43_EP73_0"))
+plot_rle(subset(mac_filtered_cp, subset=orig.ident=="VH02012942"), label_column = "Well_ID", normalisation = "raw")
+
## tidyseurat says: Key columns are missing. A data frame is returned for independent data analysis.
+

+
plot_rle(subset(mac_filtered_cp, subset=orig.ident=="VH02012944"), label_column = "Well_ID", normalisation = "raw")
+
## tidyseurat says: Key columns are missing. A data frame is returned for independent data analysis.
+

+
plot_rle(subset(mac_filtered_cp, subset=orig.ident=="VH02012956"), label_column = "Well_ID", normalisation = "raw")
+
## tidyseurat says: Key columns are missing. A data frame is returned for independent data analysis.
+

+

From the RLE plots above, we can see that there are some level of +variation among DMSO wells in each plate. It’s important to select the +best DMSO wells as controls for downstream analysis.

+
+
+

Select good DMSO controls using macpie

+

Now we will apply the select_top_replicates_tmmcpm +function to each of the three plates in batch 24 to identify good DMSO +controls.

+

This function generates three plots:

+
    +
  • Boxplot of log2-CPM (TMMwsp)

  • +
  • Sample–sample correlation (Pearson, log2-CPM)

  • +
  • Sample–sample correlation (Spearman, log2-CPM)

  • +
+

These plots help to visualize the distribution of gene expression and +the correlation between samples, aiding in the assessment of DMSO +control quality.

+
+

Plate VH02012942

+
dmso_VH02012942 <- select_top_replicates_tmmcpm(
+  mac_filtered,
+  combined_id = "CB_43_EP73_0",
+  orig_ident  = "VH02012942",
+  cpm_filter  = 1,
+  min_samps   = 8,
+  corr_method = "spearman",
+  top_n       = 5,
+  make_plots  = TRUE
+)
+
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
+## conversion failure on 'Sample–sample correlation (Pearson, log2-CPM)' in
+## 'mbcsToSbcs': dot substituted for <e2>
+
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
+## conversion failure on 'Sample–sample correlation (Pearson, log2-CPM)' in
+## 'mbcsToSbcs': dot substituted for <80>
+
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
+## conversion failure on 'Sample–sample correlation (Pearson, log2-CPM)' in
+## 'mbcsToSbcs': dot substituted for <93>
+

+
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
+## conversion failure on 'Sample–sample correlation (Spearman, log2-CPM)' in
+## 'mbcsToSbcs': dot substituted for <e2>
+
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
+## conversion failure on 'Sample–sample correlation (Spearman, log2-CPM)' in
+## 'mbcsToSbcs': dot substituted for <80>
+
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
+## conversion failure on 'Sample–sample correlation (Spearman, log2-CPM)' in
+## 'mbcsToSbcs': dot substituted for <93>
+

+

Apart from the boxplot and correlation heatmaps, the function also +returns a ranking of wells by their mean correlation to all other +wells.

+
dmso_VH02012942$scores_mean_to_others
+
## VH02012942_K23 VH02012942_M23 VH02012942_J23 VH02012942_I23 VH02012942_C23 
+##          0.878          0.877          0.875          0.874          0.872 
+## VH02012942_N23 VH02012942_L23 VH02012942_E23 VH02012942_G23 VH02012942_H23 
+##          0.870          0.869          0.869          0.866          0.863 
+## VH02012942_O23 VH02012942_D23 VH02012942_P23 VH02012942_F23 VH02012942_B23 
+##          0.860          0.858          0.858          0.851          0.849 
+## VH02012942_A23 
+##          0.843
+

Finally, it returns the top N wells as good DMSO controls.

+
dmso_VH02012942$topN
+
## VH02012942_K23 VH02012942_M23 VH02012942_J23 VH02012942_I23 VH02012942_C23 
+##          0.878          0.877          0.875          0.874          0.872
+

Now we can see for plate VH02012942, the 2 of top 5 DMSO wells +selected by macpie are I23 and J23, which are exactly the same as the +good DMSO wells identified by the DRUGseq authors.

+

Let’s repeat the same process for the other two plates in batch +24.

+
+
+

Plate VH02012944

+
dmso_VH02012944 <- select_top_replicates_tmmcpm(
+  mac_filtered,
+  combined_id = "CB_43_EP73_0",
+  orig_ident  = "VH02012944",
+  cpm_filter  = 1,
+  min_samps   = 8,
+  corr_method = "spearman",
+  top_n       = 5,
+  make_plots  = TRUE
+)
+
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
+## conversion failure on 'Sample–sample correlation (Pearson, log2-CPM)' in
+## 'mbcsToSbcs': dot substituted for <e2>
+
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
+## conversion failure on 'Sample–sample correlation (Pearson, log2-CPM)' in
+## 'mbcsToSbcs': dot substituted for <80>
+
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
+## conversion failure on 'Sample–sample correlation (Pearson, log2-CPM)' in
+## 'mbcsToSbcs': dot substituted for <93>
+

+
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
+## conversion failure on 'Sample–sample correlation (Spearman, log2-CPM)' in
+## 'mbcsToSbcs': dot substituted for <e2>
+
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
+## conversion failure on 'Sample–sample correlation (Spearman, log2-CPM)' in
+## 'mbcsToSbcs': dot substituted for <80>
+
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
+## conversion failure on 'Sample–sample correlation (Spearman, log2-CPM)' in
+## 'mbcsToSbcs': dot substituted for <93>
+

+
dmso_VH02012944$scores_mean_to_others
+
## VH02012944_I23 VH02012944_M23 VH02012944_J23 VH02012944_G23 VH02012944_K23 
+##          0.906          0.904          0.903          0.902          0.901 
+## VH02012944_H23 VH02012944_C23 VH02012944_L23 VH02012944_E23 VH02012944_D23 
+##          0.900          0.899          0.898          0.897          0.897 
+## VH02012944_O23 VH02012944_F23 VH02012944_N23 VH02012944_P23 VH02012944_A23 
+##          0.897          0.897          0.895          0.890          0.887 
+## VH02012944_B23 
+##          0.880
+
dmso_VH02012944$topN
+
## VH02012944_I23 VH02012944_M23 VH02012944_J23 VH02012944_G23 VH02012944_K23 
+##          0.906          0.904          0.903          0.902          0.901
+

For plate VH02012944, the DRUGseq selected D23 and H23 DMSO wells are +not in our top 5. However, H23 is ranked 6th by macpie, which is very +close to the top 5.

+
+
+

Plate VH02012956

+
dmso_VH02012956 <- select_top_replicates_tmmcpm(
+  mac_filtered,
+  combined_id = "CB_43_EP73_0",
+  orig_ident  = "VH02012956",
+  cpm_filter  = 1,
+  min_samps   = 8,
+  corr_method = "spearman",
+  top_n       = 5,
+  make_plots  = TRUE
+)
+
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
+## conversion failure on 'Sample–sample correlation (Pearson, log2-CPM)' in
+## 'mbcsToSbcs': dot substituted for <e2>
+
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
+## conversion failure on 'Sample–sample correlation (Pearson, log2-CPM)' in
+## 'mbcsToSbcs': dot substituted for <80>
+
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
+## conversion failure on 'Sample–sample correlation (Pearson, log2-CPM)' in
+## 'mbcsToSbcs': dot substituted for <93>
+

+
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
+## conversion failure on 'Sample–sample correlation (Spearman, log2-CPM)' in
+## 'mbcsToSbcs': dot substituted for <e2>
+
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
+## conversion failure on 'Sample–sample correlation (Spearman, log2-CPM)' in
+## 'mbcsToSbcs': dot substituted for <80>
+
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
+## conversion failure on 'Sample–sample correlation (Spearman, log2-CPM)' in
+## 'mbcsToSbcs': dot substituted for <93>
+

+
dmso_VH02012956$scores_mean_to_others
+
## VH02012956_J23 VH02012956_O23 VH02012956_M23 VH02012956_I23 VH02012956_K23 
+##          0.912          0.904          0.902          0.902          0.901 
+## VH02012956_N23 VH02012956_G23 VH02012956_H23 VH02012956_L23 VH02012956_P23 
+##          0.901          0.900          0.899          0.899          0.897 
+## VH02012956_F23 VH02012956_C23 VH02012956_D23 VH02012956_E23 VH02012956_A23 
+##          0.894          0.893          0.892          0.891          0.888 
+## VH02012956_B23 
+##          0.886
+
dmso_VH02012956$topN
+
## VH02012956_J23 VH02012956_O23 VH02012956_M23 VH02012956_I23 VH02012956_K23 
+##          0.912          0.904          0.902          0.902          0.901
+

For plate VH02012956, the DRUGseq selected I23 and J23 DMSO wells are +exactly the same as our top DMSO wells selected by macpie.

+
library(peakRAM)
+library(ps)
+
+
+
+

Runtime and Memory

+

Now we will measure the runtime and memory usage of the +select_top_replicates_tmmcpm function using the +peakRAM package.

+
res <- peakRAM({
+  dmso_VH02012942 <- select_top_replicates_tmmcpm(
+    mac_filtered,
+    combined_id = "CB_43_EP73_0",
+    orig_ident  = "VH02012942",
+    cpm_filter  = 1,
+    min_samps   = 8,
+    corr_method = "spearman",
+    top_n       = 5,
+    make_plots  = FALSE
+  )
+  dmso_VH02012944 <- select_top_replicates_tmmcpm(
+    mac_filtered,
+    combined_id = "CB_43_EP73_0",
+    orig_ident  = "VH02012944",
+    cpm_filter  = 1,
+    min_samps   = 8,
+    corr_method = "spearman",
+    top_n       = 5,
+    make_plots  = FALSE
+  )
+  
+  dmso_VH02012956 <- select_top_replicates_tmmcpm(
+    mac_filtered,
+    combined_id = "CB_43_EP73_0",
+    orig_ident  = "VH02012956",
+    cpm_filter  = 1,
+    min_samps   = 8,
+    corr_method = "spearman",
+    top_n       = 5,
+    make_plots  = FALSE
+  )
+  }
+)
+
+res
+
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             Function_Call
+## 1 {dmso_VH02012942<-select_top_replicates_tmmcpm(mac_filtered,combined_id="CB_43_EP73_0",orig_ident="VH02012942",cpm_filter=1,min_samps=8,corr_method="spearman",top_n=5,make_plots=FALSE)dmso_VH02012944<-select_top_replicates_tmmcpm(mac_filtered,combined_id="CB_43_EP73_0",orig_ident="VH02012944",cpm_filter=1,min_samps=8,corr_method="spearman",top_n=5,make_plots=FALSE)dmso_VH02012956<-select_top_replicates_tmmcpm(mac_filtered,combined_id="CB_43_EP73_0",orig_ident="VH02012956",cpm_filter=1,min_samps=8,corr_method="spearman",top_n=5,make_plots=FALSE)}
+##   Elapsed_Time_sec Total_RAM_Used_MiB Peak_RAM_Used_MiB
+## 1             1.34                9.4               313
+
if (!requireNamespace("ps", quietly = TRUE)) install.packages("ps")
+get_rss_mb <- function() ps::ps_memory_info(ps::ps_handle())[1] / 1024^2
+
+rss_before <- get_rss_mb()
+tm <- system.time({
+  dmso_VH02012942 <- select_top_replicates_tmmcpm(
+    mac_filtered,
+    combined_id = "CB_43_EP73_0",
+    orig_ident  = "VH02012942",
+    cpm_filter  = 1,
+    min_samps   = 8,
+    corr_method = "spearman",
+    top_n       = 5,
+    make_plots  = FALSE
+  )
+  dmso_VH02012944 <- select_top_replicates_tmmcpm(
+    mac_filtered,
+    combined_id = "CB_43_EP73_0",
+    orig_ident  = "VH02012944",
+    cpm_filter  = 1,
+    min_samps   = 8,
+    corr_method = "spearman",
+    top_n       = 5,
+    make_plots  = FALSE
+  )
+  
+  dmso_VH02012956 <- select_top_replicates_tmmcpm(
+    mac_filtered,
+    combined_id = "CB_43_EP73_0",
+    orig_ident  = "VH02012956",
+    cpm_filter  = 1,
+    min_samps   = 8,
+    corr_method = "spearman",
+    top_n       = 5,
+    make_plots  = FALSE
+  )
+})
+rss_after  <- get_rss_mb()
+cat(sprintf("Elapsed: %.1fs | RSS before→after: %.0f→%.0f MiB (Δ %.0f MiB)\n",
+            tm["elapsed"], rss_before, rss_after, rss_after - rss_before))
+
## Elapsed: 1.4s | RSS before→after: 2436→2431 MiB (Δ -5 MiB)
+
+
+

Summary

+

In summary, for the three plates in batch 24, macpie successfully +identified 4 out of 6 good DMSO wells that were also selected by the +DRUGseq authors. Only for plate VH02012944, one of the DRUGseq selected +DMSO wells (D23) was not in our top 5, but the other well (H23) was +ranked 6th by macpie, which is very close to the top 5. This +demonstrates that macpie is effective in selecting high-quality DMSO +controls for downstream analysis without running permutation tests, +making it a computationally efficient choice.

+

This function runs for each plate, it does not take into account any +batches or plates. If it’s a cross-plates design, it is recommended to +either compute within-plate or remove plate effects (e.g. using ComBat, +limma removeBatchEffect functions) first.

+

The select_top_replicates_tmmcpm function runs in approximately 1.38 +seconds for three plates, with a peak memory usage of around 313 Mib. +This demonstrates that the function is efficient and can be used +effectively for selecting DMSO controls in a reasonable timeframe +without excessive memory consumption.

+
+
+ + + +
+
+ +
+ + + + + + + + + + + + + + + + diff --git a/vignettes/check_zero_inflation.Rmd b/vignettes/check_zero_inflation.Rmd index b74f9e9..724ad3b 100644 --- a/vignettes/check_zero_inflation.Rmd +++ b/vignettes/check_zero_inflation.Rmd @@ -173,7 +173,7 @@ Each of the columns in the summary table are defined as follows: - median_p0_nb: Median expected zero proportion under the NB model across genes in the group - - median_ZI: Median zero-inflation index (ZI = p0_nb - p0_obs for each gene) across genes in the group + - median_ZI: Median zero-inflation index (ZI = p0_obs - p0_nb for each gene) across genes in the group - observed_zeros_num: Number of data points with observed zeros (it shouldn't be more than n_genes*n_wells for each group) @@ -186,11 +186,67 @@ Each of the columns in the summary table are defined as follows: ***
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + #### See below for interpretation of the results For DMSO vs camptothecin, we can see that both DMSO and camptothecin groups have negative median ZI values, indicating that there is no significant zero-inflation in either group. Percentage of genes with ZI greater than 0.1 or 0.2 are 13.5% and 6.6% for camptothecin. As camptothecin acts as a DNA topoisomerase inhibitor and induces DNA damage, it can lead to cell cycle arrest and apoptosis, which may reduce the overall transcriptional activity in the cells. This reduction in transcriptional activity can result in genes being expressed at lower levels or not at all. On the other hand, DMSO is a common solvent used in biological experiments and is not expected to induce significant changes in gene expression. Therefore, it is reasonable to observe a lower level of zero-inflation in the DMSO group compared to the camptothecin group. Same as ZI results for DMSO, the percentage of genes with ZI greater than 0.1 or 0.2 are both 0%. -Besides, the observed zeros are also lower than the expected zeros, further supporting the absence of zero-inflation. Overall, these results suggest that the data does not exhibit significant zero-inflation, and a standard negative binomial model may be appropriate for downstream analyses. Zero-inflated models (ZINB) may not be necessary for this dataset. +Besides, the observed zeros are also lower than the expected zeros, further supporting the absence of zero-inflation. Overall, these results suggest that the data does not exhibit significant zero-inflation, and a standard negative binomial model may be appropriate for downstream analyses. Zero-inflated models (ZINB) may not be necessary for this data set. @@ -236,8 +292,120 @@ zi_results$gene_metrics_by_group %>% filter(group=="Staurosporine_10") %>% head( ```{r} zi_results$summary_by_group ``` + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + #### See below for interpretation of the results For DMSO vs staurosporine, we can see that only staurosporine has 14% of median ZI value, the observed zeros number (713) is much higher than the expected zeros number (~355), indicating that there is some level of zero-inflation in the staurosporine group. As staurosporine is a cell death control, which it is expected to see zero-inflated data in this group. In this case, it may be appropriate to consider using a zero-inflated negative binomial (ZINB) model for downstream analyses to account for the excess zeros in the data. + + + +### All treatment groups + +Finally, we will check for zero-inflation across all treatment groups in the data. + +```{r} +# Check for zero-inflation +all_conditions <- unique(sub_mac$combined_id) +zi_results <- check_zeroinflation(sub_mac, + group_by = "combined_id", + samples = all_conditions, + batch = 1, + cutoffs = c(0.1, 0.2)) +``` + +#### View gene-wise metrics for each group + +```{r} +zi_results$gene_metrics_by_group %>% filter(group=="Luminespib_10") %>% head() + +``` + + +```{r} +zi_results$summary_by_group %>% head(10) +``` + +```{r} +high_doses_zi_results <- zi_results$summary_by_group %>% filter(grepl("_10$", group)) +#concatenate with DMSO +high_doses_zi_results <- rbind(high_doses_zi_results, + zi_results$summary_by_group %>% filter(group=="DMSO_0")) + + +long_zi_results <- high_doses_zi_results %>% select(group, median_p0_obs, median_p0_nb, median_ZI) %>% + pivot_longer(cols = c(median_p0_obs, median_p0_nb, median_ZI), + names_to = "metric", + values_to = "value") + +library(ggplot2) +ggplot(long_zi_results, aes(x = group, y = value, fill = metric)) + + geom_bar(stat = "identity", position = "dodge") + + labs(title = "Zero-Inflation Metrics by Treatment Groups (10uM) and DMSO", + x = "Groups", + y = "zero-inflation proportion") + + theme_minimal() + + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + + scale_fill_manual(values = macpie_colours$discrete[1:3]) +``` + +#### See below for interpretation of the results + + - median_p0_obs: Median observed zero proportion across genes in the group + + - median_p0_nb: Median expected zero proportion under the NB model across genes in the group + + - median_ZI: Median zero-inflation index (ZI = p0_obs - p0_nb for each gene) across genes in the group + +From the plot, we can see that most treatment groups have small or even negative median ZI values, indicating that there is no significant zero-inflation in these groups. Only Staurosporine_10 shows positive median ZI values, suggesting some level of zero-inflation in the group. As described above, staurosporine is a potent inducer of apoptosis, which can lead to increased cell death and reduced gene expression, resulting in more zero counts in the data. Therefore, it is reasonable to observe excessive zeros in the staurosporine group. For other treatment groups, the small or even negative median ZI values suggest that a standard negative binomial model may be appropriate for downstream analyses. Zero-inflated models (ZINB) may not be necessary for these groups. diff --git a/vignettes/check_zero_inflation.html b/vignettes/check_zero_inflation.html new file mode 100644 index 0000000..cc08ea4 --- /dev/null +++ b/vignettes/check_zero_inflation.html @@ -0,0 +1,2051 @@ + + + + + + + + + + + + + +Check for zero-inflation in your data + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+
+
+
+
+ +
+ + + + + + + +
+

Overview

+

This is a quick demonstration of the +check_zeroinflation() function from our +macpie package. This function is a fast diagnostic tool +to help you assess whether your MACseq data exhibits zero-inflation +relative to a negative binomial (NB) model.

+

We will use a lightweight convenience wrapper +subsample_genes() around +seqgendiff::select_counts() to create a +smaller object with a subset of genes for faster computation. This +function is also part of the macpie package.

+

Under the hood, check_zeroinflation() uses +edgeR to estimate gene-wise dispersions and compute +expected zero probabilities under a NB model. It then compares the +observed and expected zero-inflated indexes for each gene within each +group defined in the metadata.

+

This function:

+
    +
  • estimates gene-wise tagwise dispersions with edgeR (using all +selected groups),

  • +
  • builds NB-expected zero probabilities from TMMwsp-scaled means, +and

  • +
  • returns per-gene ZI (observed zeros minus NB-expected zeros) and +per-group summaries (e.g., % genes with ZI > 0.05). ZI-cutoffs are +user-defined.

  • +
+

The output is a list with two elements:

+
    +
  • summary_by_group: A summary table showing the number +and percentage of genes that are zero-inflated for each group at +specified cutoffs.

  • +
  • gene_metrics_by_group: A detailed table with +gene-wise metrics, including observed and expected zero numbers and +proportions, zero-inflation index (ZI), mean count, and dispersion +estimates for each gene in each group.

  • +
+

Note: check_zeroinflation() relies on +edgeR to estimate dispersion. The current implementation requires ≥2 +groups in the design so that edgeR can stabilize gene-wise dispersions +across groups. If you only have a single group and still want a +design-aware baseline for expected zeros, fit a Gamma–Poisson/NB GLM and +compute the expected zero probabilities from its fitted means and +over-dispersion.

+
+


+
+
+

Load data

+

Load data and preprocess before subsampling genes and checking for +zero-inflation.

+
#install.packages("macpie")  # or devtools::install_github("PMCC-BioinformaticsCore/macpie")
+library(macpie)
+
+# Define project variables
+project_name <- "PMMSq033"
+project_metadata <- system.file("extdata/PMMSq033_metadata.csv", package = "macpie")
+# Load metadata
+metadata <- read_metadata(project_metadata)
+
+
# Import raw data
+project_rawdata <- paste0(dir, "/macpieData/PMMSq033/raw_matrix")
+project_name <- "PMMSq033"
+raw_counts <- Read10X(data.dir = project_rawdata)
+# Create tidySeurat object
+mac <- CreateSeuratObject(counts = raw_counts,
+                          project = project_name,
+                          min.cells = 1,
+                          min.features = 1)
+#> Warning: Feature names cannot have underscores ('_'), replacing with dashes
+#> ('-')
+# Join with metadata
+mac <- mac %>%
+  inner_join(metadata, by = c(".cell" = "Barcode"))
+# Add unique identifier
+mac <- mac %>%
+  mutate(combined_id = str_c(Compound_ID, Concentration_1, sep = "_")) %>%
+  mutate(combined_id = gsub(" ", "", .data$combined_id))
+mac <- mac %>%
+  filter(Project == "Current")
+
+

Filter genes with very low counts across all samples. This step is +important because genes with extremely low expression can lead to +unreliable estimates of dispersion and expected zero probabilities, +which can skew the zero-inflation assessment.

+
# Filter by read count per sample group
+mac <- filter_genes_by_expression(mac, 
+                                  group_by = "combined_id", 
+                                  min_counts = 10, 
+                                  min_samples = 2)
+
+
+

Subsample genes for faster computation

+

This is to randomly select a subset of 1000 genes for a quick check. +For a more comprehensive analysis, consider using a larger subset.

+
# Subsample genes for faster computation
+sub_mac <- subsample_genes(mac, ngene = 1000, gselect = "random", seed = 1)
+sub_mac %>% nrow()
+#> [1] 1000
+
+
+

Check for zero-inflation

+

We can now run the zero-inflation check on the subsampled data.

+
+

DMSO vs Camptothecin

+

First, we will compare two groups: “DMSO_0” and +“CAMPTOTHECIN_10”.

+
zi_results <- check_zeroinflation(sub_mac, 
+                                  group_by = "combined_id",
+                                  samples = c("DMSO_0","CAMPTOTHECIN_10"),
+                                  batch = 1,
+                                  cutoffs = c(0.1, 0.2))
+
+
+

View gene-wise metrics for each group

+

For DMSO group:

+
zi_results$gene_metrics_by_group %>% filter(group=="DMSO_0") %>% head()
+#>           group     gene mean_count_group dispersion p0_obs obs_zeros_num
+#> MTND1P23 DMSO_0 MTND1P23             9.68   9.77e-05      0             0
+#> MTATP6P1 DMSO_0 MTATP6P1             7.26   9.77e-05      0             0
+#> ISG15    DMSO_0    ISG15             9.58   9.77e-05      0             0
+#> PUSL1    DMSO_0    PUSL1             8.63   9.77e-05      0             0
+#> DVL1     DMSO_0     DVL1             8.05   9.77e-05      0             0
+#> MRPL20   DMSO_0   MRPL20             4.16   9.77e-05      0             0
+#>             p0_nb expected_zeros_num        ZI
+#> MTND1P23 8.79e-05            0.00167 -8.79e-05
+#> MTATP6P1 8.47e-04            0.01610 -8.47e-04
+#> ISG15    9.69e-05            0.00184 -9.69e-05
+#> PUSL1    2.34e-04            0.00445 -2.34e-04
+#> DVL1     4.03e-04            0.00765 -4.03e-04
+#> MRPL20   1.66e-02            0.31555 -1.66e-02
+

For Camptothecin group:

+
zi_results$gene_metrics_by_group %>% filter(group=="CAMPTOTHECIN_10") %>% head()
+#>                     group     gene mean_count_group dispersion p0_obs
+#> MTND1P231 CAMPTOTHECIN_10 MTND1P23             7.67   9.77e-05      0
+#> MTATP6P11 CAMPTOTHECIN_10 MTATP6P1             5.00   9.77e-05      0
+#> ISG151    CAMPTOTHECIN_10    ISG15             7.33   9.77e-05      0
+#> PUSL11    CAMPTOTHECIN_10    PUSL1             2.00   9.77e-05      0
+#> DVL11     CAMPTOTHECIN_10     DVL1             0.00   9.77e-05      1
+#> MRPL201   CAMPTOTHECIN_10   MRPL20            10.33   9.77e-05      0
+#>           obs_zeros_num    p0_nb expected_zeros_num        ZI
+#> MTND1P231             0 0.001184           0.003553 -0.001184
+#> MTATP6P11             0 0.010444           0.031333 -0.010444
+#> ISG151                0 0.001546           0.004638 -0.001546
+#> PUSL11                0 0.145972           0.437915 -0.145972
+#> DVL11                 3 1.000000           3.000000  0.000000
+#> MRPL201               0 0.000146           0.000438 -0.000146
+
+
+

View summary statistics for each group

+
zi_results$summary_by_group
+#>                           group n_genes n_wells median_p0_obs median_p0_nb
+#> CAMPTOTHECIN_10 CAMPTOTHECIN_10    1000       3             0     0.058845
+#> DMSO_0                   DMSO_0    1000      19             0     0.000491
+#>                 median_ZI observed_zeros_num expected_zeros_num pct_ZI_gt_0.1
+#> CAMPTOTHECIN_10 -0.013835                434              487.9         0.135
+#> DMSO_0          -0.000445                 17               55.2         0.000
+#>                 pct_ZI_gt_0.2
+#> CAMPTOTHECIN_10         0.066
+#> DMSO_0                  0.000
+

From the summary table, we can see the summary statistics for each +group, including the number and percentage of genes that are +zero-inflated at the specified cutoffs.

+

Each of the columns in the summary table are defined as follows:

+
    +
  • group: The treatment group

  • +
  • n_genes: Total number of genes analyzed in the group

  • +
  • n_wells: Total number of wells/samples in the group

  • +
  • median_p0_obs: Median observed zero proportion across genes in +the group

  • +
  • median_p0_nb: Median expected zero proportion under the NB model +across genes in the group

  • +
  • median_ZI: Median zero-inflation index (ZI = p0_obs - p0_nb for +each gene) across genes in the group

  • +
  • observed_zeros_num: Number of data points with observed zeros (it +shouldn’t be more than n_genes*n_wells for each group)

  • +
  • expected_zeros_num: Number of data points with expected zeros +under the NB model (same here, it shouldn’t be more than n_genes*n_wells +for each group)

  • +
  • pct_ZI_gt_0.1: Percentage of genes with ZI greater than +0.1

  • +
  • pct_ZI_gt_0.2: Percentage of genes with ZI greater than +0.2

  • +
+
+


+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+

See below for interpretation of the results

+

For DMSO vs camptothecin, we can see that both DMSO and camptothecin +groups have negative median ZI values, indicating that there is no +significant zero-inflation in either group. Percentage of genes with ZI +greater than 0.1 or 0.2 are 13.5% and 6.6% for camptothecin. As +camptothecin acts as a DNA topoisomerase inhibitor and induces DNA +damage, it can lead to cell cycle arrest and apoptosis, which may reduce +the overall transcriptional activity in the cells. This reduction in +transcriptional activity can result in genes being expressed at lower +levels or not at all. On the other hand, DMSO is a common solvent used +in biological experiments and is not expected to induce significant +changes in gene expression. Therefore, it is reasonable to observe a +lower level of zero-inflation in the DMSO group compared to the +camptothecin group. Same as ZI results for DMSO, the percentage of genes +with ZI greater than 0.1 or 0.2 are both 0%.

+

Besides, the observed zeros are also lower than the expected zeros, +further supporting the absence of zero-inflation. Overall, these results +suggest that the data does not exhibit significant zero-inflation, and a +standard negative binomial model may be appropriate for downstream +analyses. Zero-inflated models (ZINB) may not be necessary for this data +set.

+
+


+
+
+
+

DMSO vs Staruosporine

+

Next, we will compare two treatment groups: “DMSO_0” and +“Staurosporine_10”. As staurosporine is a potent inducer of apoptosis, +we might expect to see more zero-inflation in this group compared to +DMSO.

+
# Check for zero-inflation
+zi_results <- check_zeroinflation(sub_mac, 
+                                  group_by = "combined_id",
+                                  samples = c("DMSO_0","Staurosporine_10"),
+                                  batch = 1,
+                                  cutoffs = c(0.1, 0.2))
+
+

View gene-wise metrics for each group

+

For DMSO group:

+
zi_results$gene_metrics_by_group %>% filter(group=="DMSO_0") %>% head()
+#>           group     gene mean_count_group dispersion p0_obs obs_zeros_num
+#> MTND1P23 DMSO_0 MTND1P23             9.68   9.77e-05      0             0
+#> MTATP6P1 DMSO_0 MTATP6P1             7.26   4.24e-02      0             0
+#> ISG15    DMSO_0    ISG15             9.58   9.77e-05      0             0
+#> PUSL1    DMSO_0    PUSL1             8.63   9.77e-05      0             0
+#> DVL1     DMSO_0     DVL1             8.05   9.77e-05      0             0
+#> MRPL20   DMSO_0   MRPL20             4.16   5.22e-02      0             0
+#>             p0_nb expected_zeros_num        ZI
+#> MTND1P23 8.79e-05            0.00167 -8.79e-05
+#> MTATP6P1 1.99e-03            0.03784 -1.99e-03
+#> ISG15    9.69e-05            0.00184 -9.69e-05
+#> PUSL1    2.34e-04            0.00445 -2.34e-04
+#> DVL1     4.03e-04            0.00765 -4.03e-04
+#> MRPL20   2.42e-02            0.46018 -2.42e-02
+

For Staurosporine group:

+
zi_results$gene_metrics_by_group %>% filter(group=="Staurosporine_10") %>% head()
+#>                      group     gene mean_count_group dispersion p0_obs
+#> MTND1P231 Staurosporine_10 MTND1P23             4.33   9.77e-05  0.000
+#> MTATP6P11 Staurosporine_10 MTATP6P1            10.67   4.24e-02  0.000
+#> ISG151    Staurosporine_10    ISG15             5.00   9.77e-05  0.000
+#> PUSL11    Staurosporine_10    PUSL1             2.00   9.77e-05  0.667
+#> DVL11     Staurosporine_10     DVL1             2.33   9.77e-05  0.000
+#> MRPL201   Staurosporine_10   MRPL20             5.00   5.22e-02  0.000
+#>           obs_zeros_num   p0_nb expected_zeros_num       ZI
+#> MTND1P231             0 0.07082             0.2125 -0.07082
+#> MTATP6P11             0 0.00657             0.0197 -0.00657
+#> ISG151                0 0.05190             0.1557 -0.05190
+#> PUSL11                2 0.23189             0.6957  0.43478
+#> DVL11                 0 0.19278             0.5784 -0.19278
+#> MRPL201               0 0.05974             0.1792 -0.05974
+
+
+

View summary statistics for each group

+
zi_results$summary_by_group
+#>                             group n_genes n_wells median_p0_obs median_p0_nb
+#> DMSO_0                     DMSO_0    1000      19         0.000     0.000491
+#> Staurosporine_10 Staurosporine_10    1000       3         0.333     0.082998
+#>                  median_ZI observed_zeros_num expected_zeros_num pct_ZI_gt_0.1
+#> DMSO_0           -0.000457                 17               77.4         0.000
+#> Staurosporine_10  0.140550                713              354.9         0.541
+#>                  pct_ZI_gt_0.2
+#> DMSO_0                   0.000
+#> Staurosporine_10         0.409
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+

See below for interpretation of the results

+

For DMSO vs staurosporine, we can see that only staurosporine has 14% +of median ZI value, the observed zeros number (713) is much higher than +the expected zeros number (~355), indicating that there is some level of +zero-inflation in the staurosporine group. As staurosporine is a cell +death control, which it is expected to see zero-inflated data in this +group. In this case, it may be appropriate to consider using a +zero-inflated negative binomial (ZINB) model for downstream analyses to +account for the excess zeros in the data.

+
+
+
+

All treatment groups

+

Finally, we will check for zero-inflation across all treatment groups +in the data.

+
# Check for zero-inflation
+all_conditions <- unique(sub_mac$combined_id)
+zi_results <- check_zeroinflation(sub_mac, 
+                                  group_by = "combined_id",
+                                  samples = all_conditions,
+                                  batch = 1,
+                                  cutoffs = c(0.1, 0.2))
+
+

View gene-wise metrics for each group

+
zi_results$gene_metrics_by_group %>% filter(group=="Luminespib_10") %>% head()
+#>                    group     gene mean_count_group dispersion p0_obs
+#> MTND1P2364 Luminespib_10 MTND1P23             8.67   9.77e-05      0
+#> MTATP6P164 Luminespib_10 MTATP6P1             5.67   1.64e-02      0
+#> ISG1564    Luminespib_10    ISG15             7.00   9.77e-05      0
+#> PUSL164    Luminespib_10    PUSL1             8.00   9.77e-05      0
+#> DVL164     Luminespib_10     DVL1             5.33   9.77e-05      0
+#> MRPL2064   Luminespib_10   MRPL20             8.67   4.96e-02      0
+#>            obs_zeros_num    p0_nb expected_zeros_num        ZI
+#> MTND1P2364             0 0.000301           0.000904 -0.000301
+#> MTATP6P164             0 0.005591           0.016773 -0.005591
+#> ISG1564                0 0.001346           0.004038 -0.001346
+#> PUSL164                0 0.000547           0.001641 -0.000547
+#> DVL164                 0 0.006143           0.018428 -0.006143
+#> MRPL2064               0 0.001014           0.003043 -0.001014
+
zi_results$summary_by_group %>% head(10)
+#>                               group n_genes n_wells median_p0_obs median_p0_nb
+#> 5-Azacytidine_0.1 5-Azacytidine_0.1    1000       3             0     0.001313
+#> 5-Azacytidine_1     5-Azacytidine_1    1000       3             0     0.000833
+#> 5-Azacytidine_10   5-Azacytidine_10    1000       3             0     0.002611
+#> Abemaciclib_0.1     Abemaciclib_0.1    1000       3             0     0.000542
+#> Abemaciclib_1         Abemaciclib_1    1000       3             0     0.001257
+#> Abemaciclib_10       Abemaciclib_10    1000       3             0     0.011172
+#> Adavosertib_0.1     Adavosertib_0.1    1000       3             0     0.001502
+#> Adavosertib_1         Adavosertib_1    1000       3             0     0.001106
+#> Adavosertib_10       Adavosertib_10    1000       3             0     0.003791
+#> ANASTROZOLE_0.1     ANASTROZOLE_0.1    1000       3             0     0.000475
+#>                   median_ZI observed_zeros_num expected_zeros_num pct_ZI_gt_0.1
+#> 5-Azacytidine_0.1 -0.001313                  8               32.2         0.007
+#> 5-Azacytidine_1   -0.000694                 12               28.7         0.007
+#> 5-Azacytidine_10  -0.001909                 59               87.1         0.030
+#> Abemaciclib_0.1   -0.000542                  5               18.4         0.005
+#> Abemaciclib_1     -0.000974                 16               30.6         0.013
+#> Abemaciclib_10    -0.007505                112               86.7         0.098
+#> Adavosertib_0.1   -0.001502                  7               19.8         0.007
+#> Adavosertib_1     -0.001098                 13               23.5         0.012
+#> Adavosertib_10    -0.002832                 46               65.1         0.037
+#> ANASTROZOLE_0.1   -0.000475                  1               26.2         0.001
+#>                   pct_ZI_gt_0.2
+#> 5-Azacytidine_0.1         0.007
+#> 5-Azacytidine_1           0.004
+#> 5-Azacytidine_10          0.018
+#> Abemaciclib_0.1           0.005
+#> Abemaciclib_1             0.012
+#> Abemaciclib_10            0.084
+#> Adavosertib_0.1           0.006
+#> Adavosertib_1             0.010
+#> Adavosertib_10            0.025
+#> ANASTROZOLE_0.1           0.001
+
high_doses_zi_results <- zi_results$summary_by_group %>% filter(grepl("_10$", group))
+#concatenate with DMSO
+high_doses_zi_results <- rbind(high_doses_zi_results,
+                                zi_results$summary_by_group %>% filter(group=="DMSO_0"))
+
+
+long_zi_results <- high_doses_zi_results %>% select(group, median_p0_obs, median_p0_nb, median_ZI) %>%
+  pivot_longer(cols = c(median_p0_obs, median_p0_nb, median_ZI), 
+               names_to = "metric", 
+               values_to = "value")
+
+library(ggplot2)
+ggplot(long_zi_results, aes(x = group, y = value, fill = metric)) +
+  geom_bar(stat = "identity", position = "dodge") +
+  labs(title = "Zero-Inflation Metrics by Treatment Groups (10uM) and DMSO",
+       x = "Groups",
+       y = "zero-inflation proportion") +
+  theme_minimal() +
+  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
+  scale_fill_manual(values = macpie_colours$discrete[1:3])
+

+
+
+

See below for interpretation of the results

+
    +
  • median_p0_obs: Median observed zero proportion across genes in +the group

  • +
  • median_p0_nb: Median expected zero proportion under the NB model +across genes in the group

  • +
  • median_ZI: Median zero-inflation index (ZI = p0_obs - p0_nb for +each gene) across genes in the group

  • +
+

From the plot, we can see that most treatment groups have small or +even negative median ZI values, indicating that there is no significant +zero-inflation in these groups. Only Staurosporine_10 shows positive +median ZI values, suggesting some level of zero-inflation in the group. +As described above, staurosporine is a potent inducer of apoptosis, +which can lead to increased cell death and reduced gene expression, +resulting in more zero counts in the data. Therefore, it is reasonable +to observe excessive zeros in the staurosporine group. For other +treatment groups, the small or even negative median ZI values suggest +that a standard negative binomial model may be appropriate for +downstream analyses. Zero-inflated models (ZINB) may not be necessary +for these groups.

+
+
+
+ + + +
+
+ +
+ + + + + + + + + + + + + + + + diff --git a/vignettes/model_selection.Rmd b/vignettes/model_selection.Rmd new file mode 100644 index 0000000..1d78341 --- /dev/null +++ b/vignettes/model_selection.Rmd @@ -0,0 +1,177 @@ +--- +title: "Model selection " +output: + rmarkdown::html_document: + theme: flatly + toc: true + toc_float: true + toc_depth: 5 +vignette: > + %\VignetteIndexEntry{check_zero_inflation} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} + %\VignetteBuild{true} + +--- + + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + + + +```{r set_wd, include=FALSE} +dir <- "/Users/liuxin/macpie_Dev/" +devtools::load_all(paste0(dir, "macpie/")) +``` + +```{r setup} +#install.packages("macpie") # or devtools::install_github("PMCC-BioinformaticsCore/macpie") +library(macpie) + +# Define project variables +project_name <- "PMMSq033" +project_metadata <- system.file("extdata/PMMSq033_metadata.csv", package = "macpie") +# Load metadata +metadata <- read_metadata(project_metadata) + +``` + +```{r load_data} +# Import raw data +project_rawdata <- paste0(dir, "/macpieData/PMMSq033/raw_matrix") +project_name <- "PMMSq033" +raw_counts <- Read10X(data.dir = project_rawdata) +# Create tidySeurat object +mac <- CreateSeuratObject(counts = raw_counts, + project = project_name, + min.cells = 1, + min.features = 1) +# Join with metadata +mac <- mac %>% + inner_join(metadata, by = c(".cell" = "Barcode")) +# Add unique identifier +mac <- mac %>% + mutate(combined_id = str_c(Compound_ID, Concentration_1, sep = "_")) %>% + mutate(combined_id = gsub(" ", "", .data$combined_id)) +mac <- mac %>% + filter(Project == "Current") + +``` + +Filter genes with very low counts across all samples. This step is important because genes with extremely low expression can lead to unreliable estimates of dispersion and expected zero probabilities, which can skew the zero-inflation assessment. + +```{r} +# Filter by read count per sample group +mac <- filter_genes_by_expression(mac, + group_by = "combined_id", + min_counts = 10, + min_samples = 2) +``` + + +## Subsample genes for faster computation + +This is to randomly select a subset of 1000 genes for a quick check. For a more comprehensive analysis, consider using a larger subset. + +```{r} +# Subsample genes for faster computation +sub_mac <- subsample_genes(mac, ngene = 1000, gselect = "random", seed = 1) +sub_mac %>% nrow() +``` + + + + +## DMSO vs Camptothecin + +We can now run the zero-inflation check on the subsampled data. + +First, we will compare two treatment groups: "DMSO_0" and "CAMPTOTHECIN_10". + + +### Check for zero-inflation + + +```{r} +zi_results <- check_zeroinflation(sub_mac, + group_by = "combined_id", + samples = c("DMSO_0","CAMPTOTHECIN_10"), + batch = 1, + cutoffs = c(0.1, 0.20)) +zi_results$summary_by_group +``` + +From the summary table, we can see the summary statistics for each group, including the number and percentage of genes that are zero-inflated at the specified cutoffs. + +Each of the columns in the summary table are defined as follows: + + - group: The treatment group + + - n_genes: Total number of genes analyzed in the group + + - n_wells: Total number of wells/samples in the group + + - median_p0_obs: Median observed zero proportion across genes in the group + + - median_p0_nb: Median expected zero proportion under the NB model across genes in the group + + - median_ZI: Median zero-inflation index (ZI = p0_nb - p0_obs for each gene) across genes in the group + + - observed_zeros_num: Number of data points with observed zeros (it shouldn't be more than n_genes*n_wells for each group) + + - expected_zeros_num: Number of data points with expected zeros under the NB model (same here, it shouldn't be more than n_genes*n_wells for each group) + + - pct_ZI_gt_0.1: Percentage of genes with ZI greater than 0.1 + + - pct_ZI_gt_0.2: Percentage of genes with ZI greater than 0.2 + + + + +### RLE plots for visualisation + +RLE (Relative Log Expression) plots can be used here to visualize the distribution of gene expression levels across samples within each treatment/control group. It helps to identify any systematic biases or differences in expression distributions that may indicate RUV method s are needed for normalization. + +```{r} +de_groups <- subset(sub_mac, subset = combined_id %in% c("DMSO_0", "CAMPTOTHECIN_10")) +plot_rle(de_groups, label_column = "combined_id") +plot_rle(de_groups, label_column = "combined_id", normalisation = "limma_voom") +plot_rle(de_groups, label_column = "combined_id", normalisation = "DESeq2") +``` + + + + + +### DMSO vs Staruosporine + +Next, we will compare two treatment groups: "DMSO_0" and "Staurosporine_10". As staurosporine is a potent inducer of apoptosis, we might expect to see more zero-inflation in this group compared to DMSO. + + +```{r} +# Check for zero-inflation +zi_results <- check_zeroinflation(sub_mac, + group_by = "combined_id", + samples = c("DMSO_0","Staurosporine_10"), + batch = 1, + cutoffs = c(0.1, 0.20)) +``` + +```{r} +zi_results$summary_by_group +``` + + +```{r} +#select DMSO and Staurosporine groups +de_groups <- subset(sub_mac, subset = combined_id %in% c("DMSO_0", "Staurosporine_10")) +plot_rle(de_groups, label_column = "combined_id") +plot_rle(de_groups, label_column = "combined_id", normalisation = "limma_voom") +plot_rle(de_groups, label_column = "combined_id", normalisation = "DESeq2") +# plot_rle(de_groups, label_column = "combined_id", normalisation = "zinb") +``` \ No newline at end of file From 99eefc3ee084b904e094d3c58cee8c196fa55b1a Mon Sep 17 00:00:00 2001 From: Xin Liu Date: Tue, 21 Oct 2025 14:35:47 +1100 Subject: [PATCH 05/16] add R/select_good_controls.R --- R/select_good_controls.R | 147 ++++++++++++++++++++++++++++++++++++ man/select_good_controls.Rd | 94 +++++++++++++++++++++++ 2 files changed, 241 insertions(+) create mode 100644 R/select_good_controls.R create mode 100644 man/select_good_controls.Rd diff --git a/R/select_good_controls.R b/R/select_good_controls.R new file mode 100644 index 0000000..2267133 --- /dev/null +++ b/R/select_good_controls.R @@ -0,0 +1,147 @@ +#' Select high-quality control replicates via TMMwsp log-CPM correlation +#' +#' @description +#' For a given control group (e.g., DMSO) on a specific plate/batch, this function +#' ranks samples by their average correlation (Fisher z–averaged) to all *other* +#' samples using edgeR's TMMwsp-normalized log2-CPM. It returns the ranking and (optionally) +#' plots per-sample expression distributions and sample–sample correlation heatmaps. +#' +#' @param data A tidyseurat object containing an RNA assay with a **counts** layer. +#' @param combined_id Character scalar: the control/treatment label to keep +#' (e.g., `"CB_43_EP73_0"`). Only cells/samples with this label are considered. +#' @param orig_ident Character scalar: the plate/batch identifier to keep +#' (e.g., `"VH02012942"`). Only cells/samples from this batch are considered. +#' @param cpm_filter Numeric scalar; CPM threshold used for gene filtering prior to +#' normalization (default `1`). +#' @param min_samps Integer; a gene must be expressed (CPM > `cpm_filter`) in at least +#' this many samples to be retained (default `16`). +#' @param corr_method Correlation type used for ranking; one of +#' `c("spearman","pearson")` (default `"spearman"`). +#' @param top_n Integer; the number of top-ranked samples to report in `topN`. +#' Ties at the cutoff are kept (default `5`). +#' @param make_plots Logical; if `TRUE`, print a log2-CPM boxplot and Pearson/Spearman +#' correlation heatmaps (default `TRUE`). +#' +#' @details +#' Workflow: +#' 1) Subset to the specified `combined_id` **and** `orig_ident` (plate/batch). +#' 2) Build an `edgeR::DGEList`, filter lowly expressed genes using CPM and `min_samps`. +#' 3) Normalize with **TMMwsp** and compute log2-CPM. +#' 4) Rank samples by mean Fisher z–transformed correlation to all *other* samples +#' (according to `corr_method`). +#' 5) Return the ranking, correlation matrices, the normalized matrix, and (optionally) +#' plots for QC. +#' +#' Column names of the counts matrix are rewritten to `"_"` +#' for easier visual inspection in plots. +#' +#' @return A list with elements: +#' \item{subset_obj}{The Seurat object subset used for analysis.} +#' \item{dge}{The filtered `edgeR::DGEList`.} +#' \item{log_cpm_tmm}{Matrix of TMMwsp log2-CPM (genes × samples).} +#' \item{boxplot_df}{Long-format data frame used for the boxplot (`gene`, `sample`, `log_cpm`).} +#' \item{cor_pearson}{Sample–sample Pearson correlation matrix.} +#' \item{cor_spearman}{Sample–sample Spearman correlation matrix.} +#' \item{ranking_method}{The correlation method used for ranking.} +#' \item{scores_mean_to_others}{Named numeric vector of mean Fisher-z back-transformed +#' correlations (higher = better), sorted decreasing.} +#' \item{topN}{Named numeric vector of the top-ranked samples (ties at the cutoff kept).} +#' +#' @examples +#' data(mini_mac) +#' res <- select_good_controls(mini_mac,combined_id = "DMSO_0", orig_ident = "PMMSq033_mini") +#' +#' +#' +#' @importFrom edgeR DGEList calcNormFactors cpm +#' @importFrom tibble rownames_to_column +#' @importFrom tidyr pivot_longer +#' @export + +select_good_controls <- function( + data, + combined_id, # e.g. "CB_43_EP73_0" + orig_ident, # e.g. "VH02012942" + cpm_filter = 1, # CPM threshold for gene filtering + min_samps = 16, # number of samples a gene must be expressed in + corr_method = c("spearman","pearson"), + top_n = 5, + make_plots = TRUE +){ + corr_method <- match.arg(corr_method) + sel_cells <- colnames(data)[data$combined_id == combined_id & + data$orig.ident == orig_ident] + if (length(sel_cells) == 0L) { + stop("No cells/samples match the specified 'combined_id' and 'orig_ident'.") + } + subgroup <- subset(data, cells = sel_cells) + # Counts and human-friendly column names + counts_d <- Seurat::GetAssayData(subgroup, assay = "RNA", layer = "counts") + well_colnames <- paste0(subgroup$orig.ident, "_", subgroup$Well_ID) + names(well_colnames) <- rownames(subgroup@meta.data) + colnames(counts_d) <- well_colnames[colnames(counts_d)] + # edgeR container + gene filtering + y <- edgeR::DGEList(counts_d, group = subgroup$orig.ident) + keep <- rowSums(edgeR::cpm(y) > cpm_filter) >= min_samps + y <- y[keep, , keep.lib.sizes = FALSE] + y <- edgeR::calcNormFactors(y, method = "TMMwsp") + log_cpm_tmm <- edgeR::cpm(y, log = TRUE, normalized.lib.sizes = TRUE) + # Long data for boxplot + df_long <- as.data.frame(log_cpm_tmm) |> + tibble::rownames_to_column(var = "gene") |> + tidyr::pivot_longer(-gene, names_to = "sample", values_to = "log_cpm") + if (make_plots) { + if (!requireNamespace("ggplot2", quietly = TRUE)) { + warning("Package 'ggplot2' not available; skipping boxplot.") + } else { + print( + ggplot2::ggplot(df_long, ggplot2::aes(x = sample, y = log_cpm)) + + ggplot2::geom_boxplot(outlier.size = 0.5) + + ggplot2::labs(x = "Sample", y = "log2 CPM", + title = "Boxplot of log2-CPM (TMMwsp)") + + ggplot2::theme_classic() + + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)) + ) + } + } + # Correlation matrices + cors_pear <- stats::cor(log_cpm_tmm, method = "pearson") + cors_spear <- stats::cor(log_cpm_tmm, method = "spearman") + if (make_plots) { + if (!requireNamespace("pheatmap", quietly = TRUE)) { + warning("Package 'pheatmap' not available; skipping heatmaps.") + } else { + pheatmap::pheatmap(cors_pear, main = "Sample–sample correlation (Pearson, log2-CPM)") + pheatmap::pheatmap(cors_spear, main = "Sample–sample correlation (Spearman, log2-CPM)") + } + } + # Ranking by mean Fisher-z correlation to all *other* samples + R <- stats::cor(log_cpm_tmm, method = corr_method, use = "pairwise.complete.obs") + diag(R) <- NA_real_ + # Clip to (-1,1), Fisher z-transform, average, back-transform + Z <- atanh(pmin(pmax(R, -0.999999), 0.999999)) + score_z <- rowMeans(Z, na.rm = TRUE) + score_r <- tanh(score_z) + # Top-N names and scores (keep ties at the cutoff) + ord <- order(score_r, decreasing = TRUE, na.last = NA) + srt <- score_r[ord] + if (length(srt) == 0L) { + stop("No samples available after filtering; adjust 'cpm_filter'/'min_samps'.") + } + k <- min(top_n, length(srt)) + cutoff <- srt[k] + keepN <- srt >= cutoff + topN <- srt[keepN] + # Return everything useful + list( + subset_obj = subgroup, + dge = y, + log_cpm_tmm = log_cpm_tmm, + boxplot_df = df_long, + cor_pearson = cors_pear, + cor_spearman = cors_spear, + ranking_method = corr_method, + scores_mean_to_others = sort(score_r, decreasing = TRUE), + topN = topN + ) +} diff --git a/man/select_good_controls.Rd b/man/select_good_controls.Rd new file mode 100644 index 0000000..1e5827e --- /dev/null +++ b/man/select_good_controls.Rd @@ -0,0 +1,94 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/select_good_controls.R +\name{select_good_controls} +\alias{select_good_controls} +\title{Select high-quality control replicates via TMMwsp log-CPM correlation} +\usage{ +select_good_controls( + data, + combined_id, + orig_ident, + cpm_filter = 1, + min_samps = 16, + corr_method = c("spearman", "pearson"), + top_n = 5, + make_plots = TRUE +) +} +\arguments{ +\item{data}{A tidyseurat object containing an RNA assay with a \strong{counts} layer.} + +\item{combined_id}{Character scalar: the control/treatment label to keep +(e.g., \code{"CB_43_EP73_0"}). Only cells/samples with this label are considered.} + +\item{orig_ident}{Character scalar: the plate/batch identifier to keep +(e.g., \code{"VH02012942"}). Only cells/samples from this batch are considered.} + +\item{cpm_filter}{Numeric scalar; CPM threshold used for gene filtering prior to +normalization (default \code{1}).} + +\item{min_samps}{Integer; a gene must be expressed (CPM > \code{cpm_filter}) in at least +this many samples to be retained (default \code{16}).} + +\item{corr_method}{Correlation type used for ranking; one of +\code{c("spearman","pearson")} (default \code{"spearman"}).} + +\item{top_n}{Integer; the number of top-ranked samples to report in \code{topN}. +Ties at the cutoff are kept (default \code{5}).} + +\item{make_plots}{Logical; if \code{TRUE}, print a log2-CPM boxplot and Pearson/Spearman +correlation heatmaps (default \code{TRUE}).} +} +\value{ +A list with elements: +\item{subset_obj}{The Seurat object subset used for analysis.} +\item{dge}{The filtered \code{edgeR::DGEList}.} +\item{log_cpm_tmm}{Matrix of TMMwsp log2-CPM (genes × samples).} +\item{boxplot_df}{Long-format data frame used for the boxplot (\code{gene}, \code{sample}, \code{log_cpm}).} +\item{cor_pearson}{Sample–sample Pearson correlation matrix.} +\item{cor_spearman}{Sample–sample Spearman correlation matrix.} +\item{ranking_method}{The correlation method used for ranking.} +\item{scores_mean_to_others}{Named numeric vector of mean Fisher-z back-transformed +correlations (higher = better), sorted decreasing.} +\item{topN}{Named numeric vector of the top-ranked samples (ties at the cutoff kept).} +} +\description{ +For a given control group (e.g., DMSO) on a specific plate/batch, this function +ranks samples by their average correlation (Fisher z–averaged) to all \emph{other} +samples using edgeR's TMMwsp-normalized log2-CPM. It returns the ranking and (optionally) +plots per-sample expression distributions and sample–sample correlation heatmaps. +} +\details{ +Workflow: +\enumerate{ +\item Subset to the specified \code{combined_id} \strong{and} \code{orig_ident} (plate/batch). +\item Build an \code{edgeR::DGEList}, filter lowly expressed genes using CPM and \code{min_samps}. +\item Normalize with \strong{TMMwsp} and compute log2-CPM. +\item Rank samples by mean Fisher z–transformed correlation to all \emph{other} samples +(according to \code{corr_method}). +\item Return the ranking, correlation matrices, the normalized matrix, and (optionally) +plots for QC. +} + +Column names of the counts matrix are rewritten to \code{"_"} +for easier visual inspection in plots. +} +\examples{ +\dontrun{ +res <- select_good_controls( + data = seu, + combined_id = "CB_43_EP73_0", + orig_ident = "VH02012942", + cpm_filter = 1, + min_samps = 16, + corr_method = "spearman", + top_n = 5, + make_plots = TRUE +) +names(res$topN) +} + +} +\seealso{ +\code{\link[edgeR:calcNormFactors]{edgeR::calcNormFactors()}}, \code{\link[edgeR:cpm]{edgeR::cpm()}} +} From 01a08943f047f727b74f33832cd1429a3d905638 Mon Sep 17 00:00:00 2001 From: Xin Liu Date: Wed, 22 Oct 2025 10:18:37 +1100 Subject: [PATCH 06/16] add --- vignettes/assessing_zero_inflation.Rmd | 328 +++++++++++++++++++++++++ 1 file changed, 328 insertions(+) create mode 100644 vignettes/assessing_zero_inflation.Rmd diff --git a/vignettes/assessing_zero_inflation.Rmd b/vignettes/assessing_zero_inflation.Rmd new file mode 100644 index 0000000..36adc17 --- /dev/null +++ b/vignettes/assessing_zero_inflation.Rmd @@ -0,0 +1,328 @@ +--- +title: "Assessing zero-inflation in your data" +output: + rmarkdown::html_document: + theme: flatly + toc: true + toc_float: true + toc_depth: 5 +vignette: > + %\VignetteIndexEntry{assessing_zero_inflation} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} + %\VignetteBuild{true} + +--- + + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + digits = 3 +) +options(digits = 3) +``` + + + +## Overview + +This is a quick demonstration of the `check_zeroinflation()` function from our **macpie** package. This function is a fast diagnostic tool to help you assess whether your MACseq data exhibits zero-inflation relative to a negative binomial (NB) model. + +We use a lightweight convenience wrapper `subsample_genes()` around **`seqgendiff::select_counts()`** to create a smaller object with a subset of genes for faster computation. This function is also part of the **macpie** package. + + +Under the hood, `check_zeroinflation()` uses **edgeR** to estimate gene-wise dispersions and compute expected zero probabilities under a NB model. It then compares the observed and expected zero-inflated indexes for each gene within each group defined in the metadata. + +This function: + + - estimates gene-wise tagwise dispersions with edgeR (using all selected groups), + + - builds NB-expected zero probabilities from TMMwsp-scaled means, and + + - returns per-gene ZI (observed zeros minus NB-expected zeros) and per-group summaries (e.g., % genes with ZI > 0.05). ZI-cutoffs are user-defined. + + +The output is a list with two elements: + + - `summary_by_group`: A summary table showing the number and percentage of genes that are zero-inflated for each group at specified cutoffs. + + - `gene_metrics_by_group`: A detailed table with gene-wise metrics, including observed and expected zero numbers and proportions, zero-inflation index (ZI), mean count, and dispersion estimates for each gene in each group. + +**Note**: `check_zeroinflation()` relies on edgeR to estimate dispersion. The current implementation requires ≥2 groups in the design so that edgeR can stabilize gene-wise dispersions across groups. If you only have a single group and still want a design-aware baseline for expected zeros, fit a Gamma–Poisson/NB GLM and compute the expected zero probabilities from its fitted means and over-dispersion. + + +*** +
+ +```{r set_wd, include=FALSE} +dir <- "/Users/liuxin/macpie_Dev/" +devtools::load_all(paste0(dir, "macpie/")) +``` + + +Load data and preprocess before subsampling genes and checking for zero-inflation. + +```{r setup} +#install.packages("macpie") # or devtools::install_github("PMCC-BioinformaticsCore/macpie") +library(macpie) +library(seqgendiff) +library(ggplot2) +library(cowplot) + +# Define project variables +project_name <- "PMMSq033" +project_metadata <- system.file("extdata/PMMSq033_metadata.csv", package = "macpie") +# Load metadata +metadata <- read_metadata(project_metadata) + +``` + +```{r load_data} +# Import raw data +project_rawdata <- paste0(dir, "/macpieData/PMMSq033/raw_matrix") +project_name <- "PMMSq033" +raw_counts <- Read10X(data.dir = project_rawdata) +# Create tidySeurat object +mac <- CreateSeuratObject(counts = raw_counts, + project = project_name, + min.cells = 1, + min.features = 1) +# Join with metadata +mac <- mac %>% + inner_join(metadata, by = c(".cell" = "Barcode")) +# Add unique identifier +mac <- mac %>% + mutate(combined_id = str_c(Compound_ID, Concentration_1, sep = "_")) %>% + mutate(combined_id = gsub(" ", "", .data$combined_id)) +mac <- mac %>% + filter(Project == "Current") + +``` + + +## Subsample genes without filtering + +We first subsample genes without filtering to see the zero-inflation results before filtering lowly expressed genes. + +This is to randomly select a subset of 1000 genes for a quick check. For a more comprehensive analysis, consider using a larger subset. + +```{r} +# Subsample genes for faster computation +sub_mac_unfiltered <- subsample_genes(mac, ngene = 1000, gselect = "random", seed = 1) +sub_mac_unfiltered %>% nrow() +``` + + +We now look at zero-inflation across high dose treatment groups and DMSO control. + +```{r} +# Check for zero-inflation +high_doses <- grep("_10$", unique(sub_mac_unfiltered$combined_id), value = TRUE) +#add DMSO_0 +high_doses <- c(high_doses, "DMSO_0") +zi_results_unfiltered <- check_zeroinflation(sub_mac_unfiltered, + group_by = "combined_id", + samples = high_doses, + batch = 1, + cutoffs = c(0.1, 0.2)) +``` + +#### View gene-wise metrics for each group + +For example, view the first few rows for DMSO_0 group: + +```{r} +zi_results_unfiltered$gene_metrics_by_group %>% filter(group=="DMSO_0") %>% head() + +``` +And for Staurosporine_10 group: + +```{r} +zi_results_unfiltered$gene_metrics_by_group %>% filter(group=="Staurosporine_10") %>% head() +``` + + + +#### View summary statistics for each group + +```{r} +zi_results_unfiltered$summary_by_group %>% head(10) +``` + + +From the summary table, we can see the summary statistics for each group, including the number and percentage of genes that are zero-inflated at the specified cutoffs. + +Each of the columns in the summary table are defined as follows: + + - group: The treatment group + + - n_genes: Total number of genes analyzed in the group + + - n_wells: Total number of wells/samples in the group + + - mean_p0_obs: mean observed zero proportion across genes in the group + + - mean_p0_nb: mean expected zero proportion under the NB model across genes in the group + + - mean_ZI: mean zero-inflation index (ZI = p0_obs - p0_nb for each gene) across genes in the group + + - observed_zeros_num: Number of data points with observed zeros (it shouldn't be more than n_genes*n_wells for each group) + + - expected_zeros_num: Number of data points with expected zeros under the NB model (same here, it shouldn't be more than n_genes*n_wells for each group) + + - pct_ZI_gt_0.1: Percentage of genes with ZI greater than 0.1 + + - pct_ZI_gt_0.2: Percentage of genes with ZI greater than 0.2 + + +Visualisation of zero-inflation metrics for high dose treatment groups and DMSO control. + +```{r, fig.height=10, fig.width=8} +high_doses_zi_results_unfiltered <- zi_results_unfiltered$summary_by_group %>% filter(grepl("_10$", group)) +#concatenate with DMSO +high_doses_zi_results_unfiltered <- rbind(high_doses_zi_results_unfiltered, + zi_results_unfiltered$summary_by_group %>% filter(group=="DMSO_0")) + + +long_zi_results_unfiltered <- high_doses_zi_results_unfiltered %>% select(group, mean_p0_obs, mean_p0_nb, mean_ZI) %>% + pivot_longer(cols = c(mean_p0_obs, mean_p0_nb, mean_ZI), + names_to = "metric", + values_to = "value") + +# rank groups by mean_p0_obs +long_zi_results_unfiltered$group <- factor(long_zi_results_unfiltered$group, + levels = high_doses_zi_results_unfiltered$group[order(-high_doses_zi_results_unfiltered$mean_p0_obs)]) + +#only show mean_p0_obs and mean_p0_nb +long_zi_results_unfiltered_prop <- long_zi_results_unfiltered %>% filter(metric %in% c("mean_p0_obs", "mean_p0_nb")) + + +p1 <- ggplot(long_zi_results_unfiltered_prop, aes(x = group, y = value, fill = metric)) + + geom_bar(stat = "identity", position = "dodge") + + labs(title = "Zero-Inflation Metrics by Treatment Groups (10uM) and DMSO \nmacseq-unfiltered", + x = "Groups", + y = "zero-inflation proportion") + + theme_minimal() + + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + + scale_fill_manual(values = macpie_colours$discrete[1:3]) + +# show ZI separately +p2 <- ggplot(long_zi_results_unfiltered %>% filter(metric=="mean_ZI"), aes(x = group, y = value, fill = metric)) + + geom_bar(stat = "identity", position = "dodge") + + labs(title = "Mean Zero-Inflation Index (ZI) by Treatment Groups (10uM) and DMSO \nmacseq-unfiltered", + x = "Groups", + y = "Mean Zero-Inflation Index (ZI)") + + theme_minimal() + + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + + scale_fill_manual(values = macpie_colours$discrete[1]) + + +plot_grid(p1, p2, ncol = 1) + +``` + +Ranking of groups by mean observed zero proportion (mean_p0_obs) shows that Staurosporine_10 has the highest observed zero proportion, followed by Camptothecin_10, and paclitaxel_10. We can see staurosporine_10 group has the highest mean ZI value and highest proportion of zeros among all treatment groups, indicating significant zero-inflation in this group. Other treatment groups show small or even negative mean ZI values, suggesting no significant zero-inflation. As staurosporine is a cell death control, which it is expected to see zero-inflated data in this group. + +DMSO control group with ~60% of zeros and small mean ZI value, indicating no significant zero-inflation in the control group. + + + +## Subsample genes after filtering lowly expressed genes + +Filter genes with very low counts across all samples. This step is important because genes with extremely low expression can lead to unreliable estimates of dispersion and expected zero probabilities, which can skew the zero-inflation assessment. + +Here we filter genes that have at least 10 counts in at least 2 samples within each group defined by `combined_id`. + +```{r} +# Filter by read count per sample group +mac <- filter_genes_by_expression(mac, + group_by = "combined_id", + min_counts = 10, + min_samples = 2) +``` + + +### Subsample genes for faster computation + +This is to randomly select a subset of 1000 genes for a quick check. For a more comprehensive analysis, consider using a larger subset. + +```{r} +# Subsample genes for faster computation +sub_mac <- subsample_genes(mac, ngene = 1000, gselect = "random", seed = 1) +sub_mac %>% nrow() +``` + + +```{r} +# Check for zero-inflation +all_conditions <- unique(sub_mac$combined_id) +zi_results <- check_zeroinflation(sub_mac, + group_by = "combined_id", + samples = all_conditions, + batch = 1, + cutoffs = c(0.1, 0.2)) +``` + +#### View gene-wise metrics for each group + +```{r} +zi_results$gene_metrics_by_group %>% filter(group=="Luminespib_10") %>% head() + +``` + + +```{r} +zi_results$summary_by_group %>% head(10) +``` + +```{r, fig.height=10, fig.width=8} +high_doses_zi_results <- zi_results$summary_by_group %>% filter(grepl("_10$", group)) +#concatenate with DMSO +high_doses_zi_results <- rbind(high_doses_zi_results, + zi_results$summary_by_group %>% filter(group=="DMSO_0")) + + +long_zi_results <- high_doses_zi_results %>% select(group, mean_p0_obs, mean_p0_nb, mean_ZI) %>% + pivot_longer(cols = c(mean_p0_obs, mean_p0_nb, mean_ZI), + names_to = "metric", + values_to = "value") + +# rank groups by mean_p0_obs +long_zi_results$group <- factor(long_zi_results$group, + levels = high_doses_zi_results$group[order(-high_doses_zi_results$mean_p0_obs)]) + +#only show mean_p0_obs and mean_p0_nb +long_zi_results_prop <- long_zi_results %>% filter(metric %in% c("mean_p0_obs", "mean_p0_nb")) + + +p1 <- ggplot(long_zi_results_prop, aes(x = group, y = value, fill = metric)) + + geom_bar(stat = "identity", position = "dodge") + + labs(title = "Zero-Inflation Metrics by Treatment Groups (10uM) and DMSO \nmacseq-filtered", + x = "Groups", + y = "zero-inflation proportion") + + theme_minimal() + + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + + scale_fill_manual(values = macpie_colours$discrete[1:3]) + +# show ZI separately +p2 <- ggplot(long_zi_results %>% filter(metric=="mean_ZI"), aes(x = group, y = value, fill = metric)) + + geom_bar(stat = "identity", position = "dodge") + + labs(title = "Mean Zero-Inflation Index (ZI) by Treatment Groups (10uM) and DMSO \nmacseq-filtered", + x = "Groups", + y = "Mean Zero-Inflation Index (ZI)") + + theme_minimal() + + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + + scale_fill_manual(values = macpie_colours$discrete[1]) + + +plot_grid(p1, p2, ncol = 1) +``` + +After filtering lowly expressed genes, we observe proportions of both expected and observed zeros decrease across all treatment groups compared to the unfiltered data. These results suggest that lowly expressed genes contribute significantly to the overall zero counts or sparsity in the data. + +DMSO control group has the lowest observed and expected zero proportions among all groups, and staurosporine_10 has the highest observed and expected zero proportions. These findings are expected as these are vehicle and positive controls. + + + From bfb2d157ea0d6eb4fad4a70cd1cd44e748cf38f3 Mon Sep 17 00:00:00 2001 From: Xin Liu Date: Wed, 22 Oct 2025 15:16:31 +1100 Subject: [PATCH 07/16] change select_good_controls.R to select_robust_controls.R --- ...od_controls.R => select_robust_controls.R} | 4 +- man/select_robust_controls.Rd | 82 +++++++++++++++++++ 2 files changed, 84 insertions(+), 2 deletions(-) rename R/{select_good_controls.R => select_robust_controls.R} (98%) create mode 100644 man/select_robust_controls.Rd diff --git a/R/select_good_controls.R b/R/select_robust_controls.R similarity index 98% rename from R/select_good_controls.R rename to R/select_robust_controls.R index 2267133..b250c93 100644 --- a/R/select_good_controls.R +++ b/R/select_robust_controls.R @@ -49,7 +49,7 @@ #' #' @examples #' data(mini_mac) -#' res <- select_good_controls(mini_mac,combined_id = "DMSO_0", orig_ident = "PMMSq033_mini") +#' res <- select_robust_controls(mini_mac,combined_id = "DMSO_0", orig_ident = "PMMSq033_mini") #' #' #' @@ -58,7 +58,7 @@ #' @importFrom tidyr pivot_longer #' @export -select_good_controls <- function( +select_robust_controls <- function( data, combined_id, # e.g. "CB_43_EP73_0" orig_ident, # e.g. "VH02012942" diff --git a/man/select_robust_controls.Rd b/man/select_robust_controls.Rd new file mode 100644 index 0000000..0d4de26 --- /dev/null +++ b/man/select_robust_controls.Rd @@ -0,0 +1,82 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/select_robust_controls.R +\name{select_robust_controls} +\alias{select_robust_controls} +\title{Select high-quality control replicates via TMMwsp log-CPM correlation} +\usage{ +select_robust_controls( + data, + combined_id, + orig_ident, + cpm_filter = 1, + min_samps = 16, + corr_method = c("spearman", "pearson"), + top_n = 5, + make_plots = TRUE +) +} +\arguments{ +\item{data}{A tidyseurat object containing an RNA assay with a \strong{counts} layer.} + +\item{combined_id}{Character scalar: the control/treatment label to keep +(e.g., \code{"CB_43_EP73_0"}). Only cells/samples with this label are considered.} + +\item{orig_ident}{Character scalar: the plate/batch identifier to keep +(e.g., \code{"VH02012942"}). Only cells/samples from this batch are considered.} + +\item{cpm_filter}{Numeric scalar; CPM threshold used for gene filtering prior to +normalization (default \code{1}).} + +\item{min_samps}{Integer; a gene must be expressed (CPM > \code{cpm_filter}) in at least +this many samples to be retained (default \code{16}).} + +\item{corr_method}{Correlation type used for ranking; one of +\code{c("spearman","pearson")} (default \code{"spearman"}).} + +\item{top_n}{Integer; the number of top-ranked samples to report in \code{topN}. +Ties at the cutoff are kept (default \code{5}).} + +\item{make_plots}{Logical; if \code{TRUE}, print a log2-CPM boxplot and Pearson/Spearman +correlation heatmaps (default \code{TRUE}).} +} +\value{ +A list with elements: +\item{subset_obj}{The Seurat object subset used for analysis.} +\item{dge}{The filtered \code{edgeR::DGEList}.} +\item{log_cpm_tmm}{Matrix of TMMwsp log2-CPM (genes × samples).} +\item{boxplot_df}{Long-format data frame used for the boxplot (\code{gene}, \code{sample}, \code{log_cpm}).} +\item{cor_pearson}{Sample–sample Pearson correlation matrix.} +\item{cor_spearman}{Sample–sample Spearman correlation matrix.} +\item{ranking_method}{The correlation method used for ranking.} +\item{scores_mean_to_others}{Named numeric vector of mean Fisher-z back-transformed +correlations (higher = better), sorted decreasing.} +\item{topN}{Named numeric vector of the top-ranked samples (ties at the cutoff kept).} +} +\description{ +For a given control group (e.g., DMSO) on a specific plate/batch, this function +ranks samples by their average correlation (Fisher z–averaged) to all \emph{other} +samples using edgeR's TMMwsp-normalized log2-CPM. It returns the ranking and (optionally) +plots per-sample expression distributions and sample–sample correlation heatmaps. +} +\details{ +Workflow: +\enumerate{ +\item Subset to the specified \code{combined_id} \strong{and} \code{orig_ident} (plate/batch). +\item Build an \code{edgeR::DGEList}, filter lowly expressed genes using CPM and \code{min_samps}. +\item Normalize with \strong{TMMwsp} and compute log2-CPM. +\item Rank samples by mean Fisher z–transformed correlation to all \emph{other} samples +(according to \code{corr_method}). +\item Return the ranking, correlation matrices, the normalized matrix, and (optionally) +plots for QC. +} + +Column names of the counts matrix are rewritten to \code{"_"} +for easier visual inspection in plots. +} +\examples{ +data(mini_mac) +res <- select_robust_controls(mini_mac,combined_id = "DMSO_0", orig_ident = "PMMSq033_mini") + + + +} From f1c30a2bf4b554d5a3fa8753db757933b299609d Mon Sep 17 00:00:00 2001 From: Xin Liu Date: Wed, 22 Oct 2025 15:19:51 +1100 Subject: [PATCH 08/16] fix bug in R/check_zeroinflation.R --- R/check_zeroinflation.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/check_zeroinflation.R b/R/check_zeroinflation.R index 84ab865..d2f4af5 100644 --- a/R/check_zeroinflation.R +++ b/R/check_zeroinflation.R @@ -157,9 +157,9 @@ check_zeroinflation <- function(data = NULL, group = unique(df$group), n_genes = nrow(df), n_wells = sum(combined_id == unique(df$group)), - median_p0_obs = median(df$p0_obs), - median_p0_nb = median(df$p0_nb), - median_ZI = median(df$ZI), + mean_p0_obs = mean(df$p0_obs), + mean_p0_nb = mean(df$p0_nb), + mean_ZI = mean(df$ZI), observed_zeros_num = sum(df$obs_zeros_num), expected_zeros_num = sum(df$expected_zeros_num) ) From 3c6d6a1af773f140c875c85d5078763cad574e2d Mon Sep 17 00:00:00 2001 From: Xin Liu Date: Wed, 22 Oct 2025 16:41:42 +1100 Subject: [PATCH 09/16] remove testing vignettes --- man/select_good_controls.Rd | 94 - vignettes/DRUGseq_DE.Rmd | 1751 --------- vignettes/DRUGseq_DE.html | 4347 ----------------------- vignettes/DRUGseq_first500runs.Rmd | 652 ---- vignettes/DRUGseq_first500runs.html | 2176 ------------ vignettes/DRUGseq_first500runs.knit.log | 2 - vignettes/benchmark_DMSOruntime.Rmd | 476 --- vignettes/benchmark_DMSOruntime.html | 2016 ----------- vignettes/benchmark_DMSOselection.Rmd | 422 --- vignettes/benchmark_DMSOselection.html | 2017 ----------- vignettes/check_zero_inflation.Rmd | 411 --- vignettes/check_zero_inflation.html | 2051 ----------- vignettes/macpie_short.Rmd | 408 --- vignettes/model_selection.Rmd | 177 - 14 files changed, 17000 deletions(-) delete mode 100644 man/select_good_controls.Rd delete mode 100644 vignettes/DRUGseq_DE.Rmd delete mode 100644 vignettes/DRUGseq_DE.html delete mode 100644 vignettes/DRUGseq_first500runs.Rmd delete mode 100644 vignettes/DRUGseq_first500runs.html delete mode 100644 vignettes/DRUGseq_first500runs.knit.log delete mode 100644 vignettes/benchmark_DMSOruntime.Rmd delete mode 100644 vignettes/benchmark_DMSOruntime.html delete mode 100644 vignettes/benchmark_DMSOselection.Rmd delete mode 100644 vignettes/benchmark_DMSOselection.html delete mode 100644 vignettes/check_zero_inflation.Rmd delete mode 100644 vignettes/check_zero_inflation.html delete mode 100644 vignettes/macpie_short.Rmd delete mode 100644 vignettes/model_selection.Rmd diff --git a/man/select_good_controls.Rd b/man/select_good_controls.Rd deleted file mode 100644 index 1e5827e..0000000 --- a/man/select_good_controls.Rd +++ /dev/null @@ -1,94 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/select_good_controls.R -\name{select_good_controls} -\alias{select_good_controls} -\title{Select high-quality control replicates via TMMwsp log-CPM correlation} -\usage{ -select_good_controls( - data, - combined_id, - orig_ident, - cpm_filter = 1, - min_samps = 16, - corr_method = c("spearman", "pearson"), - top_n = 5, - make_plots = TRUE -) -} -\arguments{ -\item{data}{A tidyseurat object containing an RNA assay with a \strong{counts} layer.} - -\item{combined_id}{Character scalar: the control/treatment label to keep -(e.g., \code{"CB_43_EP73_0"}). Only cells/samples with this label are considered.} - -\item{orig_ident}{Character scalar: the plate/batch identifier to keep -(e.g., \code{"VH02012942"}). Only cells/samples from this batch are considered.} - -\item{cpm_filter}{Numeric scalar; CPM threshold used for gene filtering prior to -normalization (default \code{1}).} - -\item{min_samps}{Integer; a gene must be expressed (CPM > \code{cpm_filter}) in at least -this many samples to be retained (default \code{16}).} - -\item{corr_method}{Correlation type used for ranking; one of -\code{c("spearman","pearson")} (default \code{"spearman"}).} - -\item{top_n}{Integer; the number of top-ranked samples to report in \code{topN}. -Ties at the cutoff are kept (default \code{5}).} - -\item{make_plots}{Logical; if \code{TRUE}, print a log2-CPM boxplot and Pearson/Spearman -correlation heatmaps (default \code{TRUE}).} -} -\value{ -A list with elements: -\item{subset_obj}{The Seurat object subset used for analysis.} -\item{dge}{The filtered \code{edgeR::DGEList}.} -\item{log_cpm_tmm}{Matrix of TMMwsp log2-CPM (genes × samples).} -\item{boxplot_df}{Long-format data frame used for the boxplot (\code{gene}, \code{sample}, \code{log_cpm}).} -\item{cor_pearson}{Sample–sample Pearson correlation matrix.} -\item{cor_spearman}{Sample–sample Spearman correlation matrix.} -\item{ranking_method}{The correlation method used for ranking.} -\item{scores_mean_to_others}{Named numeric vector of mean Fisher-z back-transformed -correlations (higher = better), sorted decreasing.} -\item{topN}{Named numeric vector of the top-ranked samples (ties at the cutoff kept).} -} -\description{ -For a given control group (e.g., DMSO) on a specific plate/batch, this function -ranks samples by their average correlation (Fisher z–averaged) to all \emph{other} -samples using edgeR's TMMwsp-normalized log2-CPM. It returns the ranking and (optionally) -plots per-sample expression distributions and sample–sample correlation heatmaps. -} -\details{ -Workflow: -\enumerate{ -\item Subset to the specified \code{combined_id} \strong{and} \code{orig_ident} (plate/batch). -\item Build an \code{edgeR::DGEList}, filter lowly expressed genes using CPM and \code{min_samps}. -\item Normalize with \strong{TMMwsp} and compute log2-CPM. -\item Rank samples by mean Fisher z–transformed correlation to all \emph{other} samples -(according to \code{corr_method}). -\item Return the ranking, correlation matrices, the normalized matrix, and (optionally) -plots for QC. -} - -Column names of the counts matrix are rewritten to \code{"_"} -for easier visual inspection in plots. -} -\examples{ -\dontrun{ -res <- select_good_controls( - data = seu, - combined_id = "CB_43_EP73_0", - orig_ident = "VH02012942", - cpm_filter = 1, - min_samps = 16, - corr_method = "spearman", - top_n = 5, - make_plots = TRUE -) -names(res$topN) -} - -} -\seealso{ -\code{\link[edgeR:calcNormFactors]{edgeR::calcNormFactors()}}, \code{\link[edgeR:cpm]{edgeR::cpm()}} -} diff --git a/vignettes/DRUGseq_DE.Rmd b/vignettes/DRUGseq_DE.Rmd deleted file mode 100644 index 004dc20..0000000 --- a/vignettes/DRUGseq_DE.Rmd +++ /dev/null @@ -1,1751 +0,0 @@ ---- -title: "Benchmarking" -output: - rmarkdown::html_document: - theme: flatly - toc: true - toc_float: true - toc_depth: 5 -vignette: > - %\VignetteIndexEntry{benchmarking} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} - %\VignetteBuild{true} ---- - -```{r set_wd, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -dir <- "/Users/liuxin/macpie_Dev/" -devtools::load_all(paste0(dir, "macpie/")) - -``` - - -```{r setup} -library(macpie) -library(tibble) -library(stringr) -library(pheatmap) -library(ggiraph) -library(tidyseurat) -library(purrr) -library(DT) -library(ggrepel) -library(ggalluvial) -library(ggplot2) -library(ggvenn) -library(limma) -``` - - -## DRUGseq results - - -```{r} -# batch24 <- readRDS(paste0(dir,"DRUGseqData/Exp_batch24.Rds")) -``` - -```{r} -# load(paste0(dir,"DRUGseqData/de.RData")) -# batch24_de <- de_res%>%filter(batch_id=="24") -# saveRDS(batch24_de, file = paste0(dir,"DRUGseqData/DE_batch24.Rds")) -batch24_de <- readRDS(paste0(dir,"DRUGseqData/DE_batch24.Rds")) -head(batch24_de) - -``` - -```{r} - -FF86_res <- batch24_de %>% filter(cmpd_sample_id=="FF-86-NH56") -ff86_res_toptable <- FF86_res[,13:18] -ff86_res_toptable <- ff86_res_toptable %>% - separate(gene.ID, into = c("gene", "chrom"), sep = ",") %>% - select(-chrom) %>% mutate(combined_id ="FF_86_NH56_10") %>% - rename(log2FC=logFC, p_value_adj = adj.P.Val) - -``` - - -```{r} -head(ff86_res_toptable) -``` - -```{r, fig.height=6} -plot_volcano(ff86_res_toptable, max.overlaps =5) -``` - -```{r} -ff86_res_toptable %>% filter(p_value_adj <0.05 & log2FC >0) %>% nrow() - -drugseq_deg <- ff86_res_toptable %>% filter(p_value_adj <0.05 & log2FC >0) %>% select(gene) %>% pull() -``` - -```{r} -#show histogram of p values -ggplot(ff86_res_toptable, aes(x = P.Value)) + - geom_histogram(binwidth = 0.01, fill = "blue", color = "black", alpha = 0.7) + - labs(title = "Histogram of Adjusted P-values", x = "Adjusted P-value", y = "Frequency") + - theme_minimal() -``` - - - -## macpie results - -## All DMSO wells - -### Load filered data - - -```{r filter_genes_batch24} -# mac_filtered <- filter_genes_by_expression(as_mac, -# group_by = "combined_id", min_counts = 10, -# min_samples = min_sample_num) - -# saveRDS(mac_filtered, -# file = paste0(dir, "DRUGseqData/macpie_filtered_batch24.Rds")) - - -mac_filtered <- readRDS(paste0(dir, "/DRUGseqData/macpie_filtered_batch24.Rds")) -``` - - - - - -```{r violin_plots_batch24, fig.width=10, fig.height=6} -mac_filtered[["percent.mt"]] <- PercentageFeatureSet(mac_filtered, pattern = "^mt-|^MT-") -mac_filtered[["percent.ribo"]] <- PercentageFeatureSet(mac_filtered, pattern = "^Rp[slp][[:digit:]]|^Rpsa|^RP[SLP][[:digit:]]|^RPSA") -``` - - -```{r} -mac_filtered$combined_id <- str_replace_all(mac_filtered$combined_id, "-","_") -``` - - -#### Correction of batch effect - -According to DRUGseq metadata: - - - Wells with water are labelled as EC-27-RY89 - - - Wells with DMSO are labelled as CB-43-EP73 - - - - - - - - - - - - - - - - - - - - - - -*Note:* instead of discussing which correction methods we should use for this data, we only show the ways we detected and corrected batch effect here. As batch effect adjustment for sequencing data has been implemented in different methods, such as DESeq2, RUVSeq, edgeR. We highly recommend users thoroughly checking any batch effects and exploring different methods. - -In the next part of the vignette, we demonstrate a `batch` parameter has implemented in the differential expression test for batch correction. - - - -```{r} -##Limma trend - -# data <- mac_badDSMOremoved -# treatment_samples <- "FF_86_NH56_10" -# control_samples <- "CB_43_EP73_0" - - limma.trend<- function(data = data, treatment_samples, control_samples){ - data <- data[, grepl(paste0(treatment_samples, "|", control_samples), data$combined_id)] - batch <- data$orig.ident -if (length(unique(data$combined_id)) < 2) { - stop("Insufficient factors for differential expression analysis.") - } -pheno_data <- data.frame(batch = as.factor(batch), condition = as.factor(data$combined_id)) -combined_id <- data$combined_id -model_matrix <- if (length(batch) == 1) model.matrix(~0 + combined_id) else - model.matrix(~0+combined_id + batch) - - dge <- DGEList(counts = data@assays$RNA$counts, - samples = pheno_data$condition, - group = pheno_data$condition) - dge <- calcNormFactors(dge, method="TMMwsp") - # keep <- edgeR::filterByExpr(dge, design = model_matrix) - # dge <- dge[keep, , keep.lib.sizes = FALSE] - # clean.TMM<-log2(edgeR::cpm(dge, normalized.lib.sizes=T,log=F)+1) - clean.TMM <- cpm(dge, log=TRUE, prior.count=3) - myargs <- list(paste0("combined_id", - treatment_samples, "-", - paste0("combined_id", control_samples)), - levels = model_matrix) - contrasts <- do.call(makeContrasts, myargs) - - lmfit <- lmFit(clean.TMM, model_matrix) - lmfit <- contrasts.fit(lmfit, contrasts) - lmfit <- eBayes(lmfit, trend = TRUE) - #Warning: Zero sample variances detected, have been offset away from zero - top_table<- topTable(lmfit, number = nrow(data)) %>% - as.data.frame() %>% - select("logFC", "t", "P.Value", "adj.P.Val") %>% - rename("log2FC" = "logFC", "metric" = "t", "p_value" = "P.Value", "p_value_adj" = "adj.P.Val")%>% - rownames_to_column("gene") - - return(top_table)} - - # top_table%>%filter(p_value_adj < 0.05 & log2FC>0) -``` - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -```{r} -# make a function of alluvial plot with ggalluvial to compare the ranking of DEGs from macpie and DRUGseq -plot_alluvial <- function(macpie_tbl, drugseq_tbl){ - df <- macpie_tbl %>% - transmute(gene, p_value_adj_mac = p_value_adj, log2FC_mac = log2FC) %>% - inner_join( - drugseq_tbl %>% transmute(gene, p_value_adj_drg = p_value_adj, log2FC_drg = log2FC), - by = "gene" - ) - -# Rank within each method by (padj, |log2FC|), tie-safe -rank_by_two <- function(padj, lfc) { - ord <- order(padj, -abs(lfc), na.last = TRUE) - r <- integer(length(padj)) - r[ord] <- seq_along(ord) - r -} -df <- df %>% - mutate( - rank_mac = rank_by_two(p_value_adj_mac, log2FC_mac), - rank_drg = rank_by_two(p_value_adj_drg, log2FC_drg) - ) - -# Define rank bins -cuts <- c(0, 25, 50, 100, 200, 300, 500, Inf) -labs <- c("Top25","26–50","51–100","101–200","201–300", "301-500",">500") - -df <- df %>% - mutate( - mac_bin = cut(rank_mac, breaks = cuts, labels = labs, right = TRUE, include.lowest = TRUE), - drg_bin = cut(rank_drg, breaks = cuts, labels = labs, right = TRUE, include.lowest = TRUE) - ) - -# Ranking movement category (color) -bin_index <- function(x) match(x, labs) # lower index = "more top" -df <- df %>% - mutate( - mac_idx = bin_index(mac_bin), - drg_idx = bin_index(drg_bin), - movement = case_when( - is.na(mac_idx) | is.na(drg_idx) ~ NA_character_, - mac_idx < drg_idx ~ "Higher rank in macpie", # moved up into a more-top bin - mac_idx > drg_idx ~ "Lower rank in macpie", # moved down - TRUE ~ "Same rank" - ), - movement = factor(movement, levels = c("Higher rank in macpie","Same rank","Lower rank in macpie")) - ) %>% - tidyr::drop_na(mac_bin, drg_bin, movement) - -# Aggregate flows for alluvial -flows <- df %>% - count(mac_bin, drg_bin, movement, name = "n") %>% - mutate( - mac_bin = factor(mac_bin, levels = labs), - drg_bin = factor(drg_bin, levels = labs) - ) - -ggplot(flows, aes(axis1 = mac_bin, axis2 = drg_bin, y = n)) + - geom_alluvium(aes(fill = movement), alpha = 0.75, width = 0.14, knot.pos = 0.4) + - geom_stratum(width = 0.14, color = "grey30") + - geom_text(stat = "stratum", aes(label = after_stat(stratum)), size = 3) + - scale_x_discrete(limits = c("Macpie rank", "DRUGseq rank"), expand = c(.05, .05)) + - scale_fill_manual(values = c("Higher rank in macpie" = "orange", - "Same rank" = "#7570b3", - "Lower rank in macpie" = "grey60")) + - labs(x = NULL, y = "Gene count", - fill = "Movement vs DRUGseq", - subtitle = "Only DEGs with padj < 0.05 & log2FC>0 (both methods)") + - theme_classic() - -} -``` - - -#### Differential gene expression - - -In here, you can specify a single condition in the combined_id column and compare with DMSO (i.e.CB_43_EP73_0). By using the plate IDs in the column of orig.ident as the input for batch parameter, `compute_singe_de` function can perform differential expression analysis using the preferred method (limma voom in this example) with batch information. - - - -```{r} -methods <- c("limma_voom", "DESeq2", "edgeR", "limma_trend") - -methods_res <- lapply(methods, function(m){ - - message("\n","Processing method: ", m,"\n") - # m<-"limma_voom" - treatment_samples <- "FF_86_NH56_10" - control_samples <- "CB_43_EP73_0" - subset <- mac_filtered%>%filter(combined_id%in%c(treatment_samples,control_samples)) - - batch <- subset$orig.ident - if (m!="limma_trend"){ - subset <- filter_genes_by_expression(subset, - group_by = "combined_id", min_counts = 5, - min_samples = 1) - top_table <- compute_single_de(subset, treatment_samples, control_samples, method = m, batch = batch) - } else{ - top_table <- limma.trend(data = subset, treatment_samples = treatment_samples, control_samples = control_samples) - } - - # plot(plot_volcano(top_table, max.overlaps = 5)) - alldmso_degs <- top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull() - length(intersect(alldmso_degs, drugseq_deg)) - - top_table <- top_table %>% - arrange(p_value_adj, desc(log2FC)) %>% - mutate(gene = factor(gene, levels = unique(gene))) - # add a column if there are in the intersect(alldmso_degs, drugseq_deg) - top_table <- top_table %>% - mutate(in_drugseq_deg = ifelse(gene %in% intersect(alldmso_degs, drugseq_deg), "yes", "no")) - - plt_ecdf <- ggplot(top_table, aes(x = p_value_adj, color = in_drugseq_deg)) + - stat_ecdf(size = 1) + - scale_x_continuous(trans = "log10", breaks = c(1e-6,1e-4,5e-2)) + - labs(x = "Adjusted p-value (log10 scale)", y = "ECDF", - color = "In DRUGseq DEGs") + - theme(axis.text.x = element_text(angle = 90, hjust = 1)) + - theme_classic() - - - ks_test_results <- ks.test(top_table$p_value_adj[top_table$in_drugseq_deg=="yes"], - top_table$p_value_adj[top_table$in_drugseq_deg=="no"], alternative = "greater") - - # ggplot(top_table, aes(x = log2FC, y = -log10(p_value_adj), color = in_drugseq_deg)) + - # geom_point(alpha = 0.6, size = 1.2) + - # geom_hline(yintercept = -log10(0.05), linetype = "dashed") + - # geom_vline(xintercept = c(-1, 1), linetype = "dashed") + - # scale_color_manual(values = c("no"="#bdbdbd","yes"="#2b8cbe")) + - # labs(x = "log2FC", y = expression(-log[10]("adj p")), color = "In DRUGseq DEGs") + - # theme_classic() - # - # label a few top overlapping genes - lab_genes <- top_table[top_table$in_drugseq_deg=="yes", ] |> - dplyr::arrange(p_value_adj, dplyr::desc(log2FC)) - - volcano_overlap <- ggplot(top_table, aes(x = log2FC, y = -log10(p_value_adj), color = in_drugseq_deg)) + - geom_point(alpha = 0.6, size = 1.2) + - geom_text_repel(data = lab_genes, aes(label = gene), size = 3, max.overlaps = 50) + - scale_color_manual(values = c("no"="#bdbdbd","yes"="#2b8cbe"))+ - theme_classic() - - ks <- c(25,50,100,200,500,1000) - prec_at_k <- sapply(ks, function(k) - mean(top_table$in_drugseq_deg[1:k] == "yes") - ) - # plot(ks, prec_at_k, type = "b", xlab = "k (top-k by macpie)", - # ylab = "Precision@k (fraction in DRUGseq set)") - - ks_plot <- ggplot(data.frame(k=ks, prec=prec_at_k), aes(x=k, y=prec)) + - geom_point() + geom_line() + - labs(x = "k (top-k by macpie)", y = "Precision@k (fraction in DRUGseq set)")+ - theme_classic() - - # alluvial plot - macpie_tbl <- top_table %>% filter(p_value_adj < 0.05 & log2FC>0) - drugseq_tbl <- ff86_res_toptable %>% filter(p_value_adj < 0.05 & log2FC>0) - alluvial_plot <- plot_alluvial(macpie_tbl = macpie_tbl, drugseq_tbl = drugseq_tbl) - - #return - result_list <- list(top_table = top_table, - num_degs_macpie = length(alldmso_degs), - n_overlap = length(intersect(alldmso_degs, drugseq_deg)), - ecdf_plot = plt_ecdf, - ks_test_results = ks_test_results, - volcano_plot = volcano_overlap, - ks_plot = ks_plot, - alluvial_plot = alluvial_plot) - return(result_list) - -}) - -names(methods_res) <- methods - -``` - - -#### Summary table - - -```{r} -#get a table to show number of DEGs and number of overlapping genes with DRUGseq for each method -deg_summary <- map_df(methods_res, function(x) { - data.frame( - num_degs_macpie = x$num_degs_macpie, - n_overlap = x$n_overlap, - num_degs_DRUGseq = length(drugseq_deg) - ) -}, .id = paste0("macpie_methods")) - -deg_summary -``` - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -### Overlapping volcano plot - -```{r} -methods_res$limma_voom$volcano_plot -methods_res$DESeq2$volcano_plot -methods_res$edgeR$volcano_plot -methods_res$limma_trend$volcano_plot -``` - - - - - - - - - - - - - -### Alluvial plot - -```{r} -methods_res$limma_voom$alluvial_plot -methods_res$DESeq2$alluvial_plot -methods_res$edgeR$alluvial_plot -methods_res$limma_trend$alluvial_plot -``` - - -```{r} -#append top_table from each method into a single data frame -all_methods_top_table <- map_df(methods_res, "top_table", .id = "method") -head(all_methods_top_table) - -#if in_drugseq_deg is "no", change method to "DRUGseq" -all_methods_top_table <- all_methods_top_table %>% - mutate(method = ifelse(in_drugseq_deg == "no", "not in DRUGseq", method)) - -#rename methods -all_methods_top_table$method <- recode(all_methods_top_table$method, - "limma_voom" = "macpie:limma_voom", - "DESeq2" = "macpie:DESeq2", - "edgeR" = "macpie:edgeR", - "limma_trend" = "macpie:limma_trend") - - -ggplot(all_methods_top_table, aes(x = p_value_adj, color = method)) + - stat_ecdf(size = 1) + - scale_x_continuous(trans = "log10", breaks = c(1e-6,1e-4,5e-2)) + - labs(x = "Adjusted p-value (log10 scale)", y = "ECDF", - color = "In DRUGseq DEGs") + - theme(axis.text.x = element_text(angle = 90, hjust = 1)) + - theme_classic() - - -``` - - -```{r} -rm(top_table, alldmso_degs, deg_summary) -``` - - - - -## Good DMSO - top 5 from macpie - -From public DRUG-seq analysis pipeline, authors identified two reference controls: all DMSO wells and the ‘good DMSO’ wells. - -We know these good DMSO wells for batch 24 from their published data: - - - VH02012942: I23, M23 - - - VH02012944: D23, H23 - - - VH02012956: I23, J23 - - -```{r} -batch24 <- readRDS(paste0(dir,"DRUGseqData/Exp_batch24.Rds")) -names(batch24) -``` - -```{r metdata} -#make a combined metadata for three plates -batch24_metadata <- batch24 %>% - map_dfr(~ { - .x$Annotation %>% - mutate( - Plate_ID = plate_barcode, - Well_ID = well_id, - Barcode = paste0(plate_barcode, "_", well_index), - Row = LETTERS[row], - Column = as.integer(col), - Species = "human", - Cell_type = "U2OS", - Model_type = "2D_adherent", - Time = as.factor(hours_post_treatment), - Unit = "h", - Treatment_1 = cmpd_sample_id, - Concentration_1 = as.numeric(concentration), - Unit_1 = unit, - Sample_type = if_else(well_type == "SA" & col == 24, - "Positive Control", - well_type) - ) - }) - - -batch24_metadata <- batch24_metadata%>%select(-c(batch_id, plate_barcode,plate_index, well_id, - well_index, col, row, biosample_id, external_biosample_id, - cmpd_sample_id, well_type, cell_line_name, cell_line_ncn, concentration, unit, hours_post_treatment, Sample)) -``` - - - -```{r count_matrix} -# create a combined UMI matrix for 3 plates -batch24_counts <- batch24 %>% - map(~ { - .x$UMI.counts %>% - as.data.frame() %>% - rownames_to_column("gene_id") %>% - separate(col = gene_id, into = c("gene_name", "chrom"), sep = ",") %>% - mutate(gene_name = make.unique(gene_name)) %>% - select(-chrom) %>% - tibble::column_to_rownames(var = "gene_name") %>% - as.matrix() - }) - -binded_counts <- do.call(cbind, batch24_counts) - - - -``` - - - -```{r load_data} -as_mac <- CreateSeuratObject(counts = binded_counts, - min.cells = 1, - min.features = 1) - -as_mac<- as_mac%>% inner_join(batch24_metadata, by = c(".cell"="Barcode")) -``` - - -Filtering steps - -Include only good DMSO wells as controls - -```{r} - -as_mac$combined_id <- paste0(as_mac$Treatment_1,"_", as_mac$Concentration_1) - -badDMSO <- as_mac@meta.data %>% filter(Treatment_1 == "CB-43-EP73") %>% - filter((Plate_ID == "VH02012942" & !(Well_ID %in% c("I23", "M23", "K23", "J23","C23"))) | - (Plate_ID == "VH02012944" & !(Well_ID %in% c("I23", "M23", "J23", "G23", "K23")))| - (Plate_ID == "VH02012956" & ! (Well_ID %in% c("I23", "J23", "O23","M23","K23")))) - - - -keep_wells <- setdiff(rownames(as_mac@meta.data), rownames(badDMSO)) - - -mac_badDSMOremoved <- as_mac[,keep_wells] - -mac_badDSMOremoved$combined_id <- str_replace_all(mac_badDSMOremoved$combined_id, "-","_") - - -``` - - -```{r} -min_sample_num <- min(table(mac_badDSMOremoved$combined_id)) -mac_badDSMOremoved <- filter_genes_by_expression(mac_badDSMOremoved, - group_by = "combined_id", min_counts =10, - min_samples = min_sample_num) - -``` - - -```{r} -mac_badDSMOremoved[["percent.mt"]] <- PercentageFeatureSet(mac_badDSMOremoved, pattern = "^mt-|^MT-") -mac_badDSMOremoved[["percent.ribo"]] <- PercentageFeatureSet(mac_badDSMOremoved, pattern = "^Rp[slp][[:digit:]]|^Rpsa|^RP[SLP][[:digit:]]|^RPSA") - -``` - -```{r} -p <- plot_plate_layout(mac_badDSMOremoved, "nCount_RNA", "combined_id") + facet_wrap(~orig.ident, ncol = 1) + - theme(strip.text = element_text(size=10), - axis.text.x = element_text(size=10), - axis.text.y = element_text(size=8), - legend.title = element_text(size=10), - legend.text = element_text(size=8), - trip.background = element_blank()) -girafe(ggobj = p, - fonts = list(sans = "sans"), - options = list( - opts_hover(css = "stroke:black; stroke-width:1px;") - )) -``` - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -```{r} -methods <- c("limma_voom", "DESeq2", "edgeR", "limma_trend") - -five_dmso_methods_res <- lapply(methods, function(m){ - message("Processing method: ", m) - # m<-"limma_voom" - treatment_samples <- "FF_86_NH56_10" - control_samples <- "CB_43_EP73_0" - subset <- mac_badDSMOremoved%>%filter(combined_id%in%c(treatment_samples,control_samples)) - - batch <- subset$orig.ident - if (m!="limma_trend"){ - # subset <- filter_genes_by_expression(subset, - # group_by = "combined_id", min_counts = 10, - # min_samples = 1) - badDMSO_out_top_table <- compute_single_de(subset, treatment_samples, control_samples, method = m, batch = batch) - } else{ - badDMSO_out_top_table <- limma.trend(data = subset, treatment_samples = treatment_samples, control_samples = control_samples) - } - - # plot(plot_volcano(top_table, max.overlaps = 5)) - badDMSO_out_degs <- badDMSO_out_top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull() - length(intersect(badDMSO_out_degs, drugseq_deg)) - - badDMSO_out_top_table <- badDMSO_out_top_table %>% - arrange(p_value_adj, desc(log2FC)) %>% - mutate(gene = factor(gene, levels = unique(gene))) - # add a column if there are in the intersect(alldmso_degs, drugseq_deg) - badDMSO_out_top_table <- badDMSO_out_top_table %>% - mutate(in_drugseq_deg = ifelse(gene %in% intersect(badDMSO_out_degs, drugseq_deg), "yes", "no")) - - plt_ecdf <- ggplot(badDMSO_out_top_table, aes(x = p_value_adj, color = in_drugseq_deg)) + - stat_ecdf(size = 1) + - scale_x_continuous(trans = "log10", breaks = c(1e-6,1e-4,5e-2)) + - labs(x = "Adjusted p-value (log10 scale)", y = "ECDF", - color = "In DRUGseq DEGs") + - theme(axis.text.x = element_text(angle = 90, hjust = 1)) + - theme_classic() - - - ks_test_results <- ks.test(badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg=="yes"], - badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg=="no"], alternative = "greater") - - # ggplot(top_table, aes(x = log2FC, y = -log10(p_value_adj), color = in_drugseq_deg)) + - # geom_point(alpha = 0.6, size = 1.2) + - # geom_hline(yintercept = -log10(0.05), linetype = "dashed") + - # geom_vline(xintercept = c(-1, 1), linetype = "dashed") + - # scale_color_manual(values = c("no"="#bdbdbd","yes"="#2b8cbe")) + - # labs(x = "log2FC", y = expression(-log[10]("adj p")), color = "In DRUGseq DEGs") + - # theme_classic() - # - # label a few top overlapping genes - lab_genes <- badDMSO_out_top_table[badDMSO_out_top_table$in_drugseq_deg=="yes", ] |> - dplyr::arrange(p_value_adj, dplyr::desc(log2FC)) - - volcano_overlap <- ggplot(badDMSO_out_top_table, aes(x = log2FC, y = -log10(p_value_adj), color = in_drugseq_deg)) + - geom_point(alpha = 0.6, size = 1.2) + - geom_text_repel(data = lab_genes, aes(label = gene), size = 3, max.overlaps = 10) + - scale_color_manual(values = c("no"="#bdbdbd","yes"="#2b8cbe"))+ - theme_classic() - - ks <- c(25,50,100,200,500,1000) - prec_at_k <- sapply(ks, function(k) - mean(badDMSO_out_top_table$in_drugseq_deg[1:k] == "yes") - ) - # plot(ks, prec_at_k, type = "b", xlab = "k (top-k by macpie)", - # ylab = "Precision@k (fraction in DRUGseq set)") - - ks_plot <- ggplot(data.frame(k=ks, prec=prec_at_k), aes(x=k, y=prec)) + - geom_point() + geom_line() + - labs(x = "k (top-k by macpie)", y = "Precision@k (fraction in DRUGseq set)")+ - theme_classic() - - # alluvial plot - macpie_tbl <- badDMSO_out_top_table %>% filter(p_value_adj < 0.05 & log2FC>0) - drugseq_tbl <- ff86_res_toptable %>% filter(p_value_adj < 0.05 & log2FC>0) - alluvial_plot <- plot_alluvial(macpie_tbl = macpie_tbl, drugseq_tbl = drugseq_tbl) - - #return - result_list <- list(top_table = badDMSO_out_top_table, - num_degs_macpie = length(badDMSO_out_degs), - n_overlap = length(intersect(badDMSO_out_degs, drugseq_deg)), - ecdf_plot = plt_ecdf, - ks_test_results = ks_test_results, - volcano_plot = volcano_overlap, - ks_plot = ks_plot, - alluvial_plot = alluvial_plot) - return(result_list) - -}) - -names(five_dmso_methods_res) <- methods - -``` - -### Summary table - -```{r} -#get a table to show number of DEGs and number of overlapping genes with DRUGseq for each method -deg_summary <- map_df(five_dmso_methods_res, function(x) { - data.frame( - num_degs_macpie = x$num_degs_macpie, - n_overlap = x$n_overlap, - num_degs_DRUGseq = length(drugseq_deg) - ) -}, .id = paste0("macpie_methods")) - -deg_summary -``` - - - - -### ECDF plot - -```{r} -#append top_table from each method into a single data frame -good_dmso_top_table <- map_df(five_dmso_methods_res, "top_table", .id = "method") -head(good_dmso_top_table) - -#if in_drugseq_deg is "no", change method to "DRUGseq" -good_dmso_top_table<- good_dmso_top_table %>% - mutate(method = ifelse(in_drugseq_deg == "no", "not_in_DRUGseq", method)) - -#rename methods -good_dmso_top_table$method <- recode(good_dmso_top_table$method, - "limma_voom" = "macpie:limma_voom", - "DESeq2" = "macpie:DESeq2", - "edgeR" = "macpie:edgeR", - "limma_trend" = "macpie:limma_trend") - - -ggplot(good_dmso_top_table, aes(x = p_value_adj, color = method)) + - stat_ecdf(size = 1) + - scale_x_continuous(trans = "log10", breaks = c(1e-6,1e-4,5e-2)) + - labs(x = "Adjusted p-value (log10 scale)", y = "ECDF", - color = "In DRUGseq DEGs") + - theme(axis.text.x = element_text(angle = 90, hjust = 1)) + - theme_classic() - - -``` - - - - - - - - -### Overlapping volcano plot - -```{r} -five_dmso_methods_res$limma_voom$volcano_plot -five_dmso_methods_res$DESeq2$volcano_plot -five_dmso_methods_res$edgeR$volcano_plot -five_dmso_methods_res$limma_trend$volcano_plot -``` - - - - - - - - - - - - - -### Alluvial plot - -```{r} -five_dmso_methods_res$limma_voom$alluvial_plot -five_dmso_methods_res$DESeq2$alluvial_plot -five_dmso_methods_res$edgeR$alluvial_plot -five_dmso_methods_res$limma_trend$alluvial_plot -``` - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -## Good DMSO - DRUGseq - -From public DRUG-seq analysis pipeline, authors identified two reference controls: all DMSO wells and the ‘good DMSO’ wells. - -We know these good DMSO wells for batch 24 from their published data: - - - VH02012942: I23, M23 - - - VH02012944: D23, H23 - - - VH02012956: I23, J23 - - -```{r} -batch24 <- readRDS(paste0(dir,"DRUGseqData/Exp_batch24.Rds")) -names(batch24) -``` - -```{r metdata_for_batch24} -#make a combined metadata for three plates -batch24_metadata <- batch24 %>% - map_dfr(~ { - .x$Annotation %>% - mutate( - Plate_ID = plate_barcode, - Well_ID = well_id, - Barcode = paste0(plate_barcode, "_", well_index), - Row = LETTERS[row], - Column = as.integer(col), - Species = "human", - Cell_type = "U2OS", - Model_type = "2D_adherent", - Time = as.factor(hours_post_treatment), - Unit = "h", - Treatment_1 = cmpd_sample_id, - Concentration_1 = as.numeric(concentration), - Unit_1 = unit, - Sample_type = if_else(well_type == "SA" & col == 24, - "Positive Control", - well_type) - ) - }) - - -batch24_metadata <- batch24_metadata%>%select(-c(batch_id, plate_barcode,plate_index, well_id, - well_index, col, row, biosample_id, external_biosample_id, - cmpd_sample_id, well_type, cell_line_name, cell_line_ncn, concentration, unit, hours_post_treatment, Sample)) -``` - - - -```{r count_matrix_for_batch24} -# create a combined UMI matrix for 3 plates -batch24_counts <- batch24 %>% - map(~ { - .x$UMI.counts %>% - as.data.frame() %>% - rownames_to_column("gene_id") %>% - separate(col = gene_id, into = c("gene_name", "chrom"), sep = ",") %>% - mutate(gene_name = make.unique(gene_name)) %>% - select(-chrom) %>% - tibble::column_to_rownames(var = "gene_name") %>% - as.matrix() - }) - -binded_counts <- do.call(cbind, batch24_counts) - - - -``` - - - -```{r load_data_batch24} -as_mac <- CreateSeuratObject(counts = binded_counts, - min.cells = 1, - min.features = 1) - -as_mac<- as_mac%>% inner_join(batch24_metadata, by = c(".cell"="Barcode")) -``` - - -Filtering steps - -Include only good DMSO wells as controls - -```{r} - -as_mac$combined_id <- paste0(as_mac$Treatment_1,"_", as_mac$Concentration_1) - -badDMSO <- as_mac@meta.data %>% filter(Treatment_1 == "CB-43-EP73") %>% - filter((Plate_ID == "VH02012942" & !(Well_ID %in% c("I23", "M23"))) | - (Plate_ID == "VH02012944" & !(Well_ID %in% c("D23", "H23")))| - (Plate_ID == "VH02012956" & ! (Well_ID %in% c("I23", "J23")))) - - - -keep_wells <- setdiff(rownames(as_mac@meta.data), rownames(badDMSO)) - - -mac_badDSMOremoved <- as_mac[,keep_wells] - -mac_badDSMOremoved$combined_id <- str_replace_all(mac_badDSMOremoved$combined_id, "-","_") - - -``` - - -```{r} -min_sample_num <- min(table(mac_badDSMOremoved$combined_id)) -mac_badDSMOremoved <- filter_genes_by_expression(mac_badDSMOremoved, - group_by = "combined_id", min_counts =10, - min_samples = min_sample_num) - -``` - - -```{r} -mac_badDSMOremoved[["percent.mt"]] <- PercentageFeatureSet(mac_badDSMOremoved, pattern = "^mt-|^MT-") -mac_badDSMOremoved[["percent.ribo"]] <- PercentageFeatureSet(mac_badDSMOremoved, pattern = "^Rp[slp][[:digit:]]|^Rpsa|^RP[SLP][[:digit:]]|^RPSA") - -``` - - - - - - - - -```{r} -methods <- c("limma_voom", "DESeq2", "edgeR", "limma_trend") - -good_dmso_methods_res <- lapply(methods, function(m){ - message("Processing method: ", m) - # m<-"limma_voom" - treatment_samples <- "FF_86_NH56_10" - control_samples <- "CB_43_EP73_0" - subset <- mac_badDSMOremoved%>%filter(combined_id%in%c(treatment_samples,control_samples)) - - batch <- subset$orig.ident - if (m!="limma_trend"){ - # subset <- filter_genes_by_expression(subset, - # group_by = "combined_id", min_counts = 10, - # min_samples = 1) - badDMSO_out_top_table <- compute_single_de(subset, treatment_samples, control_samples, method = m, batch = batch) - } else{ - badDMSO_out_top_table <- limma.trend(data = subset, treatment_samples = treatment_samples, control_samples = control_samples) - } - - # plot(plot_volcano(top_table, max.overlaps = 5)) - badDMSO_out_degs <- badDMSO_out_top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull() - length(intersect(badDMSO_out_degs, drugseq_deg)) - - badDMSO_out_top_table <- badDMSO_out_top_table %>% - arrange(p_value_adj, desc(log2FC)) %>% - mutate(gene = factor(gene, levels = unique(gene))) - # add a column if there are in the intersect(alldmso_degs, drugseq_deg) - badDMSO_out_top_table <- badDMSO_out_top_table %>% - mutate(in_drugseq_deg = ifelse(gene %in% intersect(badDMSO_out_degs, drugseq_deg), "yes", "no")) - - plt_ecdf <- ggplot(badDMSO_out_top_table, aes(x = p_value_adj, color = in_drugseq_deg)) + - stat_ecdf(size = 1) + - scale_x_continuous(trans = "log10", breaks = c(1e-6,1e-4,5e-2)) + - labs(x = "Adjusted p-value (log10 scale)", y = "ECDF", - color = "In DRUGseq DEGs") + - theme(axis.text.x = element_text(angle = 90, hjust = 1)) + - theme_classic() - - - ks_test_results <- ks.test(badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg=="yes"], - badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg=="no"], alternative = "greater") - - # ggplot(top_table, aes(x = log2FC, y = -log10(p_value_adj), color = in_drugseq_deg)) + - # geom_point(alpha = 0.6, size = 1.2) + - # geom_hline(yintercept = -log10(0.05), linetype = "dashed") + - # geom_vline(xintercept = c(-1, 1), linetype = "dashed") + - # scale_color_manual(values = c("no"="#bdbdbd","yes"="#2b8cbe")) + - # labs(x = "log2FC", y = expression(-log[10]("adj p")), color = "In DRUGseq DEGs") + - # theme_classic() - # - # label a few top overlapping genes - lab_genes <- badDMSO_out_top_table[badDMSO_out_top_table$in_drugseq_deg=="yes", ] |> - dplyr::arrange(p_value_adj, dplyr::desc(log2FC)) - - volcano_overlap <- ggplot(badDMSO_out_top_table, aes(x = log2FC, y = -log10(p_value_adj), color = in_drugseq_deg)) + - geom_point(alpha = 0.6, size = 1.2) + - geom_text_repel(data = lab_genes, aes(label = gene), size = 3, max.overlaps = 10) + - scale_color_manual(values = c("no"="#bdbdbd","yes"="#2b8cbe"))+ - theme_classic() - - ks <- c(25,50,100,200,500,1000) - prec_at_k <- sapply(ks, function(k) - mean(badDMSO_out_top_table$in_drugseq_deg[1:k] == "yes") - ) - # plot(ks, prec_at_k, type = "b", xlab = "k (top-k by macpie)", - # ylab = "Precision@k (fraction in DRUGseq set)") - - ks_plot <- ggplot(data.frame(k=ks, prec=prec_at_k), aes(x=k, y=prec)) + - geom_point() + geom_line() + - labs(x = "k (top-k by macpie)", y = "Precision@k (fraction in DRUGseq set)")+ - theme_classic() - - # alluvial plot - macpie_tbl <- badDMSO_out_top_table %>% filter(p_value_adj < 0.05 & log2FC>0) - drugseq_tbl <- ff86_res_toptable %>% filter(p_value_adj < 0.05 & log2FC>0) - alluvial_plot <- plot_alluvial(macpie_tbl = macpie_tbl, drugseq_tbl = drugseq_tbl) - - #return - result_list <- list(top_table = badDMSO_out_top_table, - num_degs_macpie = length(badDMSO_out_degs), - n_overlap = length(intersect(badDMSO_out_degs, drugseq_deg)), - ecdf_plot = plt_ecdf, - ks_test_results = ks_test_results, - volcano_plot = volcano_overlap, - ks_plot = ks_plot, - alluvial_plot = alluvial_plot) - return(result_list) - -}) - -names(good_dmso_methods_res) <- methods - -``` - -### Summary table - -```{r} -#get a table to show number of DEGs and number of overlapping genes with DRUGseq for each method -deg_summary <- map_df(good_dmso_methods_res, function(x) { - data.frame( - num_degs_macpie = x$num_degs_macpie, - n_overlap = x$n_overlap, - num_degs_DRUGseq = length(drugseq_deg) - ) -}, .id = paste0("macpie_methods")) - -deg_summary -``` - - - - -### ECDF plot - -```{r} -#append top_table from each method into a single data frame -good_dmso_top_table <- map_df(good_dmso_methods_res, "top_table", .id = "method") -head(good_dmso_top_table) - -#if in_drugseq_deg is "no", change method to "DRUGseq" -good_dmso_top_table<- good_dmso_top_table %>% - mutate(method = ifelse(in_drugseq_deg == "no", "not_in_DRUGseq", method)) - -#rename methods -good_dmso_top_table$method <- recode(good_dmso_top_table$method, - "limma_voom" = "macpie:limma_voom", - "DESeq2" = "macpie:DESeq2", - "edgeR" = "macpie:edgeR", - "limma_trend" = "macpie:limma_trend") - - -ggplot(good_dmso_top_table, aes(x = p_value_adj, color = method)) + - stat_ecdf(size = 1) + - scale_x_continuous(trans = "log10", breaks = c(1e-6,1e-4,5e-2)) + - labs(x = "Adjusted p-value (log10 scale)", y = "ECDF", - color = "In DRUGseq DEGs") + - theme(axis.text.x = element_text(angle = 90, hjust = 1)) + - theme_classic() - - -``` - - -```{r} -good_dmso_methods_res$limma_voom$ecdf_plot -good_dmso_methods_res$DESeq2$ecdf_plot -good_dmso_methods_res$edgeR$ecdf_plot -good_dmso_methods_res$limma_trend$ecdf_plot -``` -### Overlapping volcano plot - -```{r} -good_dmso_methods_res$limma_voom$volcano_plot -good_dmso_methods_res$DESeq2$volcano_plot -good_dmso_methods_res$edgeR$volcano_plot -good_dmso_methods_res$limma_trend$volcano_plot -``` - - - - - - - - - - - -### Alluvial plot - -```{r} -good_dmso_methods_res$limma_voom$alluvial_plot -good_dmso_methods_res$DESeq2$alluvial_plot -good_dmso_methods_res$edgeR$alluvial_plot -good_dmso_methods_res$limma_trend$alluvial_plot -``` - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -## Comparison - -To compare DEGs with different replicate numbers and different methods - -```{r} -methods <- c("limma_voom", "DESeq2", "edgeR", "limma_trend") - -get_jaccard <- function(deg_set, drugseq_deg){ - intersection <- length(intersect(deg_set, drugseq_deg)) - union <- length(union(deg_set, drugseq_deg)) - jaccard_index <- intersection / union - return(jaccard_index) -} - -jaccard_index <- lapply(methods, function(m){ - # all dmso - degs <- methods_res[[m]]$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull() - jaccard_all <- get_jaccard(degs, drugseq_deg) - # five dmso - degs <- five_dmso_methods_res[[m]]$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull() - jaccard_five <- get_jaccard(degs, drugseq_deg) - # three dmso - degs <- good_dmso_methods_res[[m]]$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull() - jaccard_three <- get_jaccard(degs, drugseq_deg) - jaccard_index <- data.frame( - method = m, - jaccard_all = jaccard_all, - jaccard_five = jaccard_five, - jaccard_three = jaccard_three - ) - return(jaccard_index) -}) - -df <- as.data.frame(do.call(rbind, jaccard_index)) -rownames(df) <- df$method -df <- df %>% select(-method) -colnames(df) <- c("All DMSO", "macpie: 15 DMSO", "DRUGseq: 6 DMSO") -pheatmap::pheatmap(df, - cluster_rows = FALSE, - cluster_cols = FALSE, - display_numbers = TRUE, - main = "Jaccard Index between macpie DEGs and DRUGseq DEGs") - -``` - -```{r} -library(UpSetR) -all_dmso <- list( - limma_voom = methods_res$limma_voom$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), - DESeq2 = methods_res$DESeq2$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), - edgeR = methods_res$edgeR$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), - limma_trend = methods_res$limma_trend$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), - DRUGseq = drugseq_deg -) -upset(fromList(all_dmso), - nsets = 5, - order.by = "freq", - main.bar.color = "black", - sets.bar.color = "gray23", - text.scale = c(2, 2, 2, 1.5, 2, 1.5)) -``` - -```{r} -five_dmso <- list( - limma_voom = five_dmso_methods_res$limma_voom$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), - DESeq2 = five_dmso_methods_res$DESeq2$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), - edgeR = five_dmso_methods_res$edgeR$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), - limma_trend = five_dmso_methods_res$limma_trend$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), - DRUGseq = drugseq_deg -) -upset(fromList(five_dmso), - nsets = 5, - order.by = "freq", - main.bar.color = "black", - sets.bar.color = "gray23", - text.scale = c(2, 2, 2, 1.5, 2, 1.5)) -``` - - -```{r} -good_dmso <- list( - limma_voom = good_dmso_methods_res$limma_voom$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), - DESeq2 = good_dmso_methods_res$DESeq2$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), - edgeR = good_dmso_methods_res$edgeR$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), - limma_trend = good_dmso_methods_res$limma_trend$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), - DRUGseq = drugseq_deg -) -upset(fromList(good_dmso), - nsets = 5, - order.by = "freq", - main.bar.color = "black", - sets.bar.color = "gray23", - text.scale = c(2, 2, 2, 1.5, 2, 1.5)) -``` - diff --git a/vignettes/DRUGseq_DE.html b/vignettes/DRUGseq_DE.html deleted file mode 100644 index a5965dd..0000000 --- a/vignettes/DRUGseq_DE.html +++ /dev/null @@ -1,4347 +0,0 @@ - - - - - - - - - - - - - -Benchmarking - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - -
-
-
-
-
- -
- - - - - - - -
library(macpie)
-library(tibble)
-library(stringr)
-library(pheatmap)
-library(ggiraph)
-library(tidyseurat)
-
## Loading required package: ttservice
-
## 
-## Attaching package: 'ttservice'
-
## The following objects are masked from 'package:macpie':
-## 
-##     bind_cols, bind_rows, plot_ly
-
## Loading required package: SeuratObject
-
## Loading required package: sp
-
## 
-## Attaching package: 'SeuratObject'
-
## The following objects are masked from 'package:base':
-## 
-##     intersect, t
-
## ========================================
-## tidyseurat version 0.8.0
-## If you use TIDYSEURAT in published research, please cite:
-## 
-## Mangiola et al. Interfacing Seurat with the R tidy universe. Bioinformatics 2021.
-## 
-## This message can be suppressed by:
-##   suppressPackageStartupMessages(library(tidyseurat))
-##   
-## To restore the Seurat default display use options("restore_Seurat_show" = TRUE) 
-## ========================================
-
## 
-## Attaching package: 'tidyseurat'
-
## The following object is masked from 'package:ttservice':
-## 
-##     plot_ly
-
library(purrr)
-
## 
-## Attaching package: 'purrr'
-
## The following object is masked from 'package:testthat':
-## 
-##     is_null
-
library(DT)
-
## 
-## Attaching package: 'DT'
-
## The following object is masked from 'package:SeuratObject':
-## 
-##     JS
-
## The following object is masked from 'package:macpie':
-## 
-##     JS
-
library(ggrepel)
-library(ggalluvial) 
-library(ggplot2)
-library(ggvenn)
-
## Loading required package: grid
-
library(limma)
-
## 
-## Attaching package: 'limma'
-
## The following object is masked from 'package:macpie':
-## 
-##     plotMA
-
-

DRUGseq results

-
# batch24 <- readRDS(paste0(dir,"DRUGseqData/Exp_batch24.Rds"))
-
# load(paste0(dir,"DRUGseqData/de.RData")) 
-# batch24_de <- de_res%>%filter(batch_id=="24")
-# saveRDS(batch24_de, file = paste0(dir,"DRUGseqData/DE_batch24.Rds"))
-batch24_de <- readRDS(paste0(dir,"DRUGseqData/DE_batch24.Rds"))
-head(batch24_de)
-
##   investigation_id analysis_id pipeline_run_key pipeline_group_run_key method
-## 1             2384          24               27                     27  limma
-## 2             2384          24               27                     27  limma
-## 3             2384          24               27                     27  limma
-## 4             2384          24               27                     27  limma
-## 5             2384          24               27                     27  limma
-## 6             2384          24               27                     27  limma
-##                 normalization
-## 1 quantile=0.75;TMM;log2(CPM)
-## 2 quantile=0.75;TMM;log2(CPM)
-## 3 quantile=0.75;TMM;log2(CPM)
-## 4 quantile=0.75;TMM;log2(CPM)
-## 5 quantile=0.75;TMM;log2(CPM)
-## 6 quantile=0.75;TMM;log2(CPM)
-##                                                  comparison_name_public
-## 1 SA_UD-20-ER54_10uM_24.0_batchid24 vs RC_CB-43-EP73_0uM_24.0_batchid24
-## 2 SA_UD-20-ER54_10uM_24.0_batchid24 vs RC_CB-43-EP73_0uM_24.0_batchid24
-## 3 SA_UD-20-ER54_10uM_24.0_batchid24 vs RC_CB-43-EP73_0uM_24.0_batchid24
-## 4 SA_UD-20-ER54_10uM_24.0_batchid24 vs RC_CB-43-EP73_0uM_24.0_batchid24
-## 5 SA_UD-20-ER54_10uM_24.0_batchid24 vs RC_CB-43-EP73_0uM_24.0_batchid24
-## 6 SA_UD-20-ER54_10uM_24.0_batchid24 vs RC_CB-43-EP73_0uM_24.0_batchid24
-##   cmpd_sample_id concentration unit hours_post_treatment batch_id
-## 1     UD-20-ER54            10   uM                   24       24
-## 2     UD-20-ER54            10   uM                   24       24
-## 3     UD-20-ER54            10   uM                   24       24
-## 4     UD-20-ER54            10   uM                   24       24
-## 5     UD-20-ER54            10   uM                   24       24
-## 6     UD-20-ER54            10   uM                   24       24
-##            gene.ID     logFC    adj.P.Val      P.Value   AveExpr         t
-## 1   MYL6,grch38_12  1.726500 6.467501e-43 4.413170e-47 11.768734 15.518950
-## 2   APRT,grch38_16  2.334604 2.164408e-24 2.953815e-28  8.730357 11.508262
-## 3  KRT18,grch38_12  1.157740 1.327225e-15 2.716939e-19  9.335040  9.239790
-## 4    SCD,grch38_10  1.199936 1.352286e-13 3.690990e-17  7.732632  8.636743
-## 5  TUBB6,grch38_18 -1.005931 3.723618e-11 1.270426e-14  8.083550 -7.873065
-## 6 TIMM8B,grch38_11  1.109810 3.654986e-10 1.496412e-13  7.957763  7.532087
-
FF86_res <- batch24_de %>% filter(cmpd_sample_id=="FF-86-NH56")
-ff86_res_toptable <- FF86_res[,13:18]
-ff86_res_toptable <- ff86_res_toptable %>% 
-  separate(gene.ID, into = c("gene", "chrom"), sep = ",") %>%
-  select(-chrom) %>% mutate(combined_id ="FF_86_NH56_10") %>%
-  rename(log2FC=logFC, p_value_adj = adj.P.Val)
-
head(ff86_res_toptable)
-
##         gene    log2FC   p_value_adj       P.Value     AveExpr         t
-## 1 AC122713.2  4.862957 1.397952e-191 9.539077e-196  0.04920401  41.95785
-## 2      TRIP6 -3.328079 5.156775e-166 7.037564e-170  9.40624753 -37.18794
-## 3   MIR7-3HG  8.513923 2.799813e-151 5.731449e-155  0.20792310  34.52594
-## 4       TUBB -2.898698 1.317157e-145 3.595107e-149 11.65399547 -33.50503
-## 5     CYP4F2  6.454706 7.730175e-142 2.637385e-145  0.10131583  32.82814
-## 6       HDGF -3.828482 2.967781e-140 1.215059e-143  9.05238910 -32.53773
-##     combined_id
-## 1 FF_86_NH56_10
-## 2 FF_86_NH56_10
-## 3 FF_86_NH56_10
-## 4 FF_86_NH56_10
-## 5 FF_86_NH56_10
-## 6 FF_86_NH56_10
-
plot_volcano(ff86_res_toptable, max.overlaps =5)
-
## Warning: ggrepel: 4342 unlabeled data points (too many overlaps). Consider
-## increasing max.overlaps
-

-
ff86_res_toptable %>% filter(p_value_adj <0.05 & log2FC >0) %>% nrow()
-
## [1] 1423
-
drugseq_deg <- ff86_res_toptable %>% filter(p_value_adj <0.05 & log2FC >0) %>% select(gene) %>% pull()
-
#show histogram of p values
-ggplot(ff86_res_toptable, aes(x = P.Value)) +
-  geom_histogram(binwidth = 0.01, fill = "blue", color = "black", alpha = 0.7) +
-  labs(title = "Histogram of Adjusted P-values", x = "Adjusted P-value", y = "Frequency") +
-  theme_minimal()
-

-
-
-

macpie results

-
-
-

All DMSO wells

-
-

Load filered data

-
# mac_filtered <- filter_genes_by_expression(as_mac,
-#                 group_by = "combined_id", min_counts = 10,
-#                 min_samples = min_sample_num)
-
-# saveRDS(mac_filtered,
-# file = paste0(dir, "DRUGseqData/macpie_filtered_batch24.Rds"))
-
-
-mac_filtered <- readRDS(paste0(dir, "/DRUGseqData/macpie_filtered_batch24.Rds"))
-
mac_filtered[["percent.mt"]] <- PercentageFeatureSet(mac_filtered, pattern = "^mt-|^MT-")
-mac_filtered[["percent.ribo"]] <- PercentageFeatureSet(mac_filtered, pattern = "^Rp[slp][[:digit:]]|^Rpsa|^RP[SLP][[:digit:]]|^RPSA")
-
mac_filtered$combined_id <- str_replace_all(mac_filtered$combined_id, "-","_")
-
-

Correction of batch effect

-

According to DRUGseq metadata:

-
    -
  • Wells with water are labelled as EC-27-RY89

  • -
  • Wells with DMSO are labelled as CB-43-EP73

  • -
-
# mac_filtered_cp <- mac_filtered %>% filter(combined_id %in% c("CB_43_EP73_0","FF_86_NH56_10"))
-mac_filtered_cp <- mac_filtered %>% filter(combined_id %in% c("CB_43_EP73_0"))
-plot_rle(mac_filtered_cp, label_column = "orig.ident", normalisation = "raw")+ scale_x_discrete(drop = FALSE) + 
-  theme(axis.text.x = element_text(angle = 45, hjust = 1))
-
## tidyseurat says: Key columns are missing. A data frame is returned for independent data analysis.
-
## Scale for x is already present.
-## Adding another scale for x, which will replace the existing scale.
-

-
plot_rle(mac_filtered_cp, label_column = "orig.ident", normalisation = "limma_voom")+ scale_x_discrete(drop = FALSE) + 
-  theme(axis.text.x = element_text(angle = 45, hjust = 1))
-
## tidyseurat says: Key columns are missing. A data frame is returned for independent data analysis.
-## Scale for x is already present.
-## Adding another scale for x, which will replace the existing scale.
-

-
plot_rle(mac_filtered_cp, label_column = "orig.ident", normalisation = "DESeq2")+ scale_x_discrete(drop = FALSE) + 
-  theme(axis.text.x = element_text(angle = 45, hjust = 1))
-
## tidyseurat says: Key columns are missing. A data frame is returned for independent data analysis.
-## converting counts to integer mode
-## Scale for x is already present.
-## Adding another scale for x, which will replace the existing scale.
-

-
plot_rle(mac_filtered_cp, label_column = "orig.ident", normalisation = "edgeR")+ scale_x_discrete(drop = FALSE) + 
-  theme(axis.text.x = element_text(angle = 45, hjust = 1))
-
## tidyseurat says: Key columns are missing. A data frame is returned for independent data analysis.
-## Scale for x is already present.
-## Adding another scale for x, which will replace the existing scale.
-

-

Note: instead of discussing which correction methods we -should use for this data, we only show the ways we detected and -corrected batch effect here. As batch effect adjustment for sequencing -data has been implemented in different methods, such as DESeq2, RUVSeq, -edgeR. We highly recommend users thoroughly checking any batch effects -and exploring different methods.

-

In the next part of the vignette, we demonstrate a batch -parameter has implemented in the differential expression test for batch -correction.

-
##Limma trend 
-
-# data <- mac_badDSMOremoved
-# treatment_samples <- "FF_86_NH56_10"
-# control_samples <- "CB_43_EP73_0"
-
- limma.trend<- function(data = data, treatment_samples, control_samples){
-   data <- data[, grepl(paste0(treatment_samples, "|", control_samples), data$combined_id)]
-   batch <- data$orig.ident
-if (length(unique(data$combined_id)) < 2) {
-      stop("Insufficient factors for differential expression analysis.")
-    }
-pheno_data <- data.frame(batch = as.factor(batch), condition = as.factor(data$combined_id))
-combined_id <- data$combined_id
-model_matrix <- if (length(batch) == 1) model.matrix(~0 + combined_id) else
-      model.matrix(~0+combined_id + batch)
-
-    dge <- DGEList(counts = data@assays$RNA$counts,
-                   samples = pheno_data$condition,
-                   group = pheno_data$condition)
-    dge <- calcNormFactors(dge, method="TMMwsp")
-    # keep <- edgeR::filterByExpr(dge, design = model_matrix)
-    # dge  <- dge[keep, , keep.lib.sizes = FALSE]
-    clean.TMM<-log2(edgeR::cpm(dge,  normalized.lib.sizes=T,log=F)+1) 
-    myargs <- list(paste0("combined_id",
-                          treatment_samples, "-",
-                          paste0("combined_id", control_samples)),
-                   levels = model_matrix)
-    contrasts <- do.call(makeContrasts, myargs)
-    
-    lmfit <- lmFit(clean.TMM, model_matrix)
-    lmfit <- contrasts.fit(lmfit, contrasts)
-    lmfit <- eBayes(lmfit, trend = TRUE)
-    #Warning: Zero sample variances detected, have been offset away from zero
-    top_table<- topTable(lmfit, number = nrow(data)) %>%
-      as.data.frame() %>%
-      select("logFC", "t", "P.Value", "adj.P.Val") %>%
-      rename("log2FC" = "logFC", "metric" = "t", "p_value" = "P.Value", "p_value_adj" = "adj.P.Val")%>%
-      rownames_to_column("gene")
-
-    return(top_table)}
-
-    # top_table%>%filter(p_value_adj < 0.05 & log2FC>0)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# make a function of alluvial plot with ggalluvial to compare the ranking of DEGs from macpie and DRUGseq
-plot_alluvial <- function(macpie_tbl, drugseq_tbl){
-  df <- macpie_tbl %>%
-  transmute(gene, p_value_adj_mac = p_value_adj, log2FC_mac = log2FC) %>%
-  inner_join(
-    drugseq_tbl %>% transmute(gene, p_value_adj_drg = p_value_adj, log2FC_drg = log2FC),
-    by = "gene"
-  )
-
-#  Rank within each method by (padj, |log2FC|), tie-safe
-rank_by_two <- function(padj, lfc) {
-  ord <- order(padj, -abs(lfc), na.last = TRUE)
-  r   <- integer(length(padj))
-  r[ord] <- seq_along(ord)
-  r
-}
-df <- df %>%
-  mutate(
-    rank_mac = rank_by_two(p_value_adj_mac, log2FC_mac),
-    rank_drg = rank_by_two(p_value_adj_drg, log2FC_drg)
-  )
-
-# Define rank bins
-cuts <- c(0, 25, 50, 100, 200, 300, 500, Inf)
-labs <- c("Top25","26–50","51–100","101–200","201–300", "301-500",">500")
-
-df <- df %>%
-  mutate(
-    mac_bin = cut(rank_mac, breaks = cuts, labels = labs, right = TRUE, include.lowest = TRUE),
-    drg_bin = cut(rank_drg, breaks = cuts, labels = labs, right = TRUE, include.lowest = TRUE)
-  )
-
-# Ranking movement category (color)
-bin_index <- function(x) match(x, labs)   # lower index = "more top"
-df <- df %>%
-  mutate(
-    mac_idx = bin_index(mac_bin),
-    drg_idx = bin_index(drg_bin),
-    movement = case_when(
-      is.na(mac_idx) | is.na(drg_idx) ~ NA_character_,
-      mac_idx < drg_idx ~ "Higher rank in macpie",   # moved up into a more-top bin
-      mac_idx > drg_idx ~ "Lower rank in macpie",    # moved down
-      TRUE               ~ "Same rank"
-    ),
-    movement = factor(movement, levels = c("Higher rank in macpie","Same rank","Lower rank in macpie"))
-  ) %>%
-  tidyr::drop_na(mac_bin, drg_bin, movement)
-
-#  Aggregate flows for alluvial
-flows <- df %>%
-  count(mac_bin, drg_bin, movement, name = "n") %>%
-  mutate(
-    mac_bin = factor(mac_bin, levels = labs),
-    drg_bin = factor(drg_bin, levels = labs)
-  )
-
-ggplot(flows, aes(axis1 = mac_bin, axis2 = drg_bin, y = n)) +
-  geom_alluvium(aes(fill = movement), alpha = 0.75, width = 0.14, knot.pos = 0.4) +
-  geom_stratum(width = 0.14, color = "grey30") +
-  geom_text(stat = "stratum", aes(label = after_stat(stratum)), size = 3) +
-  scale_x_discrete(limits = c("Macpie rank", "DRUGseq rank"), expand = c(.05, .05)) +
-  scale_fill_manual(values = c("Higher rank in macpie" = "orange",
-                               "Same rank"        = "#7570b3",
-                               "Lower rank in macpie" = "grey60")) +
-  labs(x = NULL, y = "Gene count",
-       fill = "Movement vs DRUGseq",
-       subtitle = "Only DEGs with padj < 0.05 & log2FC>0 (both methods)") +
-  theme_classic()
-  
-}
-
-
-

Differential gene expression

-

In here, you can specify a single condition in the combined_id column -and compare with DMSO (i.e.CB_43_EP73_0). By using the plate IDs in the -column of orig.ident as the input for batch parameter, -compute_singe_de function can perform differential -expression analysis using the preferred method (limma voom in this -example) with batch information.

-
methods <- c("limma_voom", "DESeq2", "edgeR", "limma_trend")
-
-methods_res <- lapply(methods, function(m){
-  
-  message("\n","Processing method: ", m,"\n") 
-  # m<-"limma_voom"
-  treatment_samples <- "FF_86_NH56_10"
-  control_samples <- "CB_43_EP73_0"
-  subset <- mac_filtered%>%filter(combined_id%in%c(treatment_samples,control_samples))
-
-  batch <- subset$orig.ident
-  if (m!="limma_trend"){
-    subset <- filter_genes_by_expression(subset,
-                           group_by = "combined_id", min_counts = 5,
-                           min_samples = 1)
-    top_table <- compute_single_de(subset, treatment_samples, control_samples, method = m, batch = batch)
-  } else{
-    top_table <- limma.trend(data = subset, treatment_samples = treatment_samples, control_samples = control_samples)
-  }
-  
-  # plot(plot_volcano(top_table, max.overlaps = 5))
-  alldmso_degs <- top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull()
-  length(intersect(alldmso_degs, drugseq_deg))
-  
-  top_table <- top_table %>%
-    arrange(p_value_adj, desc(log2FC)) %>%
-    mutate(gene = factor(gene, levels = unique(gene)))
-  # add a column if there are in the intersect(alldmso_degs, drugseq_deg)
-  top_table <- top_table %>%
-    mutate(in_drugseq_deg = ifelse(gene %in% intersect(alldmso_degs, drugseq_deg), "yes", "no"))
-  
-  plt_ecdf <- ggplot(top_table, aes(x = p_value_adj, color = in_drugseq_deg)) +
-    stat_ecdf(size = 1) +
-    scale_x_continuous(trans = "log10", breaks = c(1e-6,1e-4,5e-2)) +
-    labs(x = "Adjusted p-value (log10 scale)", y = "ECDF",
-         color = "In DRUGseq DEGs") +
-    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
-    theme_classic()
-  
-  
-  ks_test_results <- ks.test(top_table$p_value_adj[top_table$in_drugseq_deg=="yes"],
-          top_table$p_value_adj[top_table$in_drugseq_deg=="no"], alternative = "greater")
-  
-  # ggplot(top_table, aes(x = log2FC, y = -log10(p_value_adj), color = in_drugseq_deg)) +
-  #   geom_point(alpha = 0.6, size = 1.2) +
-  #   geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
-  #   geom_vline(xintercept = c(-1, 1), linetype = "dashed") +
-  #   scale_color_manual(values = c("no"="#bdbdbd","yes"="#2b8cbe")) +
-  #   labs(x = "log2FC", y = expression(-log[10]("adj p")), color = "In DRUGseq DEGs") +
-  #   theme_classic()
-  # 
-  # label a few top overlapping genes
-  lab_genes <- top_table[top_table$in_drugseq_deg=="yes", ] |>
-    dplyr::arrange(p_value_adj, dplyr::desc(log2FC)) 
-  
-  volcano_overlap <- ggplot(top_table, aes(x = log2FC, y = -log10(p_value_adj), color = in_drugseq_deg)) +
-    geom_point(alpha = 0.6, size = 1.2) +
-    geom_text_repel(data = lab_genes, aes(label = gene), size = 3, max.overlaps = 50) +
-    scale_color_manual(values = c("no"="#bdbdbd","yes"="#2b8cbe"))+
-    theme_classic()
-  
-  ks <- c(25,50,100,200,500,1000)
-  prec_at_k <- sapply(ks, function(k)
-    mean(top_table$in_drugseq_deg[1:k] == "yes")
-  )
-  # plot(ks, prec_at_k, type = "b", xlab = "k (top-k by macpie)",
-  #      ylab = "Precision@k (fraction in DRUGseq set)")
-  
-  ks_plot <- ggplot(data.frame(k=ks, prec=prec_at_k), aes(x=k, y=prec)) +
-    geom_point() + geom_line() +
-    labs(x = "k (top-k by macpie)", y = "Precision@k (fraction in DRUGseq set)")+
-    theme_classic()
-  
-  # alluvial plot
-  macpie_tbl <- top_table %>% filter(p_value_adj < 0.05 & log2FC>0)
-  drugseq_tbl <- ff86_res_toptable %>% filter(p_value_adj < 0.05 & log2FC>0)
-  alluvial_plot <- plot_alluvial(macpie_tbl = macpie_tbl, drugseq_tbl = drugseq_tbl)
-  
-  #return 
-  result_list <- list(top_table = top_table,
-                      num_degs_macpie = length(alldmso_degs),
-                      n_overlap = length(intersect(alldmso_degs, drugseq_deg)),
-                      ecdf_plot = plt_ecdf,
-                      ks_test_results = ks_test_results,
-                      volcano_plot = volcano_overlap,
-                      ks_plot = ks_plot,
-                      alluvial_plot = alluvial_plot)
-  return(result_list)
-  
-})
-
## 
-## Processing method: limma_voom
-
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
-## ℹ Please use `linewidth` instead.
-## This warning is displayed once every 8 hours.
-## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
-## generated.
-
## Warning in ks.test.default(top_table$p_value_adj[top_table$in_drugseq_deg == :
-## p-value will be approximate in the presence of ties
-
## 
-## Processing method: DESeq2
-
## converting counts to integer mode
-
## estimating size factors
-
## estimating dispersions
-
## gene-wise dispersion estimates
-
## mean-dispersion relationship
-
## final dispersion estimates
-
## fitting model and testing
-
## -- replacing outliers and refitting for 1186 genes
-## -- DESeq argument 'minReplicatesForReplace' = 7 
-## -- original counts are preserved in counts(dds)
-
## estimating dispersions
-
## fitting model and testing
-
## Warning in ks.test.default(top_table$p_value_adj[top_table$in_drugseq_deg == :
-## p-value will be approximate in the presence of ties
-
## 
-## Processing method: edgeR
-
## Warning in ks.test.default(top_table$p_value_adj[top_table$in_drugseq_deg == :
-## p-value will be approximate in the presence of ties
-
## 
-## Processing method: limma_trend
-
## Warning: Zero sample variances detected, have been offset away from zero
-## Warning: p-value will be approximate in the presence of ties
-
names(methods_res) <- methods
-
-
-

Summary table

-
#get a table to show number of DEGs and number of overlapping genes with DRUGseq for each method
-deg_summary <- map_df(methods_res, function(x) {
-  data.frame(
-    num_degs_macpie = x$num_degs_macpie,
-    n_overlap = x$n_overlap,
-    num_degs_DRUGseq = length(drugseq_deg)
-  )
-}, .id = paste0("macpie_methods"))
-
-deg_summary
-
##   macpie_methods num_degs_macpie n_overlap num_degs_DRUGseq
-## 1     limma_voom            4280       778             1423
-## 2         DESeq2            1959       779             1423
-## 3          edgeR            2321       721             1423
-## 4    limma_trend            1574       400             1423
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
-
-

Overlapping volcano plot

-
methods_res$limma_voom$volcano_plot
-
## Warning: ggrepel: 746 unlabeled data points (too many overlaps). Consider
-## increasing max.overlaps
-

-
methods_res$DESeq2$volcano_plot
-
## Warning: Removed 831 rows containing missing values or values outside the scale range
-## (`geom_point()`).
-
## Warning: ggrepel: 738 unlabeled data points (too many overlaps). Consider
-## increasing max.overlaps
-

-
methods_res$edgeR$volcano_plot
-
## Warning: ggrepel: 654 unlabeled data points (too many overlaps). Consider
-## increasing max.overlaps
-

-
methods_res$limma_trend$volcano_plot
-
## Warning: ggrepel: 368 unlabeled data points (too many overlaps). Consider
-## increasing max.overlaps
-

- - - - - - - -
-
-

Alluvial plot

-
methods_res$limma_voom$alluvial_plot
-
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
-## params$discern): Some strata appear at multiple axes.
-## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
-## params$discern): Some strata appear at multiple axes.
-## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
-## params$discern): Some strata appear at multiple axes.
-

-
methods_res$DESeq2$alluvial_plot
-
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
-## params$discern): Some strata appear at multiple axes.
-## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
-## params$discern): Some strata appear at multiple axes.
-## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
-## params$discern): Some strata appear at multiple axes.
-

-
methods_res$edgeR$alluvial_plot
-
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
-## params$discern): Some strata appear at multiple axes.
-## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
-## params$discern): Some strata appear at multiple axes.
-## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
-## params$discern): Some strata appear at multiple axes.
-

-
methods_res$limma_trend$alluvial_plot
-
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
-## params$discern): Some strata appear at multiple axes.
-## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
-## params$discern): Some strata appear at multiple axes.
-## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
-## params$discern): Some strata appear at multiple axes.
-

-
#append top_table from each method into a single data frame
-all_methods_top_table <- map_df(methods_res, "top_table", .id = "method")
-head(all_methods_top_table)
-
##       method       gene   log2FC   metric      p_value  p_value_adj
-## 1 limma_voom AL031963.2 1.483560 27.66417 4.119762e-36 8.820411e-32
-## 2 limma_voom   Z97192.4 5.026037 26.15530 9.565172e-35 1.023952e-30
-## 3 limma_voom    SLC45A4 1.720577 20.68042 3.573334e-29 2.550170e-25
-## 4 limma_voom AC007066.1 1.844768 18.51969 1.356781e-22 7.262170e-19
-## 5 limma_voom    CYCSP44 1.696008 14.95493 5.065287e-22 1.807463e-18
-## 6 limma_voom AP003068.2 1.679486 17.96392 4.522782e-22 1.807463e-18
-##   in_drugseq_deg
-## 1             no
-## 2            yes
-## 3            yes
-## 4            yes
-## 5             no
-## 6             no
-
#if in_drugseq_deg is "no", change method to "DRUGseq"
-all_methods_top_table <- all_methods_top_table %>%
-  mutate(method = ifelse(in_drugseq_deg == "no", "not in DRUGseq", method))
-
-#rename methods
-all_methods_top_table$method <- recode(all_methods_top_table$method,
-                                        "limma_voom" = "macpie:limma_voom",
-                                        "DESeq2" = "macpie:DESeq2",
-                                        "edgeR" = "macpie:edgeR",
-                                        "limma_trend" = "macpie:limma_trend")
-
-
-ggplot(all_methods_top_table, aes(x = p_value_adj, color = method)) +
-    stat_ecdf(size = 1) +
-    scale_x_continuous(trans = "log10", breaks = c(1e-6,1e-4,5e-2)) +
-    labs(x = "Adjusted p-value (log10 scale)", y = "ECDF",
-         color = "In DRUGseq DEGs") +
-  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
-    theme_classic()
-
## Warning: Removed 831 rows containing non-finite outside the scale range
-## (`stat_ecdf()`).
-

-
rm(top_table, alldmso_degs, deg_summary)
-
## Warning in rm(top_table, alldmso_degs, deg_summary): object 'top_table' not
-## found
-
## Warning in rm(top_table, alldmso_degs, deg_summary): object 'alldmso_degs' not
-## found
-
-
-
-

Good DMSO - top 5 from macpie

-

From public DRUG-seq analysis pipeline, authors identified two -reference controls: all DMSO wells and the ‘good DMSO’ wells.

-

We know these good DMSO wells for batch 24 from their published -data:

-
    -
  • VH02012942: I23, M23

  • -
  • VH02012944: D23, H23

  • -
  • VH02012956: I23, J23

  • -
-
batch24 <- readRDS(paste0(dir,"DRUGseqData/Exp_batch24.Rds"))
-names(batch24)
-
## [1] "VH02012956" "VH02012942" "VH02012944"
-
#make a combined metadata for three plates
-batch24_metadata <- batch24 %>% 
-  map_dfr(~ {
-    .x$Annotation %>%
-      mutate(
-        Plate_ID        = plate_barcode,
-        Well_ID         = well_id,
-        Barcode         = paste0(plate_barcode, "_", well_index),
-        Row             = LETTERS[row],
-        Column          = as.integer(col),
-        Species         = "human",
-        Cell_type       = "U2OS",
-        Model_type      = "2D_adherent",
-        Time            = as.factor(hours_post_treatment),
-        Unit            = "h",
-        Treatment_1     = cmpd_sample_id,
-        Concentration_1 = as.numeric(concentration),
-        Unit_1          = unit,
-        Sample_type     = if_else(well_type == "SA" & col == 24,
-                                  "Positive Control",
-                                  well_type)
-      )
-  })
-
-
-batch24_metadata <- batch24_metadata%>%select(-c(batch_id, plate_barcode,plate_index, well_id,
-                                                 well_index, col, row, biosample_id, external_biosample_id,
-                                                 cmpd_sample_id, well_type, cell_line_name, cell_line_ncn, concentration, unit, hours_post_treatment, Sample))
-
# create a combined UMI matrix for 3 plates
-batch24_counts <- batch24 %>%
-  map(~ {
-    .x$UMI.counts %>%
-      as.data.frame() %>% 
-      rownames_to_column("gene_id") %>%
-      separate(col = gene_id, into = c("gene_name", "chrom"), sep = ",") %>%
-      mutate(gene_name = make.unique(gene_name)) %>%
-      select(-chrom) %>%
-      tibble::column_to_rownames(var = "gene_name") %>%
-      as.matrix()
-  })
-
-binded_counts <- do.call(cbind, batch24_counts)
-
as_mac <- CreateSeuratObject(counts = binded_counts, 
-  min.cells = 1, 
-  min.features = 1)
-
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
-## ('-')
-
## Warning: Data is of class matrix. Coercing to dgCMatrix.
-
as_mac<- as_mac%>% inner_join(batch24_metadata, by = c(".cell"="Barcode"))
-

Filtering steps

-

Include only good DMSO wells as controls

-
as_mac$combined_id <- paste0(as_mac$Treatment_1,"_", as_mac$Concentration_1)
-
-badDMSO <- as_mac@meta.data %>% filter(Treatment_1 == "CB-43-EP73") %>% 
- filter((Plate_ID == "VH02012942" & !(Well_ID %in% c("I23", "M23", "K23", "J23","C23"))) |
-        (Plate_ID == "VH02012944" & !(Well_ID %in% c("I23", "M23", "J23", "G23", "K23")))|
-        (Plate_ID == "VH02012956" & !  (Well_ID %in% c("I23", "J23", "O23","M23","K23"))))
-
-
-
-keep_wells <- setdiff(rownames(as_mac@meta.data), rownames(badDMSO))
-
-
-mac_badDSMOremoved <- as_mac[,keep_wells]
-
-mac_badDSMOremoved$combined_id <- str_replace_all(mac_badDSMOremoved$combined_id, "-","_")
-
min_sample_num <- min(table(mac_badDSMOremoved$combined_id))
-mac_badDSMOremoved <- filter_genes_by_expression(mac_badDSMOremoved,
-                                           group_by = "combined_id", min_counts =10,
-                                           min_samples = min_sample_num)
-
mac_badDSMOremoved[["percent.mt"]] <- PercentageFeatureSet(mac_badDSMOremoved, pattern = "^mt-|^MT-")
-mac_badDSMOremoved[["percent.ribo"]] <- PercentageFeatureSet(mac_badDSMOremoved, pattern = "^Rp[slp][[:digit:]]|^Rpsa|^RP[SLP][[:digit:]]|^RPSA")
-
p <- plot_plate_layout(mac_badDSMOremoved, "nCount_RNA", "combined_id") + facet_wrap(~orig.ident, ncol = 1) + 
-  theme(strip.text = element_text(size=10),
-        axis.text.x = element_text(size=10), 
-        axis.text.y = element_text(size=8),
-        legend.title = element_text(size=10),
-        legend.text = element_text(size=8),
-        trip.background = element_blank())
-girafe(ggobj = p, 
-  fonts = list(sans = "sans"),
-  options = list(
-    opts_hover(css = "stroke:black; stroke-width:1px;")
-  ))
-
## Warning in plot_theme(plot): The `trip.background` theme element is not defined
-## in the element hierarchy.
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
methods <- c("limma_voom", "DESeq2", "edgeR", "limma_trend")
-
-five_dmso_methods_res <- lapply(methods, function(m){
-  message("Processing method: ", m) 
-  # m<-"limma_voom"
-  treatment_samples <- "FF_86_NH56_10"
-  control_samples <- "CB_43_EP73_0"
-  subset <- mac_badDSMOremoved%>%filter(combined_id%in%c(treatment_samples,control_samples))
-
-  batch <- subset$orig.ident
-  if (m!="limma_trend"){
-      # subset <- filter_genes_by_expression(subset,
-      #                        group_by = "combined_id", min_counts = 10,
-      #                        min_samples = 1)
-     badDMSO_out_top_table <- compute_single_de(subset, treatment_samples, control_samples, method =  m, batch = batch)
-  } else{
-    badDMSO_out_top_table <- limma.trend(data = subset, treatment_samples = treatment_samples, control_samples = control_samples)
-  }
- 
-  # plot(plot_volcano(top_table, max.overlaps = 5))
-  badDMSO_out_degs <- badDMSO_out_top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull()
-  length(intersect(badDMSO_out_degs, drugseq_deg))
-  
-  badDMSO_out_top_table <- badDMSO_out_top_table %>%
-    arrange(p_value_adj, desc(log2FC)) %>%
-    mutate(gene = factor(gene, levels = unique(gene)))
-  # add a column if there are in the intersect(alldmso_degs, drugseq_deg)
-  badDMSO_out_top_table <- badDMSO_out_top_table %>%
-    mutate(in_drugseq_deg = ifelse(gene %in% intersect(badDMSO_out_degs, drugseq_deg), "yes", "no"))
-  
-  plt_ecdf <- ggplot(badDMSO_out_top_table, aes(x = p_value_adj, color = in_drugseq_deg)) +
-    stat_ecdf(size = 1) +
-    scale_x_continuous(trans = "log10", breaks = c(1e-6,1e-4,5e-2)) +
-    labs(x = "Adjusted p-value (log10 scale)", y = "ECDF",
-         color = "In DRUGseq DEGs") +
-    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
-    theme_classic()
-  
-  
-  ks_test_results <- ks.test(badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg=="yes"],
-          badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg=="no"], alternative = "greater")
-  
-  # ggplot(top_table, aes(x = log2FC, y = -log10(p_value_adj), color = in_drugseq_deg)) +
-  #   geom_point(alpha = 0.6, size = 1.2) +
-  #   geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
-  #   geom_vline(xintercept = c(-1, 1), linetype = "dashed") +
-  #   scale_color_manual(values = c("no"="#bdbdbd","yes"="#2b8cbe")) +
-  #   labs(x = "log2FC", y = expression(-log[10]("adj p")), color = "In DRUGseq DEGs") +
-  #   theme_classic()
-  # 
-  # label a few top overlapping genes
-  lab_genes <- badDMSO_out_top_table[badDMSO_out_top_table$in_drugseq_deg=="yes", ] |>
-    dplyr::arrange(p_value_adj, dplyr::desc(log2FC)) 
-  
-  volcano_overlap <- ggplot(badDMSO_out_top_table, aes(x = log2FC, y = -log10(p_value_adj), color = in_drugseq_deg)) +
-    geom_point(alpha = 0.6, size = 1.2) +
-    geom_text_repel(data = lab_genes, aes(label = gene), size = 3, max.overlaps = 10) +
-    scale_color_manual(values = c("no"="#bdbdbd","yes"="#2b8cbe"))+
-    theme_classic()
-  
-  ks <- c(25,50,100,200,500,1000)
-  prec_at_k <- sapply(ks, function(k)
-    mean(badDMSO_out_top_table$in_drugseq_deg[1:k] == "yes")
-  )
-  # plot(ks, prec_at_k, type = "b", xlab = "k (top-k by macpie)",
-  #      ylab = "Precision@k (fraction in DRUGseq set)")
-  
-  ks_plot <- ggplot(data.frame(k=ks, prec=prec_at_k), aes(x=k, y=prec)) +
-    geom_point() + geom_line() +
-    labs(x = "k (top-k by macpie)", y = "Precision@k (fraction in DRUGseq set)")+
-    theme_classic()
-  
-  # alluvial plot
-  macpie_tbl <- badDMSO_out_top_table %>% filter(p_value_adj < 0.05 & log2FC>0)
-  drugseq_tbl <- ff86_res_toptable %>% filter(p_value_adj < 0.05 & log2FC>0)
-  alluvial_plot <- plot_alluvial(macpie_tbl = macpie_tbl, drugseq_tbl = drugseq_tbl)
-  
-  #return 
-  result_list <- list(top_table = badDMSO_out_top_table,
-                      num_degs_macpie = length(badDMSO_out_degs),
-                      n_overlap = length(intersect(badDMSO_out_degs, drugseq_deg)),
-                      ecdf_plot = plt_ecdf,
-                      ks_test_results = ks_test_results,
-                      volcano_plot = volcano_overlap,
-                      ks_plot = ks_plot,
-                      alluvial_plot = alluvial_plot)
-  return(result_list)
-  
-})
-
## Processing method: limma_voom
-
## Warning in
-## ks.test.default(badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg
-## == : p-value will be approximate in the presence of ties
-
## Processing method: DESeq2
-
## converting counts to integer mode
-
## estimating size factors
-
## estimating dispersions
-
## gene-wise dispersion estimates
-
## mean-dispersion relationship
-
## final dispersion estimates
-
## fitting model and testing
-
## Warning in
-## ks.test.default(badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg
-## == : p-value will be approximate in the presence of ties
-
## Processing method: edgeR
-
## Warning in
-## ks.test.default(badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg
-## == : p-value will be approximate in the presence of ties
-
## Processing method: limma_trend
-
## Warning: Zero sample variances detected, have been offset away from zero
-## Warning: p-value will be approximate in the presence of ties
-
names(five_dmso_methods_res) <- methods
-
-

Summary table

-
#get a table to show number of DEGs and number of overlapping genes with DRUGseq for each method
-deg_summary <- map_df(five_dmso_methods_res, function(x) {
-  data.frame(
-    num_degs_macpie = x$num_degs_macpie,
-    n_overlap = x$n_overlap,
-    num_degs_DRUGseq = length(drugseq_deg)
-  )
-}, .id = paste0("macpie_methods"))
-
-deg_summary
-
##   macpie_methods num_degs_macpie n_overlap num_degs_DRUGseq
-## 1     limma_voom            3456       592             1423
-## 2         DESeq2            1107       549             1423
-## 3          edgeR            2185       690             1423
-## 4    limma_trend               5         2             1423
-
-
-

ECDF plot

-
#append top_table from each method into a single data frame
-good_dmso_top_table <- map_df(five_dmso_methods_res, "top_table", .id = "method")
-head(good_dmso_top_table)
-
##       method       gene   log2FC   metric      p_value  p_value_adj
-## 1 limma_voom AL031963.2 1.477162 34.73174 4.135992e-26 1.048226e-21
-## 2 limma_voom    SLC45A4 1.746060 25.83312 2.602120e-22 3.297406e-18
-## 3 limma_voom   Z97192.4 5.194480 22.09739 2.424226e-20 2.047986e-16
-## 4 limma_voom AP003068.2 1.691204 26.07754 6.263377e-20 3.968475e-16
-## 5 limma_voom AL583807.1 2.808626 22.32748 2.810497e-18 1.187154e-14
-## 6 limma_voom AC007066.1 1.833326 23.05883 2.624228e-18 1.187154e-14
-##   in_drugseq_deg
-## 1             no
-## 2            yes
-## 3            yes
-## 4             no
-## 5             no
-## 6            yes
-
#if in_drugseq_deg is "no", change method to "DRUGseq"
-good_dmso_top_table<- good_dmso_top_table %>%
-  mutate(method = ifelse(in_drugseq_deg == "no", "not_in_DRUGseq", method))
-
-#rename methods
-good_dmso_top_table$method <- recode(good_dmso_top_table$method,
-                                        "limma_voom" = "macpie:limma_voom",
-                                        "DESeq2" = "macpie:DESeq2",
-                                        "edgeR" = "macpie:edgeR",
-                                        "limma_trend" = "macpie:limma_trend")
-
-
-ggplot(good_dmso_top_table, aes(x = p_value_adj, color = method)) +
-    stat_ecdf(size = 1) +
-    scale_x_continuous(trans = "log10", breaks = c(1e-6,1e-4,5e-2)) +
-    labs(x = "Adjusted p-value (log10 scale)", y = "ECDF",
-         color = "In DRUGseq DEGs") +
-  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
-    theme_classic()
-
## Warning: Removed 12331 rows containing non-finite outside the scale range
-## (`stat_ecdf()`).
-

- - - - - - -
-
-

Overlapping volcano plot

-
five_dmso_methods_res$limma_voom$volcano_plot
-
## Warning: ggrepel: 575 unlabeled data points (too many overlaps). Consider
-## increasing max.overlaps
-

-
five_dmso_methods_res$DESeq2$volcano_plot
-
## Warning: Removed 12331 rows containing missing values or values outside the scale range
-## (`geom_point()`).
-
## Warning: ggrepel: 536 unlabeled data points (too many overlaps). Consider
-## increasing max.overlaps
-

-
five_dmso_methods_res$edgeR$volcano_plot
-
## Warning: ggrepel: 662 unlabeled data points (too many overlaps). Consider
-## increasing max.overlaps
-

-
five_dmso_methods_res$limma_trend$volcano_plot
-

- - - - - - - -
-
-

Alluvial plot

-
five_dmso_methods_res$limma_voom$alluvial_plot
-
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
-## params$discern): Some strata appear at multiple axes.
-## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
-## params$discern): Some strata appear at multiple axes.
-## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
-## params$discern): Some strata appear at multiple axes.
-

-
five_dmso_methods_res$DESeq2$alluvial_plot
-
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
-## params$discern): Some strata appear at multiple axes.
-## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
-## params$discern): Some strata appear at multiple axes.
-## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
-## params$discern): Some strata appear at multiple axes.
-

-
five_dmso_methods_res$edgeR$alluvial_plot
-
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
-## params$discern): Some strata appear at multiple axes.
-## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
-## params$discern): Some strata appear at multiple axes.
-## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
-## params$discern): Some strata appear at multiple axes.
-

-
five_dmso_methods_res$limma_trend$alluvial_plot
-
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
-## params$discern): Some strata appear at multiple axes.
-## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
-## params$discern): Some strata appear at multiple axes.
-## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
-## params$discern): Some strata appear at multiple axes.
-

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
-
-

Good DMSO - DRUGseq

-

From public DRUG-seq analysis pipeline, authors identified two -reference controls: all DMSO wells and the ‘good DMSO’ wells.

-

We know these good DMSO wells for batch 24 from their published -data:

-
    -
  • VH02012942: I23, M23

  • -
  • VH02012944: D23, H23

  • -
  • VH02012956: I23, J23

  • -
-
batch24 <- readRDS(paste0(dir,"DRUGseqData/Exp_batch24.Rds"))
-names(batch24)
-
## [1] "VH02012956" "VH02012942" "VH02012944"
-
#make a combined metadata for three plates
-batch24_metadata <- batch24 %>% 
-  map_dfr(~ {
-    .x$Annotation %>%
-      mutate(
-        Plate_ID        = plate_barcode,
-        Well_ID         = well_id,
-        Barcode         = paste0(plate_barcode, "_", well_index),
-        Row             = LETTERS[row],
-        Column          = as.integer(col),
-        Species         = "human",
-        Cell_type       = "U2OS",
-        Model_type      = "2D_adherent",
-        Time            = as.factor(hours_post_treatment),
-        Unit            = "h",
-        Treatment_1     = cmpd_sample_id,
-        Concentration_1 = as.numeric(concentration),
-        Unit_1          = unit,
-        Sample_type     = if_else(well_type == "SA" & col == 24,
-                                  "Positive Control",
-                                  well_type)
-      )
-  })
-
-
-batch24_metadata <- batch24_metadata%>%select(-c(batch_id, plate_barcode,plate_index, well_id,
-                                                 well_index, col, row, biosample_id, external_biosample_id,
-                                                 cmpd_sample_id, well_type, cell_line_name, cell_line_ncn, concentration, unit, hours_post_treatment, Sample))
-
# create a combined UMI matrix for 3 plates
-batch24_counts <- batch24 %>%
-  map(~ {
-    .x$UMI.counts %>%
-      as.data.frame() %>% 
-      rownames_to_column("gene_id") %>%
-      separate(col = gene_id, into = c("gene_name", "chrom"), sep = ",") %>%
-      mutate(gene_name = make.unique(gene_name)) %>%
-      select(-chrom) %>%
-      tibble::column_to_rownames(var = "gene_name") %>%
-      as.matrix()
-  })
-
-binded_counts <- do.call(cbind, batch24_counts)
-
as_mac <- CreateSeuratObject(counts = binded_counts, 
-  min.cells = 1, 
-  min.features = 1)
-
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
-## ('-')
-
## Warning: Data is of class matrix. Coercing to dgCMatrix.
-
as_mac<- as_mac%>% inner_join(batch24_metadata, by = c(".cell"="Barcode"))
-

Filtering steps

-

Include only good DMSO wells as controls

-
as_mac$combined_id <- paste0(as_mac$Treatment_1,"_", as_mac$Concentration_1)
-
-badDMSO <- as_mac@meta.data %>% filter(Treatment_1 == "CB-43-EP73") %>% 
- filter((Plate_ID == "VH02012942" & !(Well_ID %in% c("I23", "M23"))) |
-        (Plate_ID == "VH02012944" & !(Well_ID %in% c("D23", "H23")))|
-        (Plate_ID == "VH02012956" & !  (Well_ID %in% c("I23", "J23"))))
-
-
-
-keep_wells <- setdiff(rownames(as_mac@meta.data), rownames(badDMSO))
-
-
-mac_badDSMOremoved <- as_mac[,keep_wells]
-
-mac_badDSMOremoved$combined_id <- str_replace_all(mac_badDSMOremoved$combined_id, "-","_")
-
min_sample_num <- min(table(mac_badDSMOremoved$combined_id))
-mac_badDSMOremoved <- filter_genes_by_expression(mac_badDSMOremoved,
-                                           group_by = "combined_id", min_counts =10,
-                                           min_samples = min_sample_num)
-
mac_badDSMOremoved[["percent.mt"]] <- PercentageFeatureSet(mac_badDSMOremoved, pattern = "^mt-|^MT-")
-mac_badDSMOremoved[["percent.ribo"]] <- PercentageFeatureSet(mac_badDSMOremoved, pattern = "^Rp[slp][[:digit:]]|^Rpsa|^RP[SLP][[:digit:]]|^RPSA")
-
methods <- c("limma_voom", "DESeq2", "edgeR", "limma_trend")
-
-good_dmso_methods_res <- lapply(methods, function(m){
-  message("Processing method: ", m) 
-  # m<-"limma_voom"
-  treatment_samples <- "FF_86_NH56_10"
-  control_samples <- "CB_43_EP73_0"
-  subset <- mac_badDSMOremoved%>%filter(combined_id%in%c(treatment_samples,control_samples))
-
-  batch <- subset$orig.ident
-  if (m!="limma_trend"){
-      # subset <- filter_genes_by_expression(subset,
-      #                        group_by = "combined_id", min_counts = 10,
-      #                        min_samples = 1)
-     badDMSO_out_top_table <- compute_single_de(subset, treatment_samples, control_samples, method =  m, batch = batch)
-  } else{
-    badDMSO_out_top_table <- limma.trend(data = subset, treatment_samples = treatment_samples, control_samples = control_samples)
-  }
- 
-  # plot(plot_volcano(top_table, max.overlaps = 5))
-  badDMSO_out_degs <- badDMSO_out_top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull()
-  length(intersect(badDMSO_out_degs, drugseq_deg))
-  
-  badDMSO_out_top_table <- badDMSO_out_top_table %>%
-    arrange(p_value_adj, desc(log2FC)) %>%
-    mutate(gene = factor(gene, levels = unique(gene)))
-  # add a column if there are in the intersect(alldmso_degs, drugseq_deg)
-  badDMSO_out_top_table <- badDMSO_out_top_table %>%
-    mutate(in_drugseq_deg = ifelse(gene %in% intersect(badDMSO_out_degs, drugseq_deg), "yes", "no"))
-  
-  plt_ecdf <- ggplot(badDMSO_out_top_table, aes(x = p_value_adj, color = in_drugseq_deg)) +
-    stat_ecdf(size = 1) +
-    scale_x_continuous(trans = "log10", breaks = c(1e-6,1e-4,5e-2)) +
-    labs(x = "Adjusted p-value (log10 scale)", y = "ECDF",
-         color = "In DRUGseq DEGs") +
-    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
-    theme_classic()
-  
-  
-  ks_test_results <- ks.test(badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg=="yes"],
-          badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg=="no"], alternative = "greater")
-  
-  # ggplot(top_table, aes(x = log2FC, y = -log10(p_value_adj), color = in_drugseq_deg)) +
-  #   geom_point(alpha = 0.6, size = 1.2) +
-  #   geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
-  #   geom_vline(xintercept = c(-1, 1), linetype = "dashed") +
-  #   scale_color_manual(values = c("no"="#bdbdbd","yes"="#2b8cbe")) +
-  #   labs(x = "log2FC", y = expression(-log[10]("adj p")), color = "In DRUGseq DEGs") +
-  #   theme_classic()
-  # 
-  # label a few top overlapping genes
-  lab_genes <- badDMSO_out_top_table[badDMSO_out_top_table$in_drugseq_deg=="yes", ] |>
-    dplyr::arrange(p_value_adj, dplyr::desc(log2FC)) 
-  
-  volcano_overlap <- ggplot(badDMSO_out_top_table, aes(x = log2FC, y = -log10(p_value_adj), color = in_drugseq_deg)) +
-    geom_point(alpha = 0.6, size = 1.2) +
-    geom_text_repel(data = lab_genes, aes(label = gene), size = 3, max.overlaps = 10) +
-    scale_color_manual(values = c("no"="#bdbdbd","yes"="#2b8cbe"))+
-    theme_classic()
-  
-  ks <- c(25,50,100,200,500,1000)
-  prec_at_k <- sapply(ks, function(k)
-    mean(badDMSO_out_top_table$in_drugseq_deg[1:k] == "yes")
-  )
-  # plot(ks, prec_at_k, type = "b", xlab = "k (top-k by macpie)",
-  #      ylab = "Precision@k (fraction in DRUGseq set)")
-  
-  ks_plot <- ggplot(data.frame(k=ks, prec=prec_at_k), aes(x=k, y=prec)) +
-    geom_point() + geom_line() +
-    labs(x = "k (top-k by macpie)", y = "Precision@k (fraction in DRUGseq set)")+
-    theme_classic()
-  
-  # alluvial plot
-  macpie_tbl <- badDMSO_out_top_table %>% filter(p_value_adj < 0.05 & log2FC>0)
-  drugseq_tbl <- ff86_res_toptable %>% filter(p_value_adj < 0.05 & log2FC>0)
-  alluvial_plot <- plot_alluvial(macpie_tbl = macpie_tbl, drugseq_tbl = drugseq_tbl)
-  
-  #return 
-  result_list <- list(top_table = badDMSO_out_top_table,
-                      num_degs_macpie = length(badDMSO_out_degs),
-                      n_overlap = length(intersect(badDMSO_out_degs, drugseq_deg)),
-                      ecdf_plot = plt_ecdf,
-                      ks_test_results = ks_test_results,
-                      volcano_plot = volcano_overlap,
-                      ks_plot = ks_plot,
-                      alluvial_plot = alluvial_plot)
-  return(result_list)
-  
-})
-
## Processing method: limma_voom
-
## Warning in
-## ks.test.default(badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg
-## == : p-value will be approximate in the presence of ties
-
## Processing method: DESeq2
-
## converting counts to integer mode
-
## estimating size factors
-
## estimating dispersions
-
## gene-wise dispersion estimates
-
## mean-dispersion relationship
-
## final dispersion estimates
-
## fitting model and testing
-
## Warning in
-## ks.test.default(badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg
-## == : p-value will be approximate in the presence of ties
-
## Processing method: edgeR
-
## Warning in
-## ks.test.default(badDMSO_out_top_table$p_value_adj[badDMSO_out_top_table$in_drugseq_deg
-## == : p-value will be approximate in the presence of ties
-
## Processing method: limma_trend
-
## Warning: Zero sample variances detected, have been offset away from zero
-## Warning: p-value will be approximate in the presence of ties
-
names(good_dmso_methods_res) <- methods
-
-

Summary table

-
#get a table to show number of DEGs and number of overlapping genes with DRUGseq for each method
-deg_summary <- map_df(good_dmso_methods_res, function(x) {
-  data.frame(
-    num_degs_macpie = x$num_degs_macpie,
-    n_overlap = x$n_overlap,
-    num_degs_DRUGseq = length(drugseq_deg)
-  )
-}, .id = paste0("macpie_methods"))
-
-deg_summary
-
##   macpie_methods num_degs_macpie n_overlap num_degs_DRUGseq
-## 1     limma_voom            1352       204             1423
-## 2         DESeq2              10         8             1423
-## 3          edgeR            1757       604             1423
-## 4    limma_trend               2         1             1423
-
-
-

ECDF plot

-
#append top_table from each method into a single data frame
-good_dmso_top_table <- map_df(good_dmso_methods_res, "top_table", .id = "method")
-head(good_dmso_top_table)
-
##       method       gene   log2FC   metric       p_value   p_value_adj
-## 1 limma_voom AL031963.2 1.450009 52.04866  0.000000e+00  0.000000e+00
-## 2 limma_voom  RNU6-633P 1.254947 72.59594  0.000000e+00  0.000000e+00
-## 3 limma_voom    ELOCP10 1.269453 35.96512 2.086923e-279 1.762685e-275
-## 4 limma_voom AP003068.2 1.677015 35.65408 1.072200e-274 6.792120e-271
-## 5 limma_voom AC007066.1 1.821885 32.63913 4.558729e-220 2.310273e-216
-## 6 limma_voom    SLC45A4 1.745313 31.67705 6.854635e-218 2.894826e-214
-##   in_drugseq_deg
-## 1             no
-## 2             no
-## 3             no
-## 4             no
-## 5            yes
-## 6            yes
-
#if in_drugseq_deg is "no", change method to "DRUGseq"
-good_dmso_top_table<- good_dmso_top_table %>%
-  mutate(method = ifelse(in_drugseq_deg == "no", "not_in_DRUGseq", method))
-
-#rename methods
-good_dmso_top_table$method <- recode(good_dmso_top_table$method,
-                                        "limma_voom" = "macpie:limma_voom",
-                                        "DESeq2" = "macpie:DESeq2",
-                                        "edgeR" = "macpie:edgeR",
-                                        "limma_trend" = "macpie:limma_trend")
-
-
-ggplot(good_dmso_top_table, aes(x = p_value_adj, color = method)) +
-    stat_ecdf(size = 1) +
-    scale_x_continuous(trans = "log10", breaks = c(1e-6,1e-4,5e-2)) +
-    labs(x = "Adjusted p-value (log10 scale)", y = "ECDF",
-         color = "In DRUGseq DEGs") +
-  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
-    theme_classic()
-
## Warning in scale_x_continuous(trans = "log10", breaks = c(1e-06, 1e-04, :
-## log-10 transformation introduced infinite values.
-
## Warning: Removed 19707 rows containing non-finite outside the scale range
-## (`stat_ecdf()`).
-

-
good_dmso_methods_res$limma_voom$ecdf_plot
-
## Warning in scale_x_continuous(trans = "log10", breaks = c(1e-06, 1e-04, :
-## log-10 transformation introduced infinite values.
-
## Warning: Removed 2 rows containing non-finite outside the scale range
-## (`stat_ecdf()`).
-

-
good_dmso_methods_res$DESeq2$ecdf_plot
-
## Warning: Removed 19705 rows containing non-finite outside the scale range
-## (`stat_ecdf()`).
-

-
good_dmso_methods_res$edgeR$ecdf_plot
-

-
good_dmso_methods_res$limma_trend$ecdf_plot
-

-### Overlapping volcano plot

-
good_dmso_methods_res$limma_voom$volcano_plot
-
## Warning: ggrepel: 196 unlabeled data points (too many overlaps). Consider
-## increasing max.overlaps
-

-
good_dmso_methods_res$DESeq2$volcano_plot
-
## Warning: Removed 19705 rows containing missing values or values outside the scale range
-## (`geom_point()`).
-

-
good_dmso_methods_res$edgeR$volcano_plot
-
## Warning: ggrepel: 583 unlabeled data points (too many overlaps). Consider
-## increasing max.overlaps
-

-
good_dmso_methods_res$limma_trend$volcano_plot
-

- - - - - - - -
-
-

Alluvial plot

-
good_dmso_methods_res$limma_voom$alluvial_plot
-
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
-## params$discern): Some strata appear at multiple axes.
-## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
-## params$discern): Some strata appear at multiple axes.
-## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
-## params$discern): Some strata appear at multiple axes.
-

-
good_dmso_methods_res$DESeq2$alluvial_plot
-
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
-## params$discern): Some strata appear at multiple axes.
-## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
-## params$discern): Some strata appear at multiple axes.
-## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
-## params$discern): Some strata appear at multiple axes.
-

-
good_dmso_methods_res$edgeR$alluvial_plot
-
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
-## params$discern): Some strata appear at multiple axes.
-## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
-## params$discern): Some strata appear at multiple axes.
-## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
-## params$discern): Some strata appear at multiple axes.
-

-
good_dmso_methods_res$limma_trend$alluvial_plot
-
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
-## params$discern): Some strata appear at multiple axes.
-## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
-## params$discern): Some strata appear at multiple axes.
-## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
-## params$discern): Some strata appear at multiple axes.
-

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
-
-

Comparison

-

To compare DEGs with different replicate numbers and different -methods

-
methods <- c("limma_voom", "DESeq2", "edgeR", "limma_trend")
-
-get_jaccard <- function(deg_set, drugseq_deg){
-  intersection <- length(intersect(deg_set, drugseq_deg))
-  union <- length(union(deg_set, drugseq_deg))
-  jaccard_index <- intersection / union
-  return(jaccard_index)
-}
-
-jaccard_index <- lapply(methods, function(m){
-  # all dmso
-  degs <- methods_res[[m]]$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull()
-  jaccard_all <- get_jaccard(degs, drugseq_deg)
-  # five dmso
-  degs <- five_dmso_methods_res[[m]]$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull()
-  jaccard_five <- get_jaccard(degs, drugseq_deg)
-  # three dmso
-  degs <- good_dmso_methods_res[[m]]$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull()
-  jaccard_three <- get_jaccard(degs, drugseq_deg)
-  jaccard_index <- data.frame(
-    method = m,
-    jaccard_all = jaccard_all,
-    jaccard_five = jaccard_five,
-    jaccard_three = jaccard_three
-  )
-  return(jaccard_index)
-})
-
-df <- as.data.frame(do.call(rbind, jaccard_index))
-rownames(df) <- df$method
-df <- df %>% select(-method)
-colnames(df) <- c("All DMSO", "macpie: 15 DMSO", "DRUGseq: 6 DMSO")
-pheatmap::pheatmap(df,
-         cluster_rows = FALSE,
-         cluster_cols = FALSE,
-         display_numbers = TRUE,
-         main = "Jaccard Index between macpie DEGs and DRUGseq DEGs")
-

-
library(UpSetR)
-all_dmso <- list(
-  limma_voom = methods_res$limma_voom$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
-  DESeq2 = methods_res$DESeq2$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
-  edgeR = methods_res$edgeR$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
-  limma_trend = methods_res$limma_trend$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
-  DRUGseq = drugseq_deg
-)
-upset(fromList(all_dmso), 
-      nsets = 5, 
-      order.by = "freq",
-      main.bar.color = "black",
-      sets.bar.color = "gray23",
-      text.scale = c(2, 2, 2, 1.5, 2, 1.5))
-

-
five_dmso <- list(
-  limma_voom = five_dmso_methods_res$limma_voom$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
-  DESeq2 = five_dmso_methods_res$DESeq2$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
-  edgeR = five_dmso_methods_res$edgeR$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
-  limma_trend = five_dmso_methods_res$limma_trend$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
-  DRUGseq = drugseq_deg
-)
-upset(fromList(five_dmso), 
-      nsets = 5, 
-      order.by = "freq",
-      main.bar.color = "black",
-      sets.bar.color = "gray23",
-      text.scale = c(2, 2, 2, 1.5, 2, 1.5))
-

-
good_dmso <- list(
-  limma_voom = good_dmso_methods_res$limma_voom$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
-  DESeq2 = good_dmso_methods_res$DESeq2$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
-  edgeR = good_dmso_methods_res$edgeR$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
-  limma_trend = good_dmso_methods_res$limma_trend$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(),
-  DRUGseq = drugseq_deg
-)
-upset(fromList(good_dmso), 
-      nsets = 5, 
-      order.by = "freq",
-      main.bar.color = "black",
-      sets.bar.color = "gray23",
-      text.scale = c(2, 2, 2, 1.5, 2, 1.5))
-

-
- - - -
-
- -
- - - - - - - - - - - - - - - - diff --git a/vignettes/DRUGseq_first500runs.Rmd b/vignettes/DRUGseq_first500runs.Rmd deleted file mode 100644 index 3878bba..0000000 --- a/vignettes/DRUGseq_first500runs.Rmd +++ /dev/null @@ -1,652 +0,0 @@ ---- -title: "DRUG-seq data analysis pipeline" -output: - html_document: - toc: true - toc_float: true ---- - -```{r} - -suppressPackageStartupMessages({ - library(dplyr); library(tidyr); library(limma); library(edgeR) -}) -``` - - -```{r} -library(stringr) -library(DT) -library(tidyr) -library(dplyr) -library(foreach) -library(limma) -library(edgeR) -library(umap) -library(data.table) -library(ggplot2) -library(GGally) -library(plotly) -library(ggrepel) -library(viridis) -library(RANN) -library(igraph) -library(RColorBrewer) -library(patchwork) -``` - -# DRUGseq data analysis pipeline - DMSO selection - -## data path setup -```{r setup,include=FALSE} -dir <- "/Users/liuxin/macpie_Dev/" -setwd(paste0(dir,"/DRUGseqData/DRUG-seq-main")) -analysis_dir <- getwd() -# dataset <- "flowcell_4000_UMI_decode_VH02001612.RData" -input_data_dir <- file.path(analysis_dir,'data') -knitr::opts_knit$set(root.dir = analysis_dir) -out_data_dir <- file.path(analysis_dir,'out') -if(!dir.exists(out_data_dir)) {dir.create(out_data_dir)} -``` - -**project folder:** -`r analysis_dir` - - -## 1.2 functions -```{r limma toptable} -get_union_toptable <- function(fit3) { - contrast_names <- colnames(coef(fit3)) - n_contrasts <- length(contrast_names) - - tt <- foreach(i=1:n_contrasts, .combine=rbind) %do% - { topTable(fit3, coef = i, - p.value = 1, number = Inf) %>% - mutate(GeneId=rownames(.), contrast=colnames(coef(fit3))[i]) } - - tt$contrast <- factor(tt$contrast, levels=contrast_names) - return(tt) -} -``` - - -```{r DE.limma trend} -DE.limma_trend<-function(Exp=Exp,CTRL,meta,best_DMSO=NULL,random_run=FALSE){ - if(!is.null(best_DMSO)){ - print('Starting DE calculations using best DMSO.') - }else{print('Starting DE calculations.') - } - - de<-lapply(1:length(Exp), function(i){ - b<-names(Exp)[i] - meta_sub<- meta %>% filter(batch==b) %>% mutate(plate_barcode_short = substr(plate_barcode, nchar(plate_barcode)-2, nchar(plate_barcode))) - if(!is.null(best_DMSO)) { - meta_sub <- meta_sub %>% filter(treatment!=CTRL|barcode_well %in% best_DMSO$barcode_well) #only use best DMSO as control - } - - rownames(meta_sub)<-meta_sub$plate_id - UMI_batch<-lapply(1:length(Exp[[i]]), function(j){ - #print(paste0('i ',i,' / j ',j)) - x<-Exp[[i]][[j]] - mat<-x$UMI.counts - mat<-mat[!rownames(mat) %in% grep("ERCC-",rownames(mat),value = T),] - mat<-mat[order(row.names(mat)),] - return(mat) - }) - # active checking that the objects in the list have exactly the same names - unames <- unique(unlist(lapply(UMI_batch, rownames))) - stopifnot( all( sapply(UMI_batch, function(x) identical(rownames(x), unames)) ) ) - UMI_batch<-do.call(cbind,UMI_batch) - - - UMI_batch<-UMI_batch[,meta_sub$plate_id] - clean<-UMI_batch[apply(UMI_batch,1,max)>quantile(apply(UMI_batch,1,max),0.75),] - clean<-clean[,colSums(clean)>0] - - d<-DGEList(counts=clean) - d <- calcNormFactors(d, method="TMM") - clean.TMM<-log2(edgeR::cpm(d, normalized.lib.sizes=T,log=F)+1) - - meta_sub<-meta_sub[colnames(clean.TMM),] - meta_sub$Sample<-sapply(meta_sub$Sample, function(k) gsub("-",".",k)) - - - - design<-model.matrix(~0+factor(meta_sub$Sample)+factor(meta_sub$replicate)) - colnames(design)<-c(levels(factor(meta_sub$Sample)),levels(factor(meta_sub$replicate))[-1]) - rownames(design)<-colnames(clean) - samples<-unique(meta_sub$Sample) - ctrl<-grep(CTRL,samples,value = T) - #print(ctrl) - samples<-samples[!samples %in% ctrl] - - contrasts<-sapply(samples, function(ctrs)paste0(ctrs,"-",ctrl)) - contrast.matrix<-limma::makeContrasts(contrasts=contrasts, levels = design) - - fit = limma::lmFit(clean.TMM, design) - fit2 = limma::contrasts.fit(fit,contrast.matrix) - fit3 = limma::eBayes(fit2, trend = T) - - DE.table <- get_union_toptable(fit3) %>% - separate(contrast,c('contrast','ctrl_arm'),sep="-") %>% - select(-ctrl_arm) %>% - merge(meta_sub %>% - filter(!treatment ==CTRL) %>% - select(plate_barcode_short,treatment,treatment_dose,timepoint,Sample,plate_well,batch) %>% - distinct() %>% - group_by(treatment,treatment_dose,timepoint,Sample,batch) %>% - summarise(plate_well = paste(paste(plate_barcode_short,plate_well,sep = " "), collapse=",")), - sort=FALSE, by.x='contrast',by.y='Sample') %>% mutate(Sample=contrast) # combine metadata with toptable - - }) - if (random_run==TRUE){ - de<-do.call(rbind,de) %>% - filter(Sample=="RC_to_SA") %>% - group_by(batch,plate_well) %>% - filter(abs(logFC)>1,adj.P.Val<0.1) %>% - summarise(count=n()) %>% - mutate(run_no=n,contrast='RC_to_SA') %>% - ungroup() - }else{ - de<-do.call(rbind,de) - } - return(de) -} -``` - -```{r volcano plot} -vpPlot_DElabel <- function(this_contrast_toptabel,batch_select=1,this_arms){ - - pcut<-0.1 - #color= black,red, blue - # 0 1 2 - color_custum<-c(rgb(0,0,0,0.5),rgb(1,0,0,1),rgb(0,0,1,1)) - - this_contrast_toptabel <- this_contrast_toptabel %>% - filter(treatment %in% this_arms,batch==batch_select) %>% - mutate(label=0) - - - - high_concentration<-this_contrast_toptabel %>% select(treatment_dose) %>% max() - - this_contrast_toptabel_plot<-NULL - - for (SampleName_lib_temp in this_arms) { - upregulate_gene <- this_contrast_toptabel %>% - filter(treatment == SampleName_lib_temp) %>% - filter(treatment_dose==high_concentration, adj.P.Val1) %>% - select(GeneId) %>% - .$GeneId - downregulate_gene <-this_contrast_toptabel %>% - filter(treatment == SampleName_lib_temp) %>% - filter(treatment_dose==high_concentration, adj.P.Val% - select(GeneId) %>% - .$GeneId - - this_contrast_toptabel_plot <- rbind(this_contrast_toptabel_plot,this_contrast_toptabel - %>% filter(treatment==SampleName_lib_temp) %>% - mutate(label=ifelse(GeneId %in% upregulate_gene,1,label)) %>% - mutate(label=ifelse(GeneId %in% downregulate_gene,2,label))) - } - - this_contrast_toptabel_plot$label<-as.factor(this_contrast_toptabel_plot$label) - levels(this_contrast_toptabel_plot$label)<-c("not significant ","upregulated genes","downregulated genes") - this_contrast_toptabel_plot$treatment<-factor(this_contrast_toptabel_plot$treatment) - - - - vp<-this_contrast_toptabel_plot %>% arrange(label) %>% - ggplot(aes(x=logFC, y=-log10(adj.P.Val), color=label))+ - scale_color_manual(values = color_custum)+ - geom_point(fill="white",shape=21, size=1, stroke=0.5) + - geom_abline(slope = 0, intercept = -log10(pcut), size=0.3, linetype='dashed') + - geom_vline(xintercept = -1, size=0.3, linetype='dashed') + - geom_vline(xintercept = 1, size=0.3, linetype='dashed') + - xlim(-8,8)+ - theme_bw() + - theme(axis.text.x = element_text(face="bold",size=14), - axis.text.y = element_text(face="bold", size=10), - axis.title=element_text(size=14,face="bold"), - legend.text=element_text(size=12) ) + - facet_grid(treatment~treatment_dose)+ - theme(strip.text = element_text(face="bold", size=12)) - - return(vp) - } -``` - -```{r VennDiagram} -## works up to 3 groups comparison now -VD_plot <- function(this_contrast_toptable,batch_select=1){ - this_contrast_toptable_plot <- this_contrast_toptable %>% - mutate(sig=ifelse(abs(logFC)>1 & adj.P.Val<0.1,-1,0)) %>% - mutate(sig=ifelse(logFC>1 & adj.P.Val<0.1,1,sig)) %>% - mutate(contrast=paste0(treatment_dose,'uM')) %>% - select(contrast,GeneId,sig) %>% - reshape2::dcast(GeneId~contrast,value.var="sig") - - - rownames(this_contrast_toptable_plot) <- this_contrast_toptable_plot$GeneId - return(this_contrast_toptable_plot %>% select(-GeneId) %>% vennDiagram(circle.col=c("turquoise", "salmon","palegreen"),cex=1)) - -} -``` - - -```{r UMAP} -pca_umap <- function(counts=NULL,center_pca=T,metric="euclidean", scale_pca=T,n_neighbors=15,n_pca=20){ - - pca<-prcomp(t(counts), scale = scale_pca, center = center_pca) - rownames(pca$x) <- colnames(counts) - umap <- umap(pca$x[,1:n_pca],method = "umap-learn",metric=metric, n_neighbors=n_neighbors) - rownames(umap$layout) <- colnames(counts) - return(umap) - - -} - -``` - -```{r customized pair plot} -ggpairs_custom<-function(toptable_meta,this_arms,p_cut,corr_size=3,batch_s=1,strip_size=NULL,label_size=NULL,exclusive_gene=NULL){ - - toptable_meta=toptable_meta %>% mutate(contrast=paste(treatment,treatment_dose,timepoint,sep="_")) - this_contrast=toptable_meta %>% - filter(Sample %in% this_arms,batch==batch_s) %>% - arrange(treatment_dose) %>% - select(contrast) %>% - distinct() %>%.$contrast - p_cut <- p_cut - - -if (is.null(exclusive_gene)) { -valid_genes <- toptable_meta %>% - filter(Sample %in% this_arms) %>% - filter(batch==batch_s) %>% - filter(adj.P.Val% - .$GeneId %>% - unique() -} else { - - valid_genes <- toptable_meta %>% - filter(Sample %in% this_arms) %>% - filter(batch==batch_s) %>% - filter(adj.P.Val% - .$GeneId %>% - unique() %>% - setdiff(exclusive_gene) - -} - -table_to_plot1<-toptable_meta %>% - filter(Sample %in% this_arms) %>% - filter(batch==batch_s) %>% - filter(GeneId %in% valid_genes) %>% - dplyr::select(GeneId, contrast, logFC) %>% - setDT %>% - data.table::dcast(GeneId ~ contrast, value.var = 'logFC') %>% - as.data.frame() - -table_to_plot1<-table_to_plot1[,c("GeneId",this_contrast)] -rownames(table_to_plot1)<-table_to_plot1$GeneId - -table_to_plot2<-toptable_meta %>% - filter(Sample %in% this_arms) %>% - filter(batch==batch_s) %>% - filter(adj.P.Val% - dplyr::select(GeneId, contrast, logFC) %>% - setDT %>% - data.table::dcast(GeneId ~ contrast, value.var = 'logFC') %>% - as.data.frame() - -missing_col<-setdiff(colnames(table_to_plot1),colnames(table_to_plot2)) -if (!identical(missing_col, character(0))) { - missing_mat<-data.frame(matrix(0, nrow = nrow(table_to_plot2), ncol =as.numeric( length(missing_col)))) - missing_mat[missing_mat==0]=NA - colnames(missing_mat)<-missing_col - table_to_plot2<-cbind(table_to_plot2,missing_mat) -} - -table_to_plot2<-table_to_plot2[,c("GeneId",this_contrast)] - -data_plot<-inner_join(table_to_plot1,table_to_plot2,by="GeneId") - - - - rownames(data_plot)<-data_plot$GeneId - data_plot<-data_plot %>% select(-"GeneId") - - axis_limt<-max(abs(data_plot),na.rm = TRUE) - - diagfun <- function(data,mapping){ - ggplot(data = data, mapping = mapping)+ - geom_density(color="darkblue", fill="lightblue") + - scale_x_continuous(limits = c(-axis_limt, axis_limt)) - # scale_y_continuous(limits = c(-axis_limt, axis_limt)) - } - - corr_heatmap <- function(data, mapping, method="p", use="pairwise", ...){ - - # grab data - x <- eval_data_col(data, mapping$x) - y <- eval_data_col(data, mapping$y) - - # calculate correlation - corr <- cor(x, y, method=method, use=use) - colFn <- colorRampPalette(c("blue", "white", "red"), interpolate ='spline') - fill <- colFn(100)[findInterval(corr, seq(-1, 1, length=100))] - ggally_cor(data = data, mapping = mapping, ...) + - theme_classic()+ - theme(panel.background = element_rect(fill=fill,colour = fill)) - } - - - pm<-ggpairs(data_plot,c(1:as.numeric(length(colnames(data_plot))/2)),columnLabels=colnames(table_to_plot1)[-1],diag=list(continuous = wrap(diagfun)), - upper = list(continuous = corr_heatmap),progress = F)+theme(axis.text = element_text(size = label_size),strip.placement = "outside", - text = element_text(size = strip_size) ) - - method<-"pearson" - pm2 <- pm - color_custum<-c(rgb(0,0,0,0.5),rgb(0,1,0,0.7),rgb(0,0,1,0.7),rgb(1,0,0,1)) - label_list<-c('not significant','x axis DE genes','y axis DE genes','shared DE genes') - for(i in 2:pm$nrow) { - for(j in 1:(i-1)) { - - x <- GGally::eval_data_col(pm[i,j]$data, pm[i,j]$mapping$x) - y <- GGally::eval_data_col(pm[i,j]$data, pm[i,j]$mapping$y) - corr<- cor(x,y,method=method,use="na.or.complete") - col_name_x_x<-paste0(gsub("\\.[^\\.]*$","",pm[i,j]$labels$x),".x") - col_name_y_x<-paste0(gsub("\\.[^\\.]*$","",pm[i,j]$labels$y),".x") - col_name_x_y<-paste0(gsub("\\.[^\\.]*$","",pm[i,j]$labels$x),".y") - col_name_y_y<-paste0(gsub("\\.[^\\.]*$","",pm[i,j]$labels$y),".y") - col_name<-c(col_name_x_x,col_name_y_x,col_name_x_y,col_name_y_y) - data_sub<-pm$data[,col_name] - - data_sub<-data_sub %>% - mutate(label = ifelse(!is.na(!!sym(col_name_x_y)) & - abs(!!sym(col_name_x_y))>1,1,0)) %>% - mutate(label = ifelse(!is.na(!!sym(col_name_y_y)) & - abs(!!sym(col_name_y_y))>1,2,label)) %>% - mutate(label = ifelse(!is.na(!!sym(col_name_x_y)) & - abs(!!sym(col_name_x_y))>1 & - !is.na(!!sym(col_name_y_y))& - abs(!!sym(col_name_y_y))>1 ,3,label)) %>% - arrange(label) - - unique_label<-sort(unique(data_sub$label)) - data_sub$label<-as.factor(data_sub$label) - levels(data_sub$label)<-label_list[unique_label+1] - - p<-ggplot(data_sub,aes(x=!!sym(pm[i,j]$labels$x),y=!!sym(pm[i,j]$labels$y),color=label))+ - geom_point(size=2,shape=21,stroke=0.5,fill="white")+theme_bw()+scale_color_manual(values = color_custum[unique_label+1])+ scale_shape(solid = TRUE) + - scale_x_continuous(limits = c(-axis_limt, axis_limt)) + - scale_y_continuous(limits = c(-axis_limt, axis_limt)) + - geom_abline(slope = 0, intercept = -1, size=0.3, linetype='dashed') + - geom_abline(slope = 0, intercept = 1, size=0.3, linetype='dashed') + - geom_vline(xintercept = -1, size=0.3, linetype='dashed') + - geom_vline(xintercept = 1, size=0.3, linetype='dashed') - pm2[i,j]=p+geom_label(data = data.frame(xlabel = -axis_limt, - ylabel = max(axis_limt, na.rm = TRUE), - lab = round(corr, digits = 3)), - mapping = ggplot2::aes(x = xlabel, y = ylabel, label = lab), - hjust = 0, vjust = 1,size = corr_size, fontface = "bold", - inherit.aes = FALSE # do not inherit anything from the ... - ) - - - } - } - return(pm2) -} - - - - -``` - - -### UMI count table preparation -```{r count matrix,message = FALSE} -meta <- read.csv(file.path(input_data_dir,'meta.csv'),stringsAsFactors = F) %>% - filter(plate_barcode %in% c("VH02001612", "VH02001614", "VH02001618")) -Exp<-lapply(unique(meta$batch), function(x){ - b<-unique(meta[meta$batch==x,]$plate_barcode) - res<-lapply(b, function(y){ - load(file.path(input_data_dir, paste0("flowcell_4000_UMI_decode_",y,".RData")),verbose=T) - annot<-meta[meta$plate_barcode==y,] - rownames(annot) <- apply( annot[ , c('plate_barcode','well_index') ] , 1 , paste , collapse = "_" ) - UMI_decode<-UMI_decode[!rownames(UMI_decode) %in% grep("ERCC-",rownames(UMI_decode),value = T),] - UMI_annot<-list(UMI_decode,annot) - names(UMI_annot)<-c("UMI.counts", "Annotation") - return(UMI_annot) - }) - names(res)<-b - return(res) -}) -names(Exp)<-unique(meta$batch) - -save(Exp, file=file.path(out_data_dir,"Exp_Init.RData"),compress = TRUE) - -``` -```{r} -check_threads <- function() { - cat("Hardware cores (logical/physical):", - parallel::detectCores(TRUE), "/", parallel::detectCores(FALSE), "\n") - cat("foreach backend/workers: ", - foreach::getDoParName(), "/", foreach::getDoParWorkers(), "\n", sep = "") - if (requireNamespace("future", quietly = TRUE)) { - cat("future plan/workers: ", capture.output(future::plan()), "\n", sep = "") - cat("future nbrOfWorkers: ", future::nbrOfWorkers(), "\n", sep = "") - } - if (requireNamespace("data.table", quietly = TRUE)) { - cat("data.table threads: ", data.table::getDTthreads(), "\n", sep = "") - } - if (requireNamespace("RhpcBLASctl", quietly = TRUE)) { - cat("BLAS threads: ", RhpcBLASctl::blas_get_num_procs(), "\n", sep = "") - cat("OMP max threads: ", RhpcBLASctl::omp_get_max_threads(), "\n", sep = "") - } else { - envs <- Sys.getenv(c("OPENBLAS_NUM_THREADS","MKL_NUM_THREADS","OMP_NUM_THREADS","VECLIB_MAXIMUM_THREADS")) - cat("BLAS/OMP env vars:\n"); print(envs[envs != ""]) - } -} -check_threads() - -``` - - -# 2 Step1 -## 2.1 run first 500 random run -```{r run using foreach, message=FALSE} - -step1_func <- function(n){ - - CTRL="DMSO" - rand_runs=TRUE - - meta <- read.csv(file.path(input_data_dir,'meta.csv'),stringsAsFactors = F) %>% - mutate(replicate=paste0('rep',plate_replicate),plate_id=paste0(plate_barcode,"_",well_index)) %>% - filter(plate_barcode %in% c("VH02001612", "VH02001614", "VH02001618")) - load(file.path(out_data_dir, 'Exp_Init.RData'),verbose = T) - - DMSO_sample <- meta %>% filter(treatment==CTRL) %>% select(batch,plate_barcode,plate_well) %>% - group_by(plate_barcode) %>% - do(sample_n(.,1)) %>% - mutate(Sample="RC_to_SA",treatment="RC_to_SA") - - - meta <- meta %>% left_join(DMSO_sample,by=c('batch','plate_barcode','plate_well'),suffix=c("","_randR")) %>% - mutate(Sample=ifelse(is.na(Sample_randR),Sample,Sample_randR)) %>% - mutate(treatment=ifelse(is.na(treatment_randR),treatment,treatment_randR)) - - DE<-DE.limma_trend(Exp=Exp,CTRL=CTRL,meta=meta,random_run =rand_runs) -} -# DE_count_500 <- foreach(n=1:10,.packages=c("dplyr","limma","tidyr","foreach","edgeR"),.combine=rbind) %do% step1_func(n) - -``` - -```{r, message=FALSE, warning=FALSE,results='hide'} -options(dplyr.summarise.inform = FALSE) -# ---- helper to pretty print seconds ---- -fmt_secs <- function(x) { - if (is.na(x)) return("NA") - if (x < 60) return(sprintf("%.2fs", x)) - if (x < 3600) return(sprintf("%dm %.1fs", floor(x/60), x %% 60)) - sprintf("%dh %dm %.0fs", floor(x/3600), floor((x%%3600)/60), x%%60) -} - -iters <- 1:50 - -# ---- total wall time + per-iteration timings ---- -# total_tm <- system.time({ -# # return a list of {data, iter, seconds} per iteration -# lst <- foreach( -# n = iters, -# .packages = c("dplyr","limma","tidyr","foreach","edgeR"), -# .combine = "c" # concatenate list-of-lists -# ) %do% { -# t0 <- proc.time()[["elapsed"]] -# res <- step1_func(n) -# dt <- proc.time()[["elapsed"]] - t0 -# list(list(data = res, iter = n, seconds = dt)) -# } -# }) - -# extract results and timings -# iter_times <- vapply(lst, function(x) x$seconds, numeric(1)) -# DE_count_500 <- dplyr::bind_rows(lapply(lst, function(x) x$data)) - - - -``` - - - - - - - - - - - - - - - - - - - - - - - - - - - -```{r, message=FALSE, warning=FALSE,results='hide'} - -# ---- RAM usage ---- -# remotes::install_github("tpq/peakRAM") -library(peakRAM) -res1 <- peakRAM({ - DE_count_500 <- foreach(n = iters, - .packages = c("dplyr","limma","tidyr","foreach","edgeR"), - .combine = rbind) %do% step1_func(n) -}) - -res2 <- peakRAM({ - DE_count_500 <- foreach(n = iters, - .packages = c("dplyr","limma","tidyr","foreach","edgeR"), - .combine = rbind) %do% step1_func(n) -}) - -res3 <- peakRAM({ - DE_count_500 <- foreach(n = iters, - .packages = c("dplyr","limma","tidyr","foreach","edgeR"), - .combine = rbind) %do% step1_func(n) -}) -``` - - -```{r} -res1 - -res2 - -res3 -``` - - - - -```{r, message=FALSE, warning=FALSE,results='hide'} -if (!requireNamespace("ps", quietly = TRUE)) install.packages("ps") -get_rss_mb <- function() ps::ps_memory_info(ps::ps_handle())[1] / 1024^2 - -rss_before <- get_rss_mb() -tm1 <- system.time({ - DE_count_500 <- foreach(n = iters, - .packages = c("dplyr","limma","tidyr","edgeR"), - .combine = rbind) %do% step1_func(n) -}) -rss_after <- get_rss_mb() -cat(sprintf("Elapsed: %.1fs | RSS before→after: %.0f→%.0f MiB (Δ %.0f MiB)\n", - tm1["elapsed"], rss_before, rss_after, rss_after - rss_before)) - - -``` - -```{r} -tm1 - -``` - - -```{r, message=FALSE, warning=FALSE,results='hide'} -if (!requireNamespace("ps", quietly = TRUE)) install.packages("ps") -get_rss_mb <- function() ps::ps_memory_info(ps::ps_handle())[1] / 1024^2 - -rss_before <- get_rss_mb() -tm2 <- system.time({ - DE_count_500 <- foreach(n = iters, - .packages = c("dplyr","limma","tidyr","edgeR"), - .combine = rbind) %do% step1_func(n) -}) -rss_after <- get_rss_mb() -cat(sprintf("Elapsed: %.1fs | RSS before→after: %.0f→%.0f MiB (Δ %.0f MiB)\n", - tm2["elapsed"], rss_before, rss_after, rss_after - rss_before)) - - -``` - -```{r} -tm2 - -``` - - -```{r, message=FALSE, warning=FALSE,results='hide'} -if (!requireNamespace("ps", quietly = TRUE)) install.packages("ps") -get_rss_mb <- function() ps::ps_memory_info(ps::ps_handle())[1] / 1024^2 - -rss_before <- get_rss_mb() -tm3 <- system.time({ - DE_count_500 <- foreach(n = iters, - .packages = c("dplyr","limma","tidyr","edgeR"), - .combine = rbind) %do% step1_func(n) -}) -rss_after <- get_rss_mb() -cat(sprintf("Elapsed: %.1fs | RSS before→after: %.0f→%.0f MiB (Δ %.0f MiB)\n", - tm3["elapsed"], rss_before, rss_after, rss_after - rss_before)) - - -``` - -```{r} -tm3 - -``` - diff --git a/vignettes/DRUGseq_first500runs.html b/vignettes/DRUGseq_first500runs.html deleted file mode 100644 index 5ad6b4b..0000000 --- a/vignettes/DRUGseq_first500runs.html +++ /dev/null @@ -1,2176 +0,0 @@ - - - - - - - - - - - - - -DRUG-seq data analysis pipeline - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - -
-
-
-
-
- -
- - - - - - - -
suppressPackageStartupMessages({
-  library(dplyr); library(tidyr); library(limma); library(edgeR)
-})
-
library(stringr)
-library(DT)
-library(tidyr)
-library(dplyr)
-library(foreach)
-library(limma)
-library(edgeR)
-library(umap)
-library(data.table)
-
## 
-## Attaching package: 'data.table'
-
## The following objects are masked from 'package:dplyr':
-## 
-##     between, first, last
-
library(ggplot2)
-library(GGally)
-
## Registered S3 method overwritten by 'GGally':
-##   method from   
-##   +.gg   ggplot2
-
library(plotly)
-
## 
-## Attaching package: 'plotly'
-
## The following object is masked from 'package:ggplot2':
-## 
-##     last_plot
-
## The following object is masked from 'package:stats':
-## 
-##     filter
-
## The following object is masked from 'package:graphics':
-## 
-##     layout
-
library(ggrepel)
-library(viridis)
-
## Loading required package: viridisLite
-
library(RANN)
-library(igraph)
-
## 
-## Attaching package: 'igraph'
-
## The following object is masked from 'package:plotly':
-## 
-##     groups
-
## The following object is masked from 'package:tidyr':
-## 
-##     crossing
-
## The following objects are masked from 'package:dplyr':
-## 
-##     as_data_frame, groups, union
-
## The following objects are masked from 'package:stats':
-## 
-##     decompose, spectrum
-
## The following object is masked from 'package:base':
-## 
-##     union
-
library(RColorBrewer)
-library(patchwork)
-
-

DRUGseq data analysis pipeline - DMSO selection

-
-

data path setup

-

project folder:
-/Users/liuxin/macpie_dev/DRUGseqData/DRUG-seq-main

-
-
-

1.2 functions

-
get_union_toptable <- function(fit3) {
-  contrast_names  <-  colnames(coef(fit3))
-  n_contrasts  <-  length(contrast_names)
-  
-  tt <-  foreach(i=1:n_contrasts, .combine=rbind) %do%
-  { topTable(fit3, coef = i, 
-             p.value = 1, number = Inf) %>% 
-      mutate(GeneId=rownames(.), contrast=colnames(coef(fit3))[i]) }
-  
-  tt$contrast  <-  factor(tt$contrast, levels=contrast_names)
-  return(tt)
-}
-
DE.limma_trend<-function(Exp=Exp,CTRL,meta,best_DMSO=NULL,random_run=FALSE){
-  if(!is.null(best_DMSO)){ 
-    print('Starting DE calculations using best DMSO.')
-    }else{print('Starting DE calculations.')
-    }
-  
-  de<-lapply(1:length(Exp), function(i){
-    b<-names(Exp)[i]
-    meta_sub<- meta %>% filter(batch==b) %>% mutate(plate_barcode_short = substr(plate_barcode, nchar(plate_barcode)-2, nchar(plate_barcode)))
-    if(!is.null(best_DMSO)) {
-      meta_sub <- meta_sub %>% filter(treatment!=CTRL|barcode_well %in% best_DMSO$barcode_well) #only use best DMSO as control
-    } 
-
-    rownames(meta_sub)<-meta_sub$plate_id
-    UMI_batch<-lapply(1:length(Exp[[i]]), function(j){
-      #print(paste0('i ',i,' / j ',j))
-      x<-Exp[[i]][[j]]
-      mat<-x$UMI.counts
-      mat<-mat[!rownames(mat) %in% grep("ERCC-",rownames(mat),value = T),]
-      mat<-mat[order(row.names(mat)),]
-      return(mat)
-      })
-    # active checking that the objects in the list have exactly the same names
-    unames <- unique(unlist(lapply(UMI_batch, rownames)))
-    stopifnot( all( sapply(UMI_batch, function(x) identical(rownames(x), unames)) ) )
-    UMI_batch<-do.call(cbind,UMI_batch)
-
-    UMI_batch<-UMI_batch[,meta_sub$plate_id]
-    clean<-UMI_batch[apply(UMI_batch,1,max)>quantile(apply(UMI_batch,1,max),0.75),] 
-    clean<-clean[,colSums(clean)>0] 
-
-    d<-DGEList(counts=clean)
-    d <- calcNormFactors(d, method="TMM")
-    clean.TMM<-log2(edgeR::cpm(d,  normalized.lib.sizes=T,log=F)+1) 
-
-    meta_sub<-meta_sub[colnames(clean.TMM),]
-    meta_sub$Sample<-sapply(meta_sub$Sample, function(k) gsub("-",".",k))
-
-    design<-model.matrix(~0+factor(meta_sub$Sample)+factor(meta_sub$replicate))
-    colnames(design)<-c(levels(factor(meta_sub$Sample)),levels(factor(meta_sub$replicate))[-1])
-    rownames(design)<-colnames(clean)
-    samples<-unique(meta_sub$Sample)
-    ctrl<-grep(CTRL,samples,value = T)
-    #print(ctrl)
-    samples<-samples[!samples %in% ctrl]
-
-    contrasts<-sapply(samples, function(ctrs)paste0(ctrs,"-",ctrl))
-    contrast.matrix<-limma::makeContrasts(contrasts=contrasts, levels = design)
-
-    fit = limma::lmFit(clean.TMM, design)
-    fit2 = limma::contrasts.fit(fit,contrast.matrix)
-    fit3 = limma::eBayes(fit2,  trend = T)
-
-    DE.table <- get_union_toptable(fit3) %>% 
-    separate(contrast,c('contrast','ctrl_arm'),sep="-") %>% 
-    select(-ctrl_arm) %>% 
-    merge(meta_sub %>% 
-            filter(!treatment ==CTRL) %>% 
-            select(plate_barcode_short,treatment,treatment_dose,timepoint,Sample,plate_well,batch) %>% 
-            distinct() %>% 
-            group_by(treatment,treatment_dose,timepoint,Sample,batch) %>% 
-            summarise(plate_well = paste(paste(plate_barcode_short,plate_well,sep = " "), collapse=",")),
-          sort=FALSE, by.x='contrast',by.y='Sample') %>% mutate(Sample=contrast) # combine metadata with toptable
-    
-  })
-  if (random_run==TRUE){
-    de<-do.call(rbind,de) %>% 
-      filter(Sample=="RC_to_SA") %>% 
-      group_by(batch,plate_well) %>%
-      filter(abs(logFC)>1,adj.P.Val<0.1) %>%
-      summarise(count=n()) %>%
-      mutate(run_no=n,contrast='RC_to_SA') %>% 
-      ungroup() 
-    }else{
-    de<-do.call(rbind,de) 
-    }
-  return(de)
-}
-
vpPlot_DElabel <- function(this_contrast_toptabel,batch_select=1,this_arms){
-  
-  pcut<-0.1
-  #color= black,red, blue
-  #        0     1      2   
-  color_custum<-c(rgb(0,0,0,0.5),rgb(1,0,0,1),rgb(0,0,1,1))
-  
-  this_contrast_toptabel <- this_contrast_toptabel %>% 
-    filter(treatment %in% this_arms,batch==batch_select) %>% 
-    mutate(label=0)
-  
-  
-
-  high_concentration<-this_contrast_toptabel %>% select(treatment_dose) %>% max() 
-  
-  this_contrast_toptabel_plot<-NULL
-
-  for (SampleName_lib_temp in this_arms) { 
-    upregulate_gene <- this_contrast_toptabel %>% 
-      filter(treatment == SampleName_lib_temp) %>% 
-      filter(treatment_dose==high_concentration, adj.P.Val<pcut, logFC>1) %>% 
-      select(GeneId) %>% 
-      .$GeneId
-    downregulate_gene <-this_contrast_toptabel %>% 
-    filter(treatment == SampleName_lib_temp) %>%
-    filter(treatment_dose==high_concentration, adj.P.Val<pcut, logFC< -1) %>% 
-    select(GeneId) %>% 
-    .$GeneId
-
-    this_contrast_toptabel_plot <- rbind(this_contrast_toptabel_plot,this_contrast_toptabel 
-                                      %>%  filter(treatment==SampleName_lib_temp) %>% 
-                                        mutate(label=ifelse(GeneId %in% upregulate_gene,1,label)) %>% 
-                                        mutate(label=ifelse(GeneId %in% downregulate_gene,2,label)))
-  }
-  
-  this_contrast_toptabel_plot$label<-as.factor(this_contrast_toptabel_plot$label)
-  levels(this_contrast_toptabel_plot$label)<-c("not significant ","upregulated genes","downregulated genes")
-  this_contrast_toptabel_plot$treatment<-factor(this_contrast_toptabel_plot$treatment)
-
- 
-  
-  vp<-this_contrast_toptabel_plot %>% arrange(label) %>% 
-    ggplot(aes(x=logFC, y=-log10(adj.P.Val), color=label))+
-    scale_color_manual(values = color_custum)+
-    geom_point(fill="white",shape=21, size=1, stroke=0.5) +
-    geom_abline(slope = 0, intercept = -log10(pcut), size=0.3, linetype='dashed') +
-    geom_vline(xintercept = -1, size=0.3, linetype='dashed') +
-    geom_vline(xintercept = 1, size=0.3, linetype='dashed') +
-    xlim(-8,8)+
-    theme_bw() +
-    theme(axis.text.x = element_text(face="bold",size=14),
-          axis.text.y = element_text(face="bold", size=10),
-          axis.title=element_text(size=14,face="bold"),
-        legend.text=element_text(size=12) ) +
-    facet_grid(treatment~treatment_dose)+
-    theme(strip.text = element_text(face="bold", size=12))
-
-  return(vp)
-  }
-
## works up to 3 groups comparison now
-VD_plot <- function(this_contrast_toptable,batch_select=1){
-  this_contrast_toptable_plot <- this_contrast_toptable %>% 
-  mutate(sig=ifelse(abs(logFC)>1 & adj.P.Val<0.1,-1,0)) %>% 
-    mutate(sig=ifelse(logFC>1 & adj.P.Val<0.1,1,sig)) %>%
-    mutate(contrast=paste0(treatment_dose,'uM')) %>% 
-    select(contrast,GeneId,sig) %>% 
-    reshape2::dcast(GeneId~contrast,value.var="sig")
-  
-  
-  rownames(this_contrast_toptable_plot) <- this_contrast_toptable_plot$GeneId
-  return(this_contrast_toptable_plot %>% select(-GeneId) %>% vennDiagram(circle.col=c("turquoise", "salmon","palegreen"),cex=1))
-
-}
-
pca_umap <- function(counts=NULL,center_pca=T,metric="euclidean", scale_pca=T,n_neighbors=15,n_pca=20){
-  
-  pca<-prcomp(t(counts), scale = scale_pca, center = center_pca)
-  rownames(pca$x)  <-  colnames(counts)
-  umap  <-  umap(pca$x[,1:n_pca],method = "umap-learn",metric=metric, n_neighbors=n_neighbors)
-  rownames(umap$layout)  <-  colnames(counts)
-  return(umap)
-
-  
-}
-
ggpairs_custom<-function(toptable_meta,this_arms,p_cut,corr_size=3,batch_s=1,strip_size=NULL,label_size=NULL,exclusive_gene=NULL){
-  
-  toptable_meta=toptable_meta %>% mutate(contrast=paste(treatment,treatment_dose,timepoint,sep="_"))
-  this_contrast=toptable_meta %>% 
-    filter(Sample %in% this_arms,batch==batch_s) %>% 
-    arrange(treatment_dose) %>% 
-    select(contrast) %>% 
-    distinct() %>%.$contrast
-  p_cut <- p_cut
-
-  
-if (is.null(exclusive_gene)) {
-valid_genes <- toptable_meta %>% 
-  filter(Sample %in% this_arms) %>%
-  filter(batch==batch_s) %>%
-  filter(adj.P.Val<p_cut) %>%
-  .$GeneId %>%
-  unique()
-} else {
-  
-  valid_genes <- toptable_meta %>% 
-    filter(Sample %in% this_arms) %>%
-    filter(batch==batch_s) %>%
-    filter(adj.P.Val<p_cut) %>%
-  .$GeneId %>%
-  unique() %>% 
-    setdiff(exclusive_gene)
-  
-}
-
-table_to_plot1<-toptable_meta %>% 
-  filter(Sample %in% this_arms) %>%
-  filter(batch==batch_s) %>%
-  filter(GeneId %in% valid_genes) %>%
-  dplyr::select(GeneId, contrast, logFC) %>%
-  setDT %>%
-  data.table::dcast(GeneId ~ contrast, value.var = 'logFC') %>%
-  as.data.frame()
-
-table_to_plot1<-table_to_plot1[,c("GeneId",this_contrast)]
-rownames(table_to_plot1)<-table_to_plot1$GeneId
-
-table_to_plot2<-toptable_meta %>% 
-  filter(Sample %in% this_arms) %>%
-  filter(batch==batch_s) %>%
-  filter(adj.P.Val<p_cut) %>%
-  dplyr::select(GeneId, contrast, logFC) %>%
-  setDT %>%
-  data.table::dcast(GeneId ~ contrast, value.var = 'logFC') %>%
-  as.data.frame()
-
-missing_col<-setdiff(colnames(table_to_plot1),colnames(table_to_plot2))
-if (!identical(missing_col, character(0))) {
-  missing_mat<-data.frame(matrix(0, nrow = nrow(table_to_plot2), ncol =as.numeric( length(missing_col))))
-  missing_mat[missing_mat==0]=NA
-  colnames(missing_mat)<-missing_col
-  table_to_plot2<-cbind(table_to_plot2,missing_mat)
-}
-
-table_to_plot2<-table_to_plot2[,c("GeneId",this_contrast)]
-
-data_plot<-inner_join(table_to_plot1,table_to_plot2,by="GeneId")
-  
-  
-
-  rownames(data_plot)<-data_plot$GeneId
-  data_plot<-data_plot %>% select(-"GeneId")
-  
-  axis_limt<-max(abs(data_plot),na.rm = TRUE)
-  
-  diagfun <- function(data,mapping){
-    ggplot(data = data, mapping = mapping)+
-      geom_density(color="darkblue", fill="lightblue") +
-       scale_x_continuous(limits = c(-axis_limt, axis_limt)) 
-      # scale_y_continuous(limits = c(-axis_limt, axis_limt))
-    }
-  
-  corr_heatmap <- function(data, mapping, method="p", use="pairwise", ...){
-
-              # grab data
-              x <- eval_data_col(data, mapping$x)
-              y <- eval_data_col(data, mapping$y)
-
-              # calculate correlation
-              corr <- cor(x, y, method=method, use=use)
-              colFn <- colorRampPalette(c("blue", "white", "red"), interpolate ='spline')
-              fill <- colFn(100)[findInterval(corr, seq(-1, 1, length=100))]
-              ggally_cor(data = data, mapping = mapping, ...) + 
-                theme_classic()+
-                theme(panel.background = element_rect(fill=fill,colour = fill))
-  }
-  
-  
-  pm<-ggpairs(data_plot,c(1:as.numeric(length(colnames(data_plot))/2)),columnLabels=colnames(table_to_plot1)[-1],diag=list(continuous = wrap(diagfun)),
-              upper = list(continuous = corr_heatmap),progress = F)+theme(axis.text = element_text(size = label_size),strip.placement = "outside", 
-                                                                          text = element_text(size = strip_size) )
-  
-  method<-"pearson"
-  pm2 <- pm
-  color_custum<-c(rgb(0,0,0,0.5),rgb(0,1,0,0.7),rgb(0,0,1,0.7),rgb(1,0,0,1))
-  label_list<-c('not significant','x axis DE genes','y axis DE genes','shared DE genes')
-  for(i in 2:pm$nrow) {
-       for(j in 1:(i-1)) {
-         
-         x <- GGally::eval_data_col(pm[i,j]$data, pm[i,j]$mapping$x)
-         y <- GGally::eval_data_col(pm[i,j]$data, pm[i,j]$mapping$y)
-         corr<- cor(x,y,method=method,use="na.or.complete")
-         col_name_x_x<-paste0(gsub("\\.[^\\.]*$","",pm[i,j]$labels$x),".x")
-         col_name_y_x<-paste0(gsub("\\.[^\\.]*$","",pm[i,j]$labels$y),".x")
-         col_name_x_y<-paste0(gsub("\\.[^\\.]*$","",pm[i,j]$labels$x),".y")
-         col_name_y_y<-paste0(gsub("\\.[^\\.]*$","",pm[i,j]$labels$y),".y")
-         col_name<-c(col_name_x_x,col_name_y_x,col_name_x_y,col_name_y_y)
-         data_sub<-pm$data[,col_name]
-         
-         data_sub<-data_sub %>% 
-           mutate(label = ifelse(!is.na(!!sym(col_name_x_y)) & 
-                                   abs(!!sym(col_name_x_y))>1,1,0)) %>%
-          mutate(label = ifelse(!is.na(!!sym(col_name_y_y)) &
-                                  abs(!!sym(col_name_y_y))>1,2,label)) %>%
-          mutate(label = ifelse(!is.na(!!sym(col_name_x_y)) &
-                        abs(!!sym(col_name_x_y))>1 &
-                          !is.na(!!sym(col_name_y_y))&
-                           abs(!!sym(col_name_y_y))>1 ,3,label)) %>% 
-           arrange(label)
-         
-          unique_label<-sort(unique(data_sub$label))
-          data_sub$label<-as.factor(data_sub$label)
-          levels(data_sub$label)<-label_list[unique_label+1]
-          
-          p<-ggplot(data_sub,aes(x=!!sym(pm[i,j]$labels$x),y=!!sym(pm[i,j]$labels$y),color=label))+
-               geom_point(size=2,shape=21,stroke=0.5,fill="white")+theme_bw()+scale_color_manual(values = color_custum[unique_label+1])+ scale_shape(solid = TRUE) +
-            scale_x_continuous(limits = c(-axis_limt, axis_limt)) +
-            scale_y_continuous(limits = c(-axis_limt, axis_limt)) +
-            geom_abline(slope = 0, intercept = -1, size=0.3, linetype='dashed') +
-            geom_abline(slope = 0, intercept =  1, size=0.3, linetype='dashed') +
-            geom_vline(xintercept = -1, size=0.3, linetype='dashed') +
-            geom_vline(xintercept = 1, size=0.3, linetype='dashed')
-          pm2[i,j]=p+geom_label(data = data.frame(xlabel = -axis_limt,
-                                          ylabel = max(axis_limt, na.rm = TRUE),
-                                          lab = round(corr, digits = 3)),
-                                mapping = ggplot2::aes(x = xlabel, y = ylabel, label = lab),
-                                hjust = 0, vjust = 1,size = corr_size, fontface = "bold",
-                          inherit.aes = FALSE # do not inherit anything from the ...
-                          ) 
-          
-          
-       }
-   }
-  return(pm2)
-}
-
-

UMI count table preparation

-
meta <- read.csv(file.path(input_data_dir,'meta.csv'),stringsAsFactors = F) %>% 
-  filter(plate_barcode %in% c("VH02001612", "VH02001614", "VH02001618"))
-Exp<-lapply(unique(meta$batch), function(x){
-  b<-unique(meta[meta$batch==x,]$plate_barcode)
-  res<-lapply(b, function(y){
-    load(file.path(input_data_dir, paste0("flowcell_4000_UMI_decode_",y,".RData")),verbose=T)
-    annot<-meta[meta$plate_barcode==y,]
-    rownames(annot) <- apply( annot[ , c('plate_barcode','well_index') ] , 1 , paste , collapse = "_" )
-    UMI_decode<-UMI_decode[!rownames(UMI_decode) %in% grep("ERCC-",rownames(UMI_decode),value = T),]
-    UMI_annot<-list(UMI_decode,annot)
-    names(UMI_annot)<-c("UMI.counts", "Annotation")
-    return(UMI_annot)
-  })
-  names(res)<-b
-  return(res)
-})
-
## Loading objects:
-##   UMI_decode
-## Loading objects:
-##   UMI_decode
-## Loading objects:
-##   UMI_decode
-
names(Exp)<-unique(meta$batch)
-
-save(Exp, file=file.path(out_data_dir,"Exp_Init.RData"),compress = TRUE)
-
check_threads <- function() {
-  cat("Hardware cores (logical/physical):",
-      parallel::detectCores(TRUE), "/", parallel::detectCores(FALSE), "\n")
-  cat("foreach backend/workers: ",
-      foreach::getDoParName(), "/", foreach::getDoParWorkers(), "\n", sep = "")
-  if (requireNamespace("future", quietly = TRUE)) {
-    cat("future plan/workers: ", capture.output(future::plan()), "\n", sep = "")
-    cat("future nbrOfWorkers: ", future::nbrOfWorkers(), "\n", sep = "")
-  }
-  if (requireNamespace("data.table", quietly = TRUE)) {
-    cat("data.table threads: ", data.table::getDTthreads(), "\n", sep = "")
-  }
-  if (requireNamespace("RhpcBLASctl", quietly = TRUE)) {
-    cat("BLAS threads: ", RhpcBLASctl::blas_get_num_procs(), "\n", sep = "")
-    cat("OMP max threads: ", RhpcBLASctl::omp_get_max_threads(), "\n", sep = "")
-  } else {
-    envs <- Sys.getenv(c("OPENBLAS_NUM_THREADS","MKL_NUM_THREADS","OMP_NUM_THREADS","VECLIB_MAXIMUM_THREADS"))
-    cat("BLAS/OMP env vars:\n"); print(envs[envs != ""])
-  }
-}
-check_threads()
-
## Hardware cores (logical/physical): 12 / 12 
-## foreach backend/workers: /1
-
## Registered S3 method overwritten by 'future':
-##   method               from      
-##   all.equal.connection parallelly
-
## future plan/workers: sequential:- args: function (..., gc = FALSE, earlySignal = FALSE, envir = parent.frame())- tweaked: FALSE- call: plan(sequential)FutureBackend to be launched
-## future nbrOfWorkers: 1
-## data.table threads: 6
-## BLAS threads: 1
-## OMP max threads: NA
-
-
-
-
-

2 Step1

-
-

2.1 run first 500 random run

-
step1_func <- function(n){
-  
-  CTRL="DMSO"
-  rand_runs=TRUE
-  
-  meta <- read.csv(file.path(input_data_dir,'meta.csv'),stringsAsFactors = F) %>%  
-    mutate(replicate=paste0('rep',plate_replicate),plate_id=paste0(plate_barcode,"_",well_index)) %>%
-    filter(plate_barcode %in% c("VH02001612", "VH02001614", "VH02001618"))
-  load(file.path(out_data_dir, 'Exp_Init.RData'),verbose = T)
-  
-  DMSO_sample <- meta %>% filter(treatment==CTRL) %>% select(batch,plate_barcode,plate_well) %>%
-    group_by(plate_barcode) %>% 
-    do(sample_n(.,1)) %>% 
-    mutate(Sample="RC_to_SA",treatment="RC_to_SA")  
-
-
-  meta <- meta %>% left_join(DMSO_sample,by=c('batch','plate_barcode','plate_well'),suffix=c("","_randR")) %>%
-    mutate(Sample=ifelse(is.na(Sample_randR),Sample,Sample_randR)) %>% 
-    mutate(treatment=ifelse(is.na(treatment_randR),treatment,treatment_randR))
-  
-  DE<-DE.limma_trend(Exp=Exp,CTRL=CTRL,meta=meta,random_run =rand_runs)
-}
-# DE_count_500 <- foreach(n=1:10,.packages=c("dplyr","limma","tidyr","foreach","edgeR"),.combine=rbind) %do% step1_func(n)
-
options(dplyr.summarise.inform = FALSE)
-# ---- helper to pretty print seconds ----
-fmt_secs <- function(x) {
-  if (is.na(x)) return("NA")
-  if (x < 60) return(sprintf("%.2fs", x))
-  if (x < 3600) return(sprintf("%dm %.1fs", floor(x/60), x %% 60))
-  sprintf("%dh %dm %.0fs", floor(x/3600), floor((x%%3600)/60), x%%60)
-}
-
-iters <- 1:50
-
-# ---- total wall time + per-iteration timings ----
-# total_tm <- system.time({
-#   # return a list of {data, iter, seconds} per iteration
-#   lst <- foreach(
-#     n = iters,
-#     .packages = c("dplyr","limma","tidyr","foreach","edgeR"),
-#     .combine  = "c"     # concatenate list-of-lists
-#   ) %do% {
-#     t0  <- proc.time()[["elapsed"]]
-#     res <- step1_func(n)
-#     dt  <- proc.time()[["elapsed"]] - t0
-#     list(list(data = res, iter = n, seconds = dt))
-#   }
-# })
-
-# extract results and timings
-# iter_times <- vapply(lst, function(x) x$seconds, numeric(1))
-# DE_count_500 <- dplyr::bind_rows(lapply(lst, function(x) x$data))
- - - - - - - - - - - - - - - - - - - - - - - - -
# ---- RAM usage ----
-# remotes::install_github("tpq/peakRAM")
-library(peakRAM)
-res1 <- peakRAM({
-  DE_count_500 <- foreach(n = iters,
-                          .packages = c("dplyr","limma","tidyr","foreach","edgeR"),
-                          .combine  = rbind) %do% step1_func(n)
-})
-
-res2 <- peakRAM({
-  DE_count_500 <- foreach(n = iters,
-                          .packages = c("dplyr","limma","tidyr","foreach","edgeR"),
-                          .combine  = rbind) %do% step1_func(n)
-})
-
-res3 <- peakRAM({
-  DE_count_500 <- foreach(n = iters,
-                          .packages = c("dplyr","limma","tidyr","foreach","edgeR"),
-                          .combine  = rbind) %do% step1_func(n)
-})
-
res1
-
##                                                                                                             Function_Call
-## 1 {DE_count_500<-foreach(n=iters,.packages=c("dplyr","limma","tidyr","foreach","edgeR"),.combine=rbind)%do%step1_func(n)}
-##   Elapsed_Time_sec Total_RAM_Used_MiB Peak_RAM_Used_MiB
-## 1         1759.191               33.3            3220.6
-
res2
-
##                                                                                                             Function_Call
-## 1 {DE_count_500<-foreach(n=iters,.packages=c("dplyr","limma","tidyr","foreach","edgeR"),.combine=rbind)%do%step1_func(n)}
-##   Elapsed_Time_sec Total_RAM_Used_MiB Peak_RAM_Used_MiB
-## 1         1709.094                  0            2937.3
-
res3
-
##                                                                                                             Function_Call
-## 1 {DE_count_500<-foreach(n=iters,.packages=c("dplyr","limma","tidyr","foreach","edgeR"),.combine=rbind)%do%step1_func(n)}
-##   Elapsed_Time_sec Total_RAM_Used_MiB Peak_RAM_Used_MiB
-## 1         1695.753                  0            2937.4
-
if (!requireNamespace("ps", quietly = TRUE)) install.packages("ps")
-get_rss_mb <- function() ps::ps_memory_info(ps::ps_handle())[1] / 1024^2
-
-rss_before <- get_rss_mb()
-tm1 <- system.time({
-  DE_count_500 <- foreach(n = iters,
-                          .packages = c("dplyr","limma","tidyr","edgeR"),
-                          .combine  = rbind) %do% step1_func(n)
-})
-rss_after  <- get_rss_mb()
-cat(sprintf("Elapsed: %.1fs | RSS before→after: %.0f→%.0f MiB (Δ %.0f MiB)\n",
-            tm1["elapsed"], rss_before, rss_after, rss_after - rss_before))
-
tm1
-
##     user   system  elapsed 
-## 1539.299  130.819 1686.507
-
if (!requireNamespace("ps", quietly = TRUE)) install.packages("ps")
-get_rss_mb <- function() ps::ps_memory_info(ps::ps_handle())[1] / 1024^2
-
-rss_before <- get_rss_mb()
-tm2 <- system.time({
-  DE_count_500 <- foreach(n = iters,
-                          .packages = c("dplyr","limma","tidyr","edgeR"),
-                          .combine  = rbind) %do% step1_func(n)
-})
-rss_after  <- get_rss_mb()
-cat(sprintf("Elapsed: %.1fs | RSS before→after: %.0f→%.0f MiB (Δ %.0f MiB)\n",
-            tm2["elapsed"], rss_before, rss_after, rss_after - rss_before))
-
tm2
-
##     user   system  elapsed 
-## 1526.717  144.779 1697.510
-
if (!requireNamespace("ps", quietly = TRUE)) install.packages("ps")
-get_rss_mb <- function() ps::ps_memory_info(ps::ps_handle())[1] / 1024^2
-
-rss_before <- get_rss_mb()
-tm3 <- system.time({
-  DE_count_500 <- foreach(n = iters,
-                          .packages = c("dplyr","limma","tidyr","edgeR"),
-                          .combine  = rbind) %do% step1_func(n)
-})
-rss_after  <- get_rss_mb()
-cat(sprintf("Elapsed: %.1fs | RSS before→after: %.0f→%.0f MiB (Δ %.0f MiB)\n",
-            tm3["elapsed"], rss_before, rss_after, rss_after - rss_before))
-
tm3
-
##     user   system  elapsed 
-## 1533.484  131.026 1682.296
-
-
- - - -
-
- -
- - - - - - - - - - - - - - - - diff --git a/vignettes/DRUGseq_first500runs.knit.log b/vignettes/DRUGseq_first500runs.knit.log deleted file mode 100644 index c224f8b..0000000 --- a/vignettes/DRUGseq_first500runs.knit.log +++ /dev/null @@ -1,2 +0,0 @@ -Error: pandoc version 1.12.3 or higher is required and was not found (see the help page ?rmarkdown::pandoc_available). -Execution halted diff --git a/vignettes/benchmark_DMSOruntime.Rmd b/vignettes/benchmark_DMSOruntime.Rmd deleted file mode 100644 index 148e58b..0000000 --- a/vignettes/benchmark_DMSOruntime.Rmd +++ /dev/null @@ -1,476 +0,0 @@ ---- -title: "Benchmark: runtime and memory of DMSO selection" -output: - rmarkdown::html_document: - theme: flatly - toc: true - toc_float: true - toc_depth: 5 -vignette: > - %\VignetteIndexEntry{benchmarking DMSO runtime and memory} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} - %\VignetteBuild{true} ---- - -```{r set_wd, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -dir <- "/Users/liuxin/macpie_Dev/" -devtools::load_all(paste0(dir, "macpie/")) - -``` - - -```{r setup} - -suppressMessages(library(macpie)) -suppressMessages(library(tibble)) -suppressMessages(library(stringr)) -suppressMessages(library(pheatmap)) -suppressMessages(library(ggiraph)) -suppressMessages(library(tidyseurat)) -suppressMessages(library(purrr)) -suppressMessages(library(ggplot2)) -suppressMessages(library(peakRAM)) -suppressMessages(library(ps)) - -``` - - -```{r} -options(scipen=999, digits=3) -``` - - - -```{r} -input_data_dir <- file.path(paste0(dir,"/DRUGseqData/DRUG-seq-main/data")) -meta <- read.csv(file.path(input_data_dir,'meta.csv'),stringsAsFactors = F) %>% - filter(plate_barcode %in% c("VH02001612", "VH02001614", "VH02001618")) -Exp<-lapply(unique(meta$batch), function(x){ - b<-unique(meta[meta$batch==x,]$plate_barcode) - res<-lapply(b, function(y){ - load(file.path(input_data_dir, paste0("flowcell_4000_UMI_decode_",y,".RData")),verbose=T) - annot<-meta[meta$plate_barcode==y,] - rownames(annot) <- apply( annot[ , c('plate_barcode','well_index') ] , 1 , paste , collapse = "_" ) - UMI_decode<-UMI_decode[!rownames(UMI_decode) %in% grep("ERCC-",rownames(UMI_decode),value = T),] - UMI_annot<-list(UMI_decode,annot) - names(UMI_annot)<-c("UMI.counts", "Annotation") - return(UMI_annot) - }) - names(res)<-b - return(res) -}) -names(Exp)<-unique(meta$batch) -batch3 <- Exp$`3` -``` - -```{r} -metadata <- batch3 %>% - map_dfr(~ { - .x$Annotation %>% - mutate( - Plate_ID = plate_barcode, - Well_ID = plate_well, - Barcode = paste0(plate_barcode, "_", well_index), - Row = substr(plate_well, 1, 1), - Column = as.integer(substr(plate_well, 2, nchar(plate_well))), - Species = "human", - Cell_type = "U2OS", - Model_type = "2D_adherent", - Time = "24", - Unit = "h", - Treatment_1 = treatment, - Concentration_1 = treatment_dose, - Unit_1 = treatment_dose_unit, - Sample_type = ifelse(treatment == "DMSO", "Control", "Treated") - - ) - }) - - -metadata <- metadata%>%select(-c(plate_barcode, index, plate_replicate, plate_well, - step, readout.comment, well_index, cell.line.name, - batch, treatment, treatment_dose, treatment_dose_unit, timepoint)) - - -batch3_counts <- batch3 %>% - map(~ { - .x$UMI.counts %>% - as.data.frame() %>% - rownames_to_column("gene_id") %>% - separate(col = gene_id, into = c("gene_name", "chrom"), sep = ",") %>% - mutate(gene_name = make.unique(gene_name)) %>% - select(-chrom) %>% - tibble::column_to_rownames(var = "gene_name") %>% - as.matrix() - }) - -binded_counts <- do.call(cbind, batch3_counts) - -as_mac <- CreateSeuratObject(counts = binded_counts, - min.cells = 1, - min.features = 1) - -as_mac<- as_mac%>% inner_join(metadata, by = c(".cell"="Barcode")) -as_mac$combined_id <- paste0(as_mac$Treatment_1, "_", as_mac$Concentration_1) -batch3_mac_filtered <- filter_genes_by_expression(as_mac, - group_by = "combined_id", - min_counts = 5, - min_samples = 1) - -``` - -```{r} -select_top_replicates_tmmcpm <- function( - data, - combined_id, # e.g. "CB_43_EP73_0" - orig_ident, # e.g. "VH02012942" - cpm_filter = 1, # CPM threshold for gene filtering - min_samps = 16, # number of samples a gene must be expressed in - corr_method = c("spearman","pearson"), - top_n = 5, - make_plots = TRUE -){ - - corr_method <- match.arg(corr_method) - - # Subset to the plate/condition of interest - subgroup <- subset(data, subset = combined_id == !!combined_id & orig.ident == !!orig_ident) - - # Counts and human-friendly column names - counts_d <- GetAssayData(subgroup, assay = "RNA", layer = "counts") - well_colnames <- paste0(subgroup$orig.ident, "_", subgroup$Well_ID) - names(well_colnames) <- rownames(subgroup@meta.data) - colnames(counts_d) <- well_colnames[colnames(counts_d)] - - # edgeR container + gene filtering - y <- edgeR::DGEList(counts_d, group = subgroup$orig.ident) - keep <- rowSums(edgeR::cpm(y) > cpm_filter) >= min_samps - y <- y[keep, , keep.lib.sizes = FALSE] - - # TMMwsp → log2-CPM - y <- edgeR::calcNormFactors(y, method = "TMMwsp") - log_cpm_tmm <- edgeR::cpm(y, log = TRUE, normalized.lib.sizes = TRUE) - - # Long data for boxplot - df_long <- as.data.frame(log_cpm_tmm) |> - tibble::rownames_to_column(var = "gene") |> - tidyr::pivot_longer(-gene, names_to = "sample", values_to = "log_cpm") - - if (make_plots) { - # Boxplot - print( - ggplot2::ggplot(df_long, ggplot2::aes(x = sample, y = log_cpm)) + - ggplot2::geom_boxplot(outlier.size = 0.5) + - ggplot2::labs(x = "Sample", y = "log2 CPM", - title = "Boxplot of log2-CPM (TMMwsp)") + - ggplot2::theme_classic() + - ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)) - ) - } - - # Correlation matrices - cors_pear <- stats::cor(log_cpm_tmm, method = "pearson") - cors_spear <- stats::cor(log_cpm_tmm, method = "spearman") - - if (make_plots) { - pheatmap::pheatmap(cors_pear, main = "Sample–sample correlation (Pearson, log2-CPM)") - pheatmap::pheatmap(cors_spear, main = "Sample–sample correlation (Spearman, log2-CPM)") - } - - # Ranking by mean Fisher-z correlation to all *other* samples - R <- stats::cor(log_cpm_tmm, method = corr_method, use = "pairwise.complete.obs") - diag(R) <- NA_real_ - - # clip, Fisher z, average, back-transform - Z <- atanh(pmin(pmax(R, -0.999999), 0.999999)) - score_z <- rowMeans(Z, na.rm = TRUE) - score_r <- tanh(score_z) - - # top-N names and scores - ord <- order(score_r, decreasing = TRUE, na.last = NA) - srt <- score_r[ord] - cutoff <- srt[top_n] # N-th best score - keep <- srt >= cutoff # keep all tied at the cutoff - topN <- srt[keep] - - # return everything useful - list( - subset_obj = subgroup, - dge = y, - log_cpm_tmm = log_cpm_tmm, - boxplot_df = df_long, - cor_pearson = cors_pear, - cor_spearman = cors_spear, - ranking_method = corr_method, - scores_mean_to_others = sort(score_r, decreasing = TRUE), - topN = topN - ) -} - - -``` - - -```{r} -res1 <- peakRAM({ -dmso_VH02001612 <- select_top_replicates_tmmcpm( - batch3_mac_filtered, - combined_id = "DMSO_0", - orig_ident = "VH02001612", - cpm_filter = 5, - min_samps = 8, - corr_method = "spearman", - top_n = 5, - make_plots = FALSE - ) - dmso_VH02001614 <- select_top_replicates_tmmcpm( - batch3_mac_filtered, - combined_id = "DMSO_0", - orig_ident = "VH02001614", - cpm_filter = 5, - min_samps = 8, - corr_method = "spearman", - top_n = 5, - make_plots = FALSE - ) - - dmso_VH02001618 <- select_top_replicates_tmmcpm( - batch3_mac_filtered, - combined_id = "DMSO_0", - orig_ident = "VH02001618", - cpm_filter = 5, - min_samps = 8, - corr_method = "spearman", - top_n = 5, - make_plots = FALSE - ) - } -) - -res2 <- peakRAM({ -dmso_VH02001612 <- select_top_replicates_tmmcpm( - batch3_mac_filtered, - combined_id = "DMSO_0", - orig_ident = "VH02001612", - cpm_filter = 5, - min_samps = 8, - corr_method = "spearman", - top_n = 5, - make_plots = FALSE - ) - dmso_VH02001614 <- select_top_replicates_tmmcpm( - batch3_mac_filtered, - combined_id = "DMSO_0", - orig_ident = "VH02001614", - cpm_filter = 5, - min_samps = 8, - corr_method = "spearman", - top_n = 5, - make_plots = FALSE - ) - - dmso_VH02001618 <- select_top_replicates_tmmcpm( - batch3_mac_filtered, - combined_id = "DMSO_0", - orig_ident = "VH02001618", - cpm_filter = 5, - min_samps = 8, - corr_method = "spearman", - top_n = 5, - make_plots = FALSE - ) - } -) - -res3 <- peakRAM({ -dmso_VH02001612 <- select_top_replicates_tmmcpm( - batch3_mac_filtered, - combined_id = "DMSO_0", - orig_ident = "VH02001612", - cpm_filter = 5, - min_samps = 8, - corr_method = "spearman", - top_n = 5, - make_plots = FALSE - ) - dmso_VH02001614 <- select_top_replicates_tmmcpm( - batch3_mac_filtered, - combined_id = "DMSO_0", - orig_ident = "VH02001614", - cpm_filter = 5, - min_samps = 8, - corr_method = "spearman", - top_n = 5, - make_plots = FALSE - ) - - dmso_VH02001618 <- select_top_replicates_tmmcpm( - batch3_mac_filtered, - combined_id = "DMSO_0", - orig_ident = "VH02001618", - cpm_filter = 5, - min_samps = 8, - corr_method = "spearman", - top_n = 5, - make_plots = FALSE - ) - } -) - - -``` - - -```{r} -res1 - -res2 - -res3 -``` - -```{r} - -if (!requireNamespace("ps", quietly = TRUE)) install.packages("ps") -get_rss_mb <- function() ps::ps_memory_info(ps::ps_handle())[1] / 1024^2 - -rss_before <- get_rss_mb() -tm1 <- system.time({ - dmso_VH02001612 <- select_top_replicates_tmmcpm( - batch3_mac_filtered, - combined_id = "DMSO_0", - orig_ident = "VH02001612", - cpm_filter = 5, - min_samps = 8, - corr_method = "spearman", - top_n = 5, - make_plots = FALSE - ) - dmso_VH02001614 <- select_top_replicates_tmmcpm( - batch3_mac_filtered, - combined_id = "DMSO_0", - orig_ident = "VH02001614", - cpm_filter = 5, - min_samps = 8, - corr_method = "spearman", - top_n = 5, - make_plots = FALSE - ) - - dmso_VH02001618 <- select_top_replicates_tmmcpm( - batch3_mac_filtered, - combined_id = "DMSO_0", - orig_ident = "VH02001618", - cpm_filter = 5, - min_samps = 8, - corr_method = "spearman", - top_n = 5, - make_plots = FALSE - ) -}) -rss_after <- get_rss_mb() -cat(sprintf("Elapsed: %.1fs | RSS before→after: %.0f→%.0f MiB (Δ %.0f MiB)\n", - tm1["elapsed"], rss_before, rss_after, rss_after - rss_before)) - - -get_rss_mb <- function() ps::ps_memory_info(ps::ps_handle())[1] / 1024^2 - -rss_before <- get_rss_mb() -tm2 <- system.time({ - dmso_VH02001612 <- select_top_replicates_tmmcpm( - batch3_mac_filtered, - combined_id = "DMSO_0", - orig_ident = "VH02001612", - cpm_filter = 5, - min_samps = 8, - corr_method = "spearman", - top_n = 5, - make_plots = FALSE - ) - dmso_VH02001614 <- select_top_replicates_tmmcpm( - batch3_mac_filtered, - combined_id = "DMSO_0", - orig_ident = "VH02001614", - cpm_filter = 5, - min_samps = 8, - corr_method = "spearman", - top_n = 5, - make_plots = FALSE - ) - - dmso_VH02001618 <- select_top_replicates_tmmcpm( - batch3_mac_filtered, - combined_id = "DMSO_0", - orig_ident = "VH02001618", - cpm_filter = 5, - min_samps = 8, - corr_method = "spearman", - top_n = 5, - make_plots = FALSE - ) -}) -rss_after <- get_rss_mb() -cat(sprintf("Elapsed: %.1fs | RSS before→after: %.0f→%.0f MiB (Δ %.0f MiB)\n", - tm2["elapsed"], rss_before, rss_after, rss_after - rss_before)) - - -get_rss_mb <- function() ps::ps_memory_info(ps::ps_handle())[1] / 1024^2 - -rss_before <- get_rss_mb() -tm3 <- system.time({ - dmso_VH02001612 <- select_top_replicates_tmmcpm( - batch3_mac_filtered, - combined_id = "DMSO_0", - orig_ident = "VH02001612", - cpm_filter = 5, - min_samps = 8, - corr_method = "spearman", - top_n = 5, - make_plots = FALSE - ) - dmso_VH02001614 <- select_top_replicates_tmmcpm( - batch3_mac_filtered, - combined_id = "DMSO_0", - orig_ident = "VH02001614", - cpm_filter = 5, - min_samps = 8, - corr_method = "spearman", - top_n = 5, - make_plots = FALSE - ) - - dmso_VH02001618 <- select_top_replicates_tmmcpm( - batch3_mac_filtered, - combined_id = "DMSO_0", - orig_ident = "VH02001618", - cpm_filter = 5, - min_samps = 8, - corr_method = "spearman", - top_n = 5, - make_plots = FALSE - ) -}) -rss_after <- get_rss_mb() -cat(sprintf("Elapsed: %.1fs | RSS before→after: %.0f→%.0f MiB (Δ %.0f MiB)\n", - tm3["elapsed"], rss_before, rss_after, rss_after - rss_before)) - - - - -``` - - -```{r} -tm1 - -tm2 - -tm3 -``` - - -### Summary of runtime and memory usage - -The `select_top_replicates_tmmcpm` function runs on same plates (VH02001612, VH02001614, VH02001618) in approximately 2.463 seconds for three plates, with a peak memory usage of around 2403 Mib. This demonstrates that the function is efficient and can be used effectively for selecting DMSO controls in a reasonable timeframe without excessive memory consumption. \ No newline at end of file diff --git a/vignettes/benchmark_DMSOruntime.html b/vignettes/benchmark_DMSOruntime.html deleted file mode 100644 index cff298d..0000000 --- a/vignettes/benchmark_DMSOruntime.html +++ /dev/null @@ -1,2016 +0,0 @@ - - - - - - - - - - - - - -Benchmark: runtime and memory of DMSO selection - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - -
-
-
-
-
- -
- - - - - - - -
suppressMessages(library(macpie))
-suppressMessages(library(tibble))
-suppressMessages(library(stringr))
-suppressMessages(library(pheatmap))
-suppressMessages(library(ggiraph))
-suppressMessages(library(tidyseurat))
-suppressMessages(library(purrr))
-suppressMessages(library(ggplot2))
-suppressMessages(library(peakRAM))
-suppressMessages(library(ps))
-
options(scipen=999, digits=3)
-
input_data_dir <- file.path(paste0(dir,"/DRUGseqData/DRUG-seq-main/data"))
-meta <- read.csv(file.path(input_data_dir,'meta.csv'),stringsAsFactors = F) %>% 
-  filter(plate_barcode %in% c("VH02001612", "VH02001614", "VH02001618"))
-Exp<-lapply(unique(meta$batch), function(x){
-  b<-unique(meta[meta$batch==x,]$plate_barcode)
-  res<-lapply(b, function(y){
-    load(file.path(input_data_dir, paste0("flowcell_4000_UMI_decode_",y,".RData")),verbose=T)
-    annot<-meta[meta$plate_barcode==y,]
-    rownames(annot) <- apply( annot[ , c('plate_barcode','well_index') ] , 1 , paste , collapse = "_" )
-    UMI_decode<-UMI_decode[!rownames(UMI_decode) %in% grep("ERCC-",rownames(UMI_decode),value = T),]
-    UMI_annot<-list(UMI_decode,annot)
-    names(UMI_annot)<-c("UMI.counts", "Annotation")
-    return(UMI_annot)
-  })
-  names(res)<-b
-  return(res)
-})
-
## Loading objects:
-##   UMI_decode
-## Loading objects:
-##   UMI_decode
-## Loading objects:
-##   UMI_decode
-
names(Exp)<-unique(meta$batch)
-batch3 <- Exp$`3`
-
metadata <- batch3 %>% 
-  map_dfr(~ {
-    .x$Annotation %>%
-      mutate(
-        Plate_ID        = plate_barcode,
-        Well_ID         = plate_well,
-        Barcode         = paste0(plate_barcode, "_", well_index),
-        Row             = substr(plate_well, 1, 1),
-        Column          = as.integer(substr(plate_well, 2, nchar(plate_well))),
-        Species         = "human",
-        Cell_type       = "U2OS",
-        Model_type      = "2D_adherent",
-        Time            = "24",
-        Unit            = "h",
-        Treatment_1     = treatment,
-        Concentration_1 = treatment_dose,
-        Unit_1          = treatment_dose_unit,
-        Sample_type         = ifelse(treatment == "DMSO", "Control", "Treated")
-
-      )
-  })
-
-
-metadata <- metadata%>%select(-c(plate_barcode, index, plate_replicate, plate_well,
-                                 step, readout.comment, well_index, cell.line.name, 
-                                 batch, treatment, treatment_dose, treatment_dose_unit, timepoint))
-
-
-batch3_counts <- batch3 %>%
-  map(~ {
-    .x$UMI.counts %>%
-      as.data.frame() %>% 
-      rownames_to_column("gene_id") %>%
-      separate(col = gene_id, into = c("gene_name", "chrom"), sep = ",") %>%
-      mutate(gene_name = make.unique(gene_name)) %>%
-      select(-chrom) %>%
-      tibble::column_to_rownames(var = "gene_name") %>%
-      as.matrix()
-  })
-
-binded_counts <- do.call(cbind, batch3_counts)
-
-as_mac <- CreateSeuratObject(counts = binded_counts, 
-  min.cells = 1, 
-  min.features = 1)
-
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
-## ('-')
-
## Warning: Data is of class matrix. Coercing to dgCMatrix.
-
as_mac<- as_mac%>% inner_join(metadata, by = c(".cell"="Barcode"))
-as_mac$combined_id <- paste0(as_mac$Treatment_1, "_", as_mac$Concentration_1)
-batch3_mac_filtered <- filter_genes_by_expression(as_mac,
-                                     group_by = "combined_id",
-                                     min_counts = 5,
-                                     min_samples = 1)
-
select_top_replicates_tmmcpm <- function(
-  data,
-  combined_id,                 # e.g. "CB_43_EP73_0"
-  orig_ident,                  # e.g. "VH02012942"
-  cpm_filter    = 1,           # CPM threshold for gene filtering
-  min_samps     = 16,          # number of samples a gene must be expressed in
-  corr_method   = c("spearman","pearson"),
-  top_n         = 5,
-  make_plots    = TRUE
-){
-  
-  corr_method <- match.arg(corr_method)
-
-  # Subset to the plate/condition of interest
-  subgroup <- subset(data, subset = combined_id == !!combined_id & orig.ident == !!orig_ident)
-
-  # Counts and human-friendly column names
-  counts_d <- GetAssayData(subgroup, assay = "RNA", layer = "counts")
-  well_colnames <- paste0(subgroup$orig.ident, "_", subgroup$Well_ID)
-  names(well_colnames) <- rownames(subgroup@meta.data)
-  colnames(counts_d) <- well_colnames[colnames(counts_d)]
-
-  # edgeR container + gene filtering
-  y <- edgeR::DGEList(counts_d, group = subgroup$orig.ident)
-  keep <- rowSums(edgeR::cpm(y) > cpm_filter) >= min_samps
-  y <- y[keep, , keep.lib.sizes = FALSE]
-
-  # TMMwsp → log2-CPM
-  y <- edgeR::calcNormFactors(y, method = "TMMwsp")
-  log_cpm_tmm <- edgeR::cpm(y, log = TRUE, normalized.lib.sizes = TRUE)
-
-  #  Long data for boxplot
-  df_long <- as.data.frame(log_cpm_tmm) |>
-    tibble::rownames_to_column(var = "gene") |>
-    tidyr::pivot_longer(-gene, names_to = "sample", values_to = "log_cpm")
-
-  if (make_plots) {
-    # Boxplot
-    print(
-      ggplot2::ggplot(df_long, ggplot2::aes(x = sample, y = log_cpm)) +
-        ggplot2::geom_boxplot(outlier.size = 0.5) +
-        ggplot2::labs(x = "Sample", y = "log2 CPM",
-                      title = "Boxplot of log2-CPM (TMMwsp)") +
-        ggplot2::theme_classic() +
-        ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))
-    )
-  }
-
-  # Correlation matrices
-  cors_pear  <- stats::cor(log_cpm_tmm, method = "pearson")
-  cors_spear <- stats::cor(log_cpm_tmm, method = "spearman")
-
-  if (make_plots) {
-    pheatmap::pheatmap(cors_pear,  main = "Sample–sample correlation (Pearson, log2-CPM)")
-    pheatmap::pheatmap(cors_spear, main = "Sample–sample correlation (Spearman, log2-CPM)")
-  }
-
-  # Ranking by mean Fisher-z correlation to all *other* samples
-  R <- stats::cor(log_cpm_tmm, method = corr_method, use = "pairwise.complete.obs")
-  diag(R) <- NA_real_
-
-  # clip, Fisher z, average, back-transform
-  Z <- atanh(pmin(pmax(R, -0.999999), 0.999999))
-  score_z <- rowMeans(Z, na.rm = TRUE)
-  score_r <- tanh(score_z)
-
-  # top-N names and scores
-  ord  <- order(score_r, decreasing = TRUE, na.last = NA)
-  srt <- score_r[ord]
-  cutoff <- srt[top_n]              # N-th best score
-  keep   <- srt >= cutoff    # keep all tied at the cutoff
-  topN <- srt[keep]
-
-  # return everything useful
-  list(
-    subset_obj   = subgroup,
-    dge          = y,
-    log_cpm_tmm  = log_cpm_tmm,
-    boxplot_df   = df_long,
-    cor_pearson  = cors_pear,
-    cor_spearman = cors_spear,
-    ranking_method = corr_method,
-    scores_mean_to_others = sort(score_r, decreasing = TRUE),
-    topN = topN
-  )
-}
-
res1 <- peakRAM({
-dmso_VH02001612 <- select_top_replicates_tmmcpm(
-    batch3_mac_filtered,
-    combined_id = "DMSO_0",
-    orig_ident  = "VH02001612",
-    cpm_filter  = 5,
-    min_samps   = 8,
-    corr_method = "spearman",
-    top_n       = 5,
-    make_plots  = FALSE
-  )
-  dmso_VH02001614 <- select_top_replicates_tmmcpm(
-    batch3_mac_filtered,
-    combined_id = "DMSO_0",
-    orig_ident  = "VH02001614",
-    cpm_filter  = 5,
-    min_samps   = 8,
-    corr_method = "spearman",
-    top_n       = 5,
-    make_plots  = FALSE
-  )
-  
-  dmso_VH02001618 <- select_top_replicates_tmmcpm(
-    batch3_mac_filtered,
-    combined_id = "DMSO_0",
-    orig_ident  = "VH02001618",
-    cpm_filter  = 5,
-    min_samps   = 8,
-    corr_method = "spearman",
-    top_n       = 5,
-    make_plots  = FALSE
-  )
-  }
-)
-
-res2 <- peakRAM({
-dmso_VH02001612 <- select_top_replicates_tmmcpm(
-    batch3_mac_filtered,
-    combined_id = "DMSO_0",
-    orig_ident  = "VH02001612",
-    cpm_filter  = 5,
-    min_samps   = 8,
-    corr_method = "spearman",
-    top_n       = 5,
-    make_plots  = FALSE
-  )
-  dmso_VH02001614 <- select_top_replicates_tmmcpm(
-    batch3_mac_filtered,
-    combined_id = "DMSO_0",
-    orig_ident  = "VH02001614",
-    cpm_filter  = 5,
-    min_samps   = 8,
-    corr_method = "spearman",
-    top_n       = 5,
-    make_plots  = FALSE
-  )
-  
-  dmso_VH02001618 <- select_top_replicates_tmmcpm(
-    batch3_mac_filtered,
-    combined_id = "DMSO_0",
-    orig_ident  = "VH02001618",
-    cpm_filter  = 5,
-    min_samps   = 8,
-    corr_method = "spearman",
-    top_n       = 5,
-    make_plots  = FALSE
-  )
-  }
-)
-
-res3 <- peakRAM({
-dmso_VH02001612 <- select_top_replicates_tmmcpm(
-    batch3_mac_filtered,
-    combined_id = "DMSO_0",
-    orig_ident  = "VH02001612",
-    cpm_filter  = 5,
-    min_samps   = 8,
-    corr_method = "spearman",
-    top_n       = 5,
-    make_plots  = FALSE
-  )
-  dmso_VH02001614 <- select_top_replicates_tmmcpm(
-    batch3_mac_filtered,
-    combined_id = "DMSO_0",
-    orig_ident  = "VH02001614",
-    cpm_filter  = 5,
-    min_samps   = 8,
-    corr_method = "spearman",
-    top_n       = 5,
-    make_plots  = FALSE
-  )
-  
-  dmso_VH02001618 <- select_top_replicates_tmmcpm(
-    batch3_mac_filtered,
-    combined_id = "DMSO_0",
-    orig_ident  = "VH02001618",
-    cpm_filter  = 5,
-    min_samps   = 8,
-    corr_method = "spearman",
-    top_n       = 5,
-    make_plots  = FALSE
-  )
-  }
-)
-
res1
-
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                Function_Call
-## 1 {dmso_VH02001612<-select_top_replicates_tmmcpm(batch3_mac_filtered,combined_id="DMSO_0",orig_ident="VH02001612",cpm_filter=5,min_samps=8,corr_method="spearman",top_n=5,make_plots=FALSE)dmso_VH02001614<-select_top_replicates_tmmcpm(batch3_mac_filtered,combined_id="DMSO_0",orig_ident="VH02001614",cpm_filter=5,min_samps=8,corr_method="spearman",top_n=5,make_plots=FALSE)dmso_VH02001618<-select_top_replicates_tmmcpm(batch3_mac_filtered,combined_id="DMSO_0",orig_ident="VH02001618",cpm_filter=5,min_samps=8,corr_method="spearman",top_n=5,make_plots=FALSE)}
-##   Elapsed_Time_sec Total_RAM_Used_MiB Peak_RAM_Used_MiB
-## 1             2.52               37.3              2406
-
res2
-
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                Function_Call
-## 1 {dmso_VH02001612<-select_top_replicates_tmmcpm(batch3_mac_filtered,combined_id="DMSO_0",orig_ident="VH02001612",cpm_filter=5,min_samps=8,corr_method="spearman",top_n=5,make_plots=FALSE)dmso_VH02001614<-select_top_replicates_tmmcpm(batch3_mac_filtered,combined_id="DMSO_0",orig_ident="VH02001614",cpm_filter=5,min_samps=8,corr_method="spearman",top_n=5,make_plots=FALSE)dmso_VH02001618<-select_top_replicates_tmmcpm(batch3_mac_filtered,combined_id="DMSO_0",orig_ident="VH02001618",cpm_filter=5,min_samps=8,corr_method="spearman",top_n=5,make_plots=FALSE)}
-##   Elapsed_Time_sec Total_RAM_Used_MiB Peak_RAM_Used_MiB
-## 1              2.3                  0              2403
-
res3
-
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                Function_Call
-## 1 {dmso_VH02001612<-select_top_replicates_tmmcpm(batch3_mac_filtered,combined_id="DMSO_0",orig_ident="VH02001612",cpm_filter=5,min_samps=8,corr_method="spearman",top_n=5,make_plots=FALSE)dmso_VH02001614<-select_top_replicates_tmmcpm(batch3_mac_filtered,combined_id="DMSO_0",orig_ident="VH02001614",cpm_filter=5,min_samps=8,corr_method="spearman",top_n=5,make_plots=FALSE)dmso_VH02001618<-select_top_replicates_tmmcpm(batch3_mac_filtered,combined_id="DMSO_0",orig_ident="VH02001618",cpm_filter=5,min_samps=8,corr_method="spearman",top_n=5,make_plots=FALSE)}
-##   Elapsed_Time_sec Total_RAM_Used_MiB Peak_RAM_Used_MiB
-## 1              2.3                  0              2403
-
if (!requireNamespace("ps", quietly = TRUE)) install.packages("ps")
-get_rss_mb <- function() ps::ps_memory_info(ps::ps_handle())[1] / 1024^2
-
-rss_before <- get_rss_mb()
-tm1 <- system.time({
-  dmso_VH02001612 <- select_top_replicates_tmmcpm(
-    batch3_mac_filtered,
-    combined_id = "DMSO_0",
-    orig_ident  = "VH02001612",
-    cpm_filter  = 5,
-    min_samps   = 8,
-    corr_method = "spearman",
-    top_n       = 5,
-    make_plots  = FALSE
-  )
-  dmso_VH02001614 <- select_top_replicates_tmmcpm(
-    batch3_mac_filtered,
-    combined_id = "DMSO_0",
-    orig_ident  = "VH02001614",
-    cpm_filter  = 5,
-    min_samps   = 8,
-    corr_method = "spearman",
-    top_n       = 5,
-    make_plots  = FALSE
-  )
-  
-  dmso_VH02001618 <- select_top_replicates_tmmcpm(
-    batch3_mac_filtered,
-    combined_id = "DMSO_0",
-    orig_ident  = "VH02001618",
-    cpm_filter  = 5,
-    min_samps   = 8,
-    corr_method = "spearman",
-    top_n       = 5,
-    make_plots  = FALSE
-  )
-})
-rss_after  <- get_rss_mb()
-cat(sprintf("Elapsed: %.1fs | RSS before→after: %.0f→%.0f MiB (Δ %.0f MiB)\n",
-            tm1["elapsed"], rss_before, rss_after, rss_after - rss_before))
-
## Elapsed: 2.3s | RSS before→after: 3168→4695 MiB (Δ 1528 MiB)
-
get_rss_mb <- function() ps::ps_memory_info(ps::ps_handle())[1] / 1024^2
-
-rss_before <- get_rss_mb()
-tm2 <- system.time({
-  dmso_VH02001612 <- select_top_replicates_tmmcpm(
-    batch3_mac_filtered,
-    combined_id = "DMSO_0",
-    orig_ident  = "VH02001612",
-    cpm_filter  = 5,
-    min_samps   = 8,
-    corr_method = "spearman",
-    top_n       = 5,
-    make_plots  = FALSE
-  )
-  dmso_VH02001614 <- select_top_replicates_tmmcpm(
-    batch3_mac_filtered,
-    combined_id = "DMSO_0",
-    orig_ident  = "VH02001614",
-    cpm_filter  = 5,
-    min_samps   = 8,
-    corr_method = "spearman",
-    top_n       = 5,
-    make_plots  = FALSE
-  )
-  
-  dmso_VH02001618 <- select_top_replicates_tmmcpm(
-    batch3_mac_filtered,
-    combined_id = "DMSO_0",
-    orig_ident  = "VH02001618",
-    cpm_filter  = 5,
-    min_samps   = 8,
-    corr_method = "spearman",
-    top_n       = 5,
-    make_plots  = FALSE
-  )
-})
-rss_after  <- get_rss_mb()
-cat(sprintf("Elapsed: %.1fs | RSS before→after: %.0f→%.0f MiB (Δ %.0f MiB)\n",
-            tm2["elapsed"], rss_before, rss_after, rss_after - rss_before))
-
## Elapsed: 2.3s | RSS before→after: 4694→4756 MiB (Δ 62 MiB)
-
get_rss_mb <- function() ps::ps_memory_info(ps::ps_handle())[1] / 1024^2
-
-rss_before <- get_rss_mb()
-tm3 <- system.time({
-  dmso_VH02001612 <- select_top_replicates_tmmcpm(
-    batch3_mac_filtered,
-    combined_id = "DMSO_0",
-    orig_ident  = "VH02001612",
-    cpm_filter  = 5,
-    min_samps   = 8,
-    corr_method = "spearman",
-    top_n       = 5,
-    make_plots  = FALSE
-  )
-  dmso_VH02001614 <- select_top_replicates_tmmcpm(
-    batch3_mac_filtered,
-    combined_id = "DMSO_0",
-    orig_ident  = "VH02001614",
-    cpm_filter  = 5,
-    min_samps   = 8,
-    corr_method = "spearman",
-    top_n       = 5,
-    make_plots  = FALSE
-  )
-  
-  dmso_VH02001618 <- select_top_replicates_tmmcpm(
-    batch3_mac_filtered,
-    combined_id = "DMSO_0",
-    orig_ident  = "VH02001618",
-    cpm_filter  = 5,
-    min_samps   = 8,
-    corr_method = "spearman",
-    top_n       = 5,
-    make_plots  = FALSE
-  )
-})
-rss_after  <- get_rss_mb()
-cat(sprintf("Elapsed: %.1fs | RSS before→after: %.0f→%.0f MiB (Δ %.0f MiB)\n",
-            tm3["elapsed"], rss_before, rss_after, rss_after - rss_before))
-
## Elapsed: 2.3s | RSS before→after: 4756→4514 MiB (Δ -242 MiB)
-
tm1
-
##    user  system elapsed 
-##   2.208   0.119   2.328
-
tm2
-
##    user  system elapsed 
-##   2.211   0.129   2.341
-
tm3
-
##    user  system elapsed 
-##   2.199   0.123   2.346
-
-

Summary of runtime and memory usage

-

The select_top_replicates_tmmcpm function runs on same -plates (VH02001612, VH02001614, VH02001618) in approximately 2.463 -seconds for three plates, with a peak memory usage of around 2403 Mib. -This demonstrates that the function is efficient and can be used -effectively for selecting DMSO controls in a reasonable timeframe -without excessive memory consumption.

-
- - - -
-
- -
- - - - - - - - - - - - - - - - diff --git a/vignettes/benchmark_DMSOselection.Rmd b/vignettes/benchmark_DMSOselection.Rmd deleted file mode 100644 index 287874c..0000000 --- a/vignettes/benchmark_DMSOselection.Rmd +++ /dev/null @@ -1,422 +0,0 @@ ---- -title: "Benchmark: DMSO control selection" -output: - rmarkdown::html_document: - theme: flatly - toc: true - toc_float: true - toc_depth: 5 -vignette: > - %\VignetteIndexEntry{benchmarking DMSO control selection} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} - %\VignetteBuild{true} ---- - -```{r set_wd, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -dir <- "/Users/liuxin/macpie_Dev/" -devtools::load_all(paste0(dir, "macpie/")) - -``` - - -```{r setup} - -suppressMessages(library(macpie)) -suppressMessages(library(tibble)) -suppressMessages(library(stringr)) -suppressMessages(library(pheatmap)) -suppressMessages(library(ggiraph)) -suppressMessages(library(tidyseurat)) -suppressMessages(library(purrr)) -suppressMessages(library(ggplot2)) - -``` - -```{r} -options(scipen=999, digits=3) -``` - - - -## DRUGseq DMSO control selection - - -First we load the file of good DMSO control wells identified by the DRUGseq authors in their github repo. - -Here, we show the good DMSO wells for batch 24, which contains the three plates we are interested in (VH02012942, VH02012944, VH02012956). - -```{r} -good_DMSO_DRUGseq <- read.csv(paste0(dir, "DRUGseqData/robust_RC_ReferenceControl_DMSO_wells.txt"), sep="") -good_DMSO_DRUGseq %>% filter(batch_id==24) %>% select(batch_id, plate_barcode, well_id) -``` - -As from their results of good DMSO controls, these are the good DMSO wells for batch 24: - - VH02012942: I23, M23 - - VH02012944: D23, H23 - - VH02012956: I23, J23 - - -*** -
- - - -## macpie DMSO control selection - - -Now we will use a function `select_top_replicates_tmmcpm` to identify good DMSO controls from batch 24. - -This function: - - - applies CPM filtering, - - - performs TMMwsp normalisation and computes log-CPM, - - - ranks wells by their mean Fisher-z–transformed sample–sample correlation to all other wells (Pearson or Spearman) - - - selects the top n wells (user-defined) as good controls. - -```{r} -select_top_replicates_tmmcpm <- function( - data, - combined_id, # e.g. "CB_43_EP73_0" - orig_ident, # e.g. "VH02012942" - cpm_filter = 1, # CPM threshold for gene filtering - min_samps = 16, # number of samples a gene must be expressed in - corr_method = c("spearman","pearson"), - top_n = 5, - make_plots = TRUE -){ - - corr_method <- match.arg(corr_method) - - # Subset to the plate/condition of interest - subgroup <- subset(data, subset = combined_id == !!combined_id & orig.ident == !!orig_ident) - - # Counts and human-friendly column names - counts_d <- GetAssayData(subgroup, assay = "RNA", layer = "counts") - well_colnames <- paste0(subgroup$orig.ident, "_", subgroup$Well_ID) - names(well_colnames) <- rownames(subgroup@meta.data) - colnames(counts_d) <- well_colnames[colnames(counts_d)] - - # edgeR container + gene filtering - y <- edgeR::DGEList(counts_d, group = subgroup$orig.ident) - keep <- rowSums(edgeR::cpm(y) > cpm_filter) >= min_samps - y <- y[keep, , keep.lib.sizes = FALSE] - - # TMMwsp → log2-CPM - y <- edgeR::calcNormFactors(y, method = "TMMwsp") - log_cpm_tmm <- edgeR::cpm(y, log = TRUE, normalized.lib.sizes = TRUE) - - # Long data for boxplot - df_long <- as.data.frame(log_cpm_tmm) |> - tibble::rownames_to_column(var = "gene") |> - tidyr::pivot_longer(-gene, names_to = "sample", values_to = "log_cpm") - - if (make_plots) { - # Boxplot - print( - ggplot2::ggplot(df_long, ggplot2::aes(x = sample, y = log_cpm)) + - ggplot2::geom_boxplot(outlier.size = 0.5) + - ggplot2::labs(x = "Sample", y = "log2 CPM", - title = "Boxplot of log2-CPM (TMMwsp)") + - ggplot2::theme_classic() + - ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)) - ) - } - - # Correlation matrices - cors_pear <- stats::cor(log_cpm_tmm, method = "pearson") - cors_spear <- stats::cor(log_cpm_tmm, method = "spearman") - - if (make_plots) { - pheatmap::pheatmap(cors_pear, main = "Sample–sample correlation (Pearson, log2-CPM)") - pheatmap::pheatmap(cors_spear, main = "Sample–sample correlation (Spearman, log2-CPM)") - } - - # Ranking by mean Fisher-z correlation to all *other* samples - R <- stats::cor(log_cpm_tmm, method = corr_method, use = "pairwise.complete.obs") - diag(R) <- NA_real_ - - # clip, Fisher z, average, back-transform - Z <- atanh(pmin(pmax(R, -0.999999), 0.999999)) - score_z <- rowMeans(Z, na.rm = TRUE) - score_r <- tanh(score_z) - - # top-N names and scores - ord <- order(score_r, decreasing = TRUE, na.last = NA) - srt <- score_r[ord] - cutoff <- srt[top_n] # N-th best score - keep <- srt >= cutoff # keep all tied at the cutoff - topN <- srt[keep] - - # return everything useful - list( - subset_obj = subgroup, - dge = y, - log_cpm_tmm = log_cpm_tmm, - boxplot_df = df_long, - cor_pearson = cors_pear, - cor_spearman = cors_spear, - ranking_method = corr_method, - scores_mean_to_others = sort(score_r, decreasing = TRUE), - topN = topN - ) -} - - -``` - -```{r filter_genes_batch24} -# mac_filtered <- filter_genes_by_expression(as_mac, -# group_by = "combined_id", min_counts = 10, -# min_samples = min_sample_num) - -# saveRDS(mac_filtered, -# file = paste0(dir, "DRUGseqData/macpie_filtered_batch24.Rds")) - - -mac_filtered <- readRDS(paste0(dir, "/DRUGseqData/macpie_filtered_batch24.Rds")) -``` - - - -```{r violin_plots_batch24, fig.width=10, fig.height=6} -mac_filtered[["percent.mt"]] <- PercentageFeatureSet(mac_filtered, pattern = "^mt-|^MT-") -mac_filtered[["percent.ribo"]] <- PercentageFeatureSet(mac_filtered, pattern = "^Rp[slp][[:digit:]]|^Rpsa|^RP[SLP][[:digit:]]|^RPSA") -``` - - -```{r} -mac_filtered$combined_id <- str_replace_all(mac_filtered$combined_id, "-","_") -``` - - -### RLE plot - -RLE plot to quickly visualise raw counts of DMSO wells in batch 24 plates - -```{r rle_plot_DMSO_batch24, fig.width=10, fig.height=6} -# mac_filtered_cp <- mac_filtered %>% filter(combined_id %in% c("CB_43_EP73_0","FF_86_NH56_10")) -mac_filtered_cp <- mac_filtered %>% filter(combined_id %in% c("CB_43_EP73_0")) -plot_rle(subset(mac_filtered_cp, subset=orig.ident=="VH02012942"), label_column = "Well_ID", normalisation = "raw") -plot_rle(subset(mac_filtered_cp, subset=orig.ident=="VH02012944"), label_column = "Well_ID", normalisation = "raw") -plot_rle(subset(mac_filtered_cp, subset=orig.ident=="VH02012956"), label_column = "Well_ID", normalisation = "raw") - -``` - - -From the RLE plots above, we can see that there are some level of variation among DMSO wells in each plate. It's important to select the best DMSO wells as controls for downstream analysis. - -### Select good DMSO controls using macpie - -Now we will apply the `select_top_replicates_tmmcpm` function to each of the three plates in batch 24 to identify good DMSO controls. - -This function generates three plots: - - - Boxplot of log2-CPM (TMMwsp) - - - Sample–sample correlation (Pearson, log2-CPM) - - - Sample–sample correlation (Spearman, log2-CPM) - - These plots help to visualize the distribution of gene expression and the correlation between samples, aiding in the assessment of DMSO control quality. - - -#### Plate VH02012942 - -```{r} -dmso_VH02012942 <- select_top_replicates_tmmcpm( - mac_filtered, - combined_id = "CB_43_EP73_0", - orig_ident = "VH02012942", - cpm_filter = 1, - min_samps = 8, - corr_method = "spearman", - top_n = 5, - make_plots = TRUE -) -``` - -Apart from the boxplot and correlation heatmaps, the function also returns a ranking of wells by their mean correlation to all other wells. - -```{r} -dmso_VH02012942$scores_mean_to_others -``` - -Finally, it returns the top N wells as good DMSO controls. - -```{r} -dmso_VH02012942$topN -``` - -Now we can see for plate VH02012942, the 2 of top 5 DMSO wells selected by macpie are I23 and J23, which are exactly the same as the good DMSO wells identified by the DRUGseq authors. - - -Let's repeat the same process for the other two plates in batch 24. - -#### Plate VH02012944 - - -```{r} -dmso_VH02012944 <- select_top_replicates_tmmcpm( - mac_filtered, - combined_id = "CB_43_EP73_0", - orig_ident = "VH02012944", - cpm_filter = 1, - min_samps = 8, - corr_method = "spearman", - top_n = 5, - make_plots = TRUE -) -``` - -```{r} -dmso_VH02012944$scores_mean_to_others -``` -```{r} -dmso_VH02012944$topN -``` - -For plate VH02012944, the DRUGseq selected D23 and H23 DMSO wells are not in our top 5. However, H23 is ranked 6th by macpie, which is very close to the top 5. - - -#### Plate VH02012956 - - -```{r} -dmso_VH02012956 <- select_top_replicates_tmmcpm( - mac_filtered, - combined_id = "CB_43_EP73_0", - orig_ident = "VH02012956", - cpm_filter = 1, - min_samps = 8, - corr_method = "spearman", - top_n = 5, - make_plots = TRUE -) -``` - -```{r} -dmso_VH02012956$scores_mean_to_others -``` - -```{r} -dmso_VH02012956$topN -``` - -For plate VH02012956, the DRUGseq selected I23 and J23 DMSO wells are exactly the same as our top DMSO wells selected by macpie. - - - -```{r} -library(peakRAM) -library(ps) -``` - -### Runtime and Memory - -Now we will measure the runtime and memory usage of the `select_top_replicates_tmmcpm` function using the `peakRAM` package. - - -```{r} -res <- peakRAM({ - dmso_VH02012942 <- select_top_replicates_tmmcpm( - mac_filtered, - combined_id = "CB_43_EP73_0", - orig_ident = "VH02012942", - cpm_filter = 1, - min_samps = 8, - corr_method = "spearman", - top_n = 5, - make_plots = FALSE - ) - dmso_VH02012944 <- select_top_replicates_tmmcpm( - mac_filtered, - combined_id = "CB_43_EP73_0", - orig_ident = "VH02012944", - cpm_filter = 1, - min_samps = 8, - corr_method = "spearman", - top_n = 5, - make_plots = FALSE - ) - - dmso_VH02012956 <- select_top_replicates_tmmcpm( - mac_filtered, - combined_id = "CB_43_EP73_0", - orig_ident = "VH02012956", - cpm_filter = 1, - min_samps = 8, - corr_method = "spearman", - top_n = 5, - make_plots = FALSE - ) - } -) - -res -``` - - -```{r} -if (!requireNamespace("ps", quietly = TRUE)) install.packages("ps") -get_rss_mb <- function() ps::ps_memory_info(ps::ps_handle())[1] / 1024^2 - -rss_before <- get_rss_mb() -tm <- system.time({ - dmso_VH02012942 <- select_top_replicates_tmmcpm( - mac_filtered, - combined_id = "CB_43_EP73_0", - orig_ident = "VH02012942", - cpm_filter = 1, - min_samps = 8, - corr_method = "spearman", - top_n = 5, - make_plots = FALSE - ) - dmso_VH02012944 <- select_top_replicates_tmmcpm( - mac_filtered, - combined_id = "CB_43_EP73_0", - orig_ident = "VH02012944", - cpm_filter = 1, - min_samps = 8, - corr_method = "spearman", - top_n = 5, - make_plots = FALSE - ) - - dmso_VH02012956 <- select_top_replicates_tmmcpm( - mac_filtered, - combined_id = "CB_43_EP73_0", - orig_ident = "VH02012956", - cpm_filter = 1, - min_samps = 8, - corr_method = "spearman", - top_n = 5, - make_plots = FALSE - ) -}) -rss_after <- get_rss_mb() -cat(sprintf("Elapsed: %.1fs | RSS before→after: %.0f→%.0f MiB (Δ %.0f MiB)\n", - tm["elapsed"], rss_before, rss_after, rss_after - rss_before)) - -``` - - -### Summary - -In summary, for the three plates in batch 24, macpie successfully identified 4 out of 6 good DMSO wells that were also selected by the DRUGseq authors. Only for plate VH02012944, one of the DRUGseq selected DMSO wells (D23) was not in our top 5, but the other well (H23) was ranked 6th by macpie, which is very close to the top 5. This demonstrates that macpie is effective in selecting high-quality DMSO controls for downstream analysis without running permutation tests, making it a computationally efficient choice. - -This function runs for each plate, it does not take into account any batches or plates. If it's a cross-plates design, it is recommended to either compute within-plate or remove plate effects (e.g. using ComBat, limma removeBatchEffect functions) first. - -The select_top_replicates_tmmcpm function runs in approximately 1.38 seconds for three plates, with a peak memory usage of around 313 Mib. This demonstrates that the function is efficient and can be used effectively for selecting DMSO controls in a reasonable timeframe without excessive memory consumption. - - - - diff --git a/vignettes/benchmark_DMSOselection.html b/vignettes/benchmark_DMSOselection.html deleted file mode 100644 index 9064b5d..0000000 --- a/vignettes/benchmark_DMSOselection.html +++ /dev/null @@ -1,2017 +0,0 @@ - - - - - - - - - - - - - -Benchmark: DMSO control selection - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - -
-
-
-
-
- -
- - - - - - - -
suppressMessages(library(macpie))
-suppressMessages(library(tibble))
-suppressMessages(library(stringr))
-suppressMessages(library(pheatmap))
-suppressMessages(library(ggiraph))
-suppressMessages(library(tidyseurat))
-suppressMessages(library(purrr))
-suppressMessages(library(ggplot2))
-
options(scipen=999, digits=3)
-
-

DRUGseq DMSO control selection

-

First we load the file of good DMSO control wells identified by the -DRUGseq authors in their github repo.

-

Here, we show the good DMSO wells for batch 24, which contains the -three plates we are interested in (VH02012942, VH02012944, -VH02012956).

-
good_DMSO_DRUGseq <- read.csv(paste0(dir, "DRUGseqData/robust_RC_ReferenceControl_DMSO_wells.txt"), sep="")
-good_DMSO_DRUGseq %>% filter(batch_id==24) %>% select(batch_id, plate_barcode, well_id)
-
##   batch_id plate_barcode well_id
-## 1       24    VH02012942     I23
-## 2       24    VH02012942     M23
-## 3       24    VH02012944     D23
-## 4       24    VH02012944     H23
-## 5       24    VH02012956     I23
-## 6       24    VH02012956     J23
-

As from their results of good DMSO controls, these are the good DMSO -wells for batch 24:

-

VH02012942: I23, M23

-

VH02012944: D23, H23

-

VH02012956: I23, J23

-
-


-
-
-

macpie DMSO control selection

-

Now we will use a function select_top_replicates_tmmcpm -to identify good DMSO controls from batch 24.

-

This function:

-
    -
  • applies CPM filtering,

  • -
  • performs TMMwsp normalisation and computes log-CPM,

  • -
  • ranks wells by their mean Fisher-z–transformed sample–sample -correlation to all other wells (Pearson or Spearman)

  • -
  • selects the top n wells (user-defined) as good controls.

  • -
-
select_top_replicates_tmmcpm <- function(
-  data,
-  combined_id,                 # e.g. "CB_43_EP73_0"
-  orig_ident,                  # e.g. "VH02012942"
-  cpm_filter    = 1,           # CPM threshold for gene filtering
-  min_samps     = 16,          # number of samples a gene must be expressed in
-  corr_method   = c("spearman","pearson"),
-  top_n         = 5,
-  make_plots    = TRUE
-){
-  
-  corr_method <- match.arg(corr_method)
-
-  # Subset to the plate/condition of interest
-  subgroup <- subset(data, subset = combined_id == !!combined_id & orig.ident == !!orig_ident)
-
-  # Counts and human-friendly column names
-  counts_d <- GetAssayData(subgroup, assay = "RNA", layer = "counts")
-  well_colnames <- paste0(subgroup$orig.ident, "_", subgroup$Well_ID)
-  names(well_colnames) <- rownames(subgroup@meta.data)
-  colnames(counts_d) <- well_colnames[colnames(counts_d)]
-
-  # edgeR container + gene filtering
-  y <- edgeR::DGEList(counts_d, group = subgroup$orig.ident)
-  keep <- rowSums(edgeR::cpm(y) > cpm_filter) >= min_samps
-  y <- y[keep, , keep.lib.sizes = FALSE]
-
-  # TMMwsp → log2-CPM
-  y <- edgeR::calcNormFactors(y, method = "TMMwsp")
-  log_cpm_tmm <- edgeR::cpm(y, log = TRUE, normalized.lib.sizes = TRUE)
-
-  #  Long data for boxplot
-  df_long <- as.data.frame(log_cpm_tmm) |>
-    tibble::rownames_to_column(var = "gene") |>
-    tidyr::pivot_longer(-gene, names_to = "sample", values_to = "log_cpm")
-
-  if (make_plots) {
-    # Boxplot
-    print(
-      ggplot2::ggplot(df_long, ggplot2::aes(x = sample, y = log_cpm)) +
-        ggplot2::geom_boxplot(outlier.size = 0.5) +
-        ggplot2::labs(x = "Sample", y = "log2 CPM",
-                      title = "Boxplot of log2-CPM (TMMwsp)") +
-        ggplot2::theme_classic() +
-        ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))
-    )
-  }
-
-  # Correlation matrices
-  cors_pear  <- stats::cor(log_cpm_tmm, method = "pearson")
-  cors_spear <- stats::cor(log_cpm_tmm, method = "spearman")
-
-  if (make_plots) {
-    pheatmap::pheatmap(cors_pear,  main = "Sample–sample correlation (Pearson, log2-CPM)")
-    pheatmap::pheatmap(cors_spear, main = "Sample–sample correlation (Spearman, log2-CPM)")
-  }
-
-  # Ranking by mean Fisher-z correlation to all *other* samples
-  R <- stats::cor(log_cpm_tmm, method = corr_method, use = "pairwise.complete.obs")
-  diag(R) <- NA_real_
-
-  # clip, Fisher z, average, back-transform
-  Z <- atanh(pmin(pmax(R, -0.999999), 0.999999))
-  score_z <- rowMeans(Z, na.rm = TRUE)
-  score_r <- tanh(score_z)
-
-  # top-N names and scores
-  ord  <- order(score_r, decreasing = TRUE, na.last = NA)
-  srt <- score_r[ord]
-  cutoff <- srt[top_n]              # N-th best score
-  keep   <- srt >= cutoff    # keep all tied at the cutoff
-  topN <- srt[keep]
-
-  # return everything useful
-  list(
-    subset_obj   = subgroup,
-    dge          = y,
-    log_cpm_tmm  = log_cpm_tmm,
-    boxplot_df   = df_long,
-    cor_pearson  = cors_pear,
-    cor_spearman = cors_spear,
-    ranking_method = corr_method,
-    scores_mean_to_others = sort(score_r, decreasing = TRUE),
-    topN = topN
-  )
-}
-
# mac_filtered <- filter_genes_by_expression(as_mac,
-#                 group_by = "combined_id", min_counts = 10,
-#                 min_samples = min_sample_num)
-
-# saveRDS(mac_filtered,
-# file = paste0(dir, "DRUGseqData/macpie_filtered_batch24.Rds"))
-
-
-mac_filtered <- readRDS(paste0(dir, "/DRUGseqData/macpie_filtered_batch24.Rds"))
-
mac_filtered[["percent.mt"]] <- PercentageFeatureSet(mac_filtered, pattern = "^mt-|^MT-")
-mac_filtered[["percent.ribo"]] <- PercentageFeatureSet(mac_filtered, pattern = "^Rp[slp][[:digit:]]|^Rpsa|^RP[SLP][[:digit:]]|^RPSA")
-
mac_filtered$combined_id <- str_replace_all(mac_filtered$combined_id, "-","_")
-
-

RLE plot

-

RLE plot to quickly visualise raw counts of DMSO wells in batch 24 -plates

-
# mac_filtered_cp <- mac_filtered %>% filter(combined_id %in% c("CB_43_EP73_0","FF_86_NH56_10"))
-mac_filtered_cp <- mac_filtered %>% filter(combined_id %in% c("CB_43_EP73_0"))
-plot_rle(subset(mac_filtered_cp, subset=orig.ident=="VH02012942"), label_column = "Well_ID", normalisation = "raw")
-
## tidyseurat says: Key columns are missing. A data frame is returned for independent data analysis.
-

-
plot_rle(subset(mac_filtered_cp, subset=orig.ident=="VH02012944"), label_column = "Well_ID", normalisation = "raw")
-
## tidyseurat says: Key columns are missing. A data frame is returned for independent data analysis.
-

-
plot_rle(subset(mac_filtered_cp, subset=orig.ident=="VH02012956"), label_column = "Well_ID", normalisation = "raw")
-
## tidyseurat says: Key columns are missing. A data frame is returned for independent data analysis.
-

-

From the RLE plots above, we can see that there are some level of -variation among DMSO wells in each plate. It’s important to select the -best DMSO wells as controls for downstream analysis.

-
-
-

Select good DMSO controls using macpie

-

Now we will apply the select_top_replicates_tmmcpm -function to each of the three plates in batch 24 to identify good DMSO -controls.

-

This function generates three plots:

-
    -
  • Boxplot of log2-CPM (TMMwsp)

  • -
  • Sample–sample correlation (Pearson, log2-CPM)

  • -
  • Sample–sample correlation (Spearman, log2-CPM)

  • -
-

These plots help to visualize the distribution of gene expression and -the correlation between samples, aiding in the assessment of DMSO -control quality.

-
-

Plate VH02012942

-
dmso_VH02012942 <- select_top_replicates_tmmcpm(
-  mac_filtered,
-  combined_id = "CB_43_EP73_0",
-  orig_ident  = "VH02012942",
-  cpm_filter  = 1,
-  min_samps   = 8,
-  corr_method = "spearman",
-  top_n       = 5,
-  make_plots  = TRUE
-)
-
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
-## conversion failure on 'Sample–sample correlation (Pearson, log2-CPM)' in
-## 'mbcsToSbcs': dot substituted for <e2>
-
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
-## conversion failure on 'Sample–sample correlation (Pearson, log2-CPM)' in
-## 'mbcsToSbcs': dot substituted for <80>
-
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
-## conversion failure on 'Sample–sample correlation (Pearson, log2-CPM)' in
-## 'mbcsToSbcs': dot substituted for <93>
-

-
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
-## conversion failure on 'Sample–sample correlation (Spearman, log2-CPM)' in
-## 'mbcsToSbcs': dot substituted for <e2>
-
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
-## conversion failure on 'Sample–sample correlation (Spearman, log2-CPM)' in
-## 'mbcsToSbcs': dot substituted for <80>
-
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
-## conversion failure on 'Sample–sample correlation (Spearman, log2-CPM)' in
-## 'mbcsToSbcs': dot substituted for <93>
-

-

Apart from the boxplot and correlation heatmaps, the function also -returns a ranking of wells by their mean correlation to all other -wells.

-
dmso_VH02012942$scores_mean_to_others
-
## VH02012942_K23 VH02012942_M23 VH02012942_J23 VH02012942_I23 VH02012942_C23 
-##          0.878          0.877          0.875          0.874          0.872 
-## VH02012942_N23 VH02012942_L23 VH02012942_E23 VH02012942_G23 VH02012942_H23 
-##          0.870          0.869          0.869          0.866          0.863 
-## VH02012942_O23 VH02012942_D23 VH02012942_P23 VH02012942_F23 VH02012942_B23 
-##          0.860          0.858          0.858          0.851          0.849 
-## VH02012942_A23 
-##          0.843
-

Finally, it returns the top N wells as good DMSO controls.

-
dmso_VH02012942$topN
-
## VH02012942_K23 VH02012942_M23 VH02012942_J23 VH02012942_I23 VH02012942_C23 
-##          0.878          0.877          0.875          0.874          0.872
-

Now we can see for plate VH02012942, the 2 of top 5 DMSO wells -selected by macpie are I23 and J23, which are exactly the same as the -good DMSO wells identified by the DRUGseq authors.

-

Let’s repeat the same process for the other two plates in batch -24.

-
-
-

Plate VH02012944

-
dmso_VH02012944 <- select_top_replicates_tmmcpm(
-  mac_filtered,
-  combined_id = "CB_43_EP73_0",
-  orig_ident  = "VH02012944",
-  cpm_filter  = 1,
-  min_samps   = 8,
-  corr_method = "spearman",
-  top_n       = 5,
-  make_plots  = TRUE
-)
-
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
-## conversion failure on 'Sample–sample correlation (Pearson, log2-CPM)' in
-## 'mbcsToSbcs': dot substituted for <e2>
-
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
-## conversion failure on 'Sample–sample correlation (Pearson, log2-CPM)' in
-## 'mbcsToSbcs': dot substituted for <80>
-
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
-## conversion failure on 'Sample–sample correlation (Pearson, log2-CPM)' in
-## 'mbcsToSbcs': dot substituted for <93>
-

-
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
-## conversion failure on 'Sample–sample correlation (Spearman, log2-CPM)' in
-## 'mbcsToSbcs': dot substituted for <e2>
-
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
-## conversion failure on 'Sample–sample correlation (Spearman, log2-CPM)' in
-## 'mbcsToSbcs': dot substituted for <80>
-
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
-## conversion failure on 'Sample–sample correlation (Spearman, log2-CPM)' in
-## 'mbcsToSbcs': dot substituted for <93>
-

-
dmso_VH02012944$scores_mean_to_others
-
## VH02012944_I23 VH02012944_M23 VH02012944_J23 VH02012944_G23 VH02012944_K23 
-##          0.906          0.904          0.903          0.902          0.901 
-## VH02012944_H23 VH02012944_C23 VH02012944_L23 VH02012944_E23 VH02012944_D23 
-##          0.900          0.899          0.898          0.897          0.897 
-## VH02012944_O23 VH02012944_F23 VH02012944_N23 VH02012944_P23 VH02012944_A23 
-##          0.897          0.897          0.895          0.890          0.887 
-## VH02012944_B23 
-##          0.880
-
dmso_VH02012944$topN
-
## VH02012944_I23 VH02012944_M23 VH02012944_J23 VH02012944_G23 VH02012944_K23 
-##          0.906          0.904          0.903          0.902          0.901
-

For plate VH02012944, the DRUGseq selected D23 and H23 DMSO wells are -not in our top 5. However, H23 is ranked 6th by macpie, which is very -close to the top 5.

-
-
-

Plate VH02012956

-
dmso_VH02012956 <- select_top_replicates_tmmcpm(
-  mac_filtered,
-  combined_id = "CB_43_EP73_0",
-  orig_ident  = "VH02012956",
-  cpm_filter  = 1,
-  min_samps   = 8,
-  corr_method = "spearman",
-  top_n       = 5,
-  make_plots  = TRUE
-)
-
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
-## conversion failure on 'Sample–sample correlation (Pearson, log2-CPM)' in
-## 'mbcsToSbcs': dot substituted for <e2>
-
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
-## conversion failure on 'Sample–sample correlation (Pearson, log2-CPM)' in
-## 'mbcsToSbcs': dot substituted for <80>
-
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
-## conversion failure on 'Sample–sample correlation (Pearson, log2-CPM)' in
-## 'mbcsToSbcs': dot substituted for <93>
-

-
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
-## conversion failure on 'Sample–sample correlation (Spearman, log2-CPM)' in
-## 'mbcsToSbcs': dot substituted for <e2>
-
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
-## conversion failure on 'Sample–sample correlation (Spearman, log2-CPM)' in
-## 'mbcsToSbcs': dot substituted for <80>
-
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
-## conversion failure on 'Sample–sample correlation (Spearman, log2-CPM)' in
-## 'mbcsToSbcs': dot substituted for <93>
-

-
dmso_VH02012956$scores_mean_to_others
-
## VH02012956_J23 VH02012956_O23 VH02012956_M23 VH02012956_I23 VH02012956_K23 
-##          0.912          0.904          0.902          0.902          0.901 
-## VH02012956_N23 VH02012956_G23 VH02012956_H23 VH02012956_L23 VH02012956_P23 
-##          0.901          0.900          0.899          0.899          0.897 
-## VH02012956_F23 VH02012956_C23 VH02012956_D23 VH02012956_E23 VH02012956_A23 
-##          0.894          0.893          0.892          0.891          0.888 
-## VH02012956_B23 
-##          0.886
-
dmso_VH02012956$topN
-
## VH02012956_J23 VH02012956_O23 VH02012956_M23 VH02012956_I23 VH02012956_K23 
-##          0.912          0.904          0.902          0.902          0.901
-

For plate VH02012956, the DRUGseq selected I23 and J23 DMSO wells are -exactly the same as our top DMSO wells selected by macpie.

-
library(peakRAM)
-library(ps)
-
-
-
-

Runtime and Memory

-

Now we will measure the runtime and memory usage of the -select_top_replicates_tmmcpm function using the -peakRAM package.

-
res <- peakRAM({
-  dmso_VH02012942 <- select_top_replicates_tmmcpm(
-    mac_filtered,
-    combined_id = "CB_43_EP73_0",
-    orig_ident  = "VH02012942",
-    cpm_filter  = 1,
-    min_samps   = 8,
-    corr_method = "spearman",
-    top_n       = 5,
-    make_plots  = FALSE
-  )
-  dmso_VH02012944 <- select_top_replicates_tmmcpm(
-    mac_filtered,
-    combined_id = "CB_43_EP73_0",
-    orig_ident  = "VH02012944",
-    cpm_filter  = 1,
-    min_samps   = 8,
-    corr_method = "spearman",
-    top_n       = 5,
-    make_plots  = FALSE
-  )
-  
-  dmso_VH02012956 <- select_top_replicates_tmmcpm(
-    mac_filtered,
-    combined_id = "CB_43_EP73_0",
-    orig_ident  = "VH02012956",
-    cpm_filter  = 1,
-    min_samps   = 8,
-    corr_method = "spearman",
-    top_n       = 5,
-    make_plots  = FALSE
-  )
-  }
-)
-
-res
-
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             Function_Call
-## 1 {dmso_VH02012942<-select_top_replicates_tmmcpm(mac_filtered,combined_id="CB_43_EP73_0",orig_ident="VH02012942",cpm_filter=1,min_samps=8,corr_method="spearman",top_n=5,make_plots=FALSE)dmso_VH02012944<-select_top_replicates_tmmcpm(mac_filtered,combined_id="CB_43_EP73_0",orig_ident="VH02012944",cpm_filter=1,min_samps=8,corr_method="spearman",top_n=5,make_plots=FALSE)dmso_VH02012956<-select_top_replicates_tmmcpm(mac_filtered,combined_id="CB_43_EP73_0",orig_ident="VH02012956",cpm_filter=1,min_samps=8,corr_method="spearman",top_n=5,make_plots=FALSE)}
-##   Elapsed_Time_sec Total_RAM_Used_MiB Peak_RAM_Used_MiB
-## 1             1.34                9.4               313
-
if (!requireNamespace("ps", quietly = TRUE)) install.packages("ps")
-get_rss_mb <- function() ps::ps_memory_info(ps::ps_handle())[1] / 1024^2
-
-rss_before <- get_rss_mb()
-tm <- system.time({
-  dmso_VH02012942 <- select_top_replicates_tmmcpm(
-    mac_filtered,
-    combined_id = "CB_43_EP73_0",
-    orig_ident  = "VH02012942",
-    cpm_filter  = 1,
-    min_samps   = 8,
-    corr_method = "spearman",
-    top_n       = 5,
-    make_plots  = FALSE
-  )
-  dmso_VH02012944 <- select_top_replicates_tmmcpm(
-    mac_filtered,
-    combined_id = "CB_43_EP73_0",
-    orig_ident  = "VH02012944",
-    cpm_filter  = 1,
-    min_samps   = 8,
-    corr_method = "spearman",
-    top_n       = 5,
-    make_plots  = FALSE
-  )
-  
-  dmso_VH02012956 <- select_top_replicates_tmmcpm(
-    mac_filtered,
-    combined_id = "CB_43_EP73_0",
-    orig_ident  = "VH02012956",
-    cpm_filter  = 1,
-    min_samps   = 8,
-    corr_method = "spearman",
-    top_n       = 5,
-    make_plots  = FALSE
-  )
-})
-rss_after  <- get_rss_mb()
-cat(sprintf("Elapsed: %.1fs | RSS before→after: %.0f→%.0f MiB (Δ %.0f MiB)\n",
-            tm["elapsed"], rss_before, rss_after, rss_after - rss_before))
-
## Elapsed: 1.4s | RSS before→after: 2436→2431 MiB (Δ -5 MiB)
-
-
-

Summary

-

In summary, for the three plates in batch 24, macpie successfully -identified 4 out of 6 good DMSO wells that were also selected by the -DRUGseq authors. Only for plate VH02012944, one of the DRUGseq selected -DMSO wells (D23) was not in our top 5, but the other well (H23) was -ranked 6th by macpie, which is very close to the top 5. This -demonstrates that macpie is effective in selecting high-quality DMSO -controls for downstream analysis without running permutation tests, -making it a computationally efficient choice.

-

This function runs for each plate, it does not take into account any -batches or plates. If it’s a cross-plates design, it is recommended to -either compute within-plate or remove plate effects (e.g. using ComBat, -limma removeBatchEffect functions) first.

-

The select_top_replicates_tmmcpm function runs in approximately 1.38 -seconds for three plates, with a peak memory usage of around 313 Mib. -This demonstrates that the function is efficient and can be used -effectively for selecting DMSO controls in a reasonable timeframe -without excessive memory consumption.

-
-
- - - -
-
- -
- - - - - - - - - - - - - - - - diff --git a/vignettes/check_zero_inflation.Rmd b/vignettes/check_zero_inflation.Rmd deleted file mode 100644 index 724ad3b..0000000 --- a/vignettes/check_zero_inflation.Rmd +++ /dev/null @@ -1,411 +0,0 @@ ---- -title: "Check for zero-inflation in your data" -output: - rmarkdown::html_document: - theme: flatly - toc: true - toc_float: true - toc_depth: 5 -vignette: > - %\VignetteIndexEntry{check_zero_inflation} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} - %\VignetteBuild{true} - ---- - - -```{r, include = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>", - digits = 3 -) -options(digits = 3) -``` - - - -## Overview - -This is a quick demonstration of the `check_zeroinflation()` function from our **macpie** package. This function is a fast diagnostic tool to help you assess whether your MACseq data exhibits zero-inflation relative to a negative binomial (NB) model. - -We will use a lightweight convenience wrapper `subsample_genes()` around **`seqgendiff::select_counts()`** to create a smaller object with a subset of genes for faster computation. This function is also part of the **macpie** package. - -Under the hood, `check_zeroinflation()` uses **edgeR** to estimate gene-wise dispersions and compute expected zero probabilities under a NB model. It then compares the observed and expected zero-inflated indexes for each gene within each group defined in the metadata. - -This function: - - - estimates gene-wise tagwise dispersions with edgeR (using all selected groups), - - - builds NB-expected zero probabilities from TMMwsp-scaled means, and - - - returns per-gene ZI (observed zeros minus NB-expected zeros) and per-group summaries (e.g., % genes with ZI > 0.05). ZI-cutoffs are user-defined. - - -The output is a list with two elements: - - - `summary_by_group`: A summary table showing the number and percentage of genes that are zero-inflated for each group at specified cutoffs. - - - `gene_metrics_by_group`: A detailed table with gene-wise metrics, including observed and expected zero numbers and proportions, zero-inflation index (ZI), mean count, and dispersion estimates for each gene in each group. - -**Note**: `check_zeroinflation()` relies on edgeR to estimate dispersion. The current implementation requires ≥2 groups in the design so that edgeR can stabilize gene-wise dispersions across groups. If you only have a single group and still want a design-aware baseline for expected zeros, fit a Gamma–Poisson/NB GLM and compute the expected zero probabilities from its fitted means and over-dispersion. - -*** -
- -```{r set_wd, include=FALSE} -dir <- "/Users/liuxin/macpie_Dev/" -devtools::load_all(paste0(dir, "macpie/")) -``` - -## Load data - -Load data and preprocess before subsampling genes and checking for zero-inflation. - -```{r setup} -#install.packages("macpie") # or devtools::install_github("PMCC-BioinformaticsCore/macpie") -library(macpie) - -# Define project variables -project_name <- "PMMSq033" -project_metadata <- system.file("extdata/PMMSq033_metadata.csv", package = "macpie") -# Load metadata -metadata <- read_metadata(project_metadata) - -``` - -```{r load_data} -# Import raw data -project_rawdata <- paste0(dir, "/macpieData/PMMSq033/raw_matrix") -project_name <- "PMMSq033" -raw_counts <- Read10X(data.dir = project_rawdata) -# Create tidySeurat object -mac <- CreateSeuratObject(counts = raw_counts, - project = project_name, - min.cells = 1, - min.features = 1) -# Join with metadata -mac <- mac %>% - inner_join(metadata, by = c(".cell" = "Barcode")) -# Add unique identifier -mac <- mac %>% - mutate(combined_id = str_c(Compound_ID, Concentration_1, sep = "_")) %>% - mutate(combined_id = gsub(" ", "", .data$combined_id)) -mac <- mac %>% - filter(Project == "Current") - -``` - -Filter genes with very low counts across all samples. This step is important because genes with extremely low expression can lead to unreliable estimates of dispersion and expected zero probabilities, which can skew the zero-inflation assessment. - -```{r} -# Filter by read count per sample group -mac <- filter_genes_by_expression(mac, - group_by = "combined_id", - min_counts = 10, - min_samples = 2) -``` - - -## Subsample genes for faster computation - -This is to randomly select a subset of 1000 genes for a quick check. For a more comprehensive analysis, consider using a larger subset. - -```{r} -# Subsample genes for faster computation -sub_mac <- subsample_genes(mac, ngene = 1000, gselect = "random", seed = 1) -sub_mac %>% nrow() -``` - - -## Check for zero-inflation - -We can now run the zero-inflation check on the subsampled data. - -### DMSO vs Camptothecin - -First, we will compare two groups: "DMSO_0" and "CAMPTOTHECIN_10". - - - -```{r} -zi_results <- check_zeroinflation(sub_mac, - group_by = "combined_id", - samples = c("DMSO_0","CAMPTOTHECIN_10"), - batch = 1, - cutoffs = c(0.1, 0.2)) - -``` - -#### View gene-wise metrics for each group - -For DMSO group: - -```{r} -zi_results$gene_metrics_by_group %>% filter(group=="DMSO_0") %>% head() -``` - -For Camptothecin group: - -```{r} -zi_results$gene_metrics_by_group %>% filter(group=="CAMPTOTHECIN_10") %>% head() -``` - -#### View summary statistics for each group - -```{r} -zi_results$summary_by_group -``` - - -From the summary table, we can see the summary statistics for each group, including the number and percentage of genes that are zero-inflated at the specified cutoffs. - -Each of the columns in the summary table are defined as follows: - - - group: The treatment group - - - n_genes: Total number of genes analyzed in the group - - - n_wells: Total number of wells/samples in the group - - - median_p0_obs: Median observed zero proportion across genes in the group - - - median_p0_nb: Median expected zero proportion under the NB model across genes in the group - - - median_ZI: Median zero-inflation index (ZI = p0_obs - p0_nb for each gene) across genes in the group - - - observed_zeros_num: Number of data points with observed zeros (it shouldn't be more than n_genes*n_wells for each group) - - - expected_zeros_num: Number of data points with expected zeros under the NB model (same here, it shouldn't be more than n_genes*n_wells for each group) - - - pct_ZI_gt_0.1: Percentage of genes with ZI greater than 0.1 - - - pct_ZI_gt_0.2: Percentage of genes with ZI greater than 0.2 - -*** -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -#### See below for interpretation of the results - -For DMSO vs camptothecin, we can see that both DMSO and camptothecin groups have negative median ZI values, indicating that there is no significant zero-inflation in either group. Percentage of genes with ZI greater than 0.1 or 0.2 are 13.5% and 6.6% for camptothecin. As camptothecin acts as a DNA topoisomerase inhibitor and induces DNA damage, it can lead to cell cycle arrest and apoptosis, which may reduce the overall transcriptional activity in the cells. This reduction in transcriptional activity can result in genes being expressed at lower levels or not at all. On the other hand, DMSO is a common solvent used in biological experiments and is not expected to induce significant changes in gene expression. Therefore, it is reasonable to observe a lower level of zero-inflation in the DMSO group compared to the camptothecin group. Same as ZI results for DMSO, the percentage of genes with ZI greater than 0.1 or 0.2 are both 0%. - -Besides, the observed zeros are also lower than the expected zeros, further supporting the absence of zero-inflation. Overall, these results suggest that the data does not exhibit significant zero-inflation, and a standard negative binomial model may be appropriate for downstream analyses. Zero-inflated models (ZINB) may not be necessary for this data set. - - - -```{r, include=FALSE} -rm(zi_results) -``` - -*** -
- - - -### DMSO vs Staruosporine - -Next, we will compare two treatment groups: "DMSO_0" and "Staurosporine_10". As staurosporine is a potent inducer of apoptosis, we might expect to see more zero-inflation in this group compared to DMSO. - - -```{r} -# Check for zero-inflation -zi_results <- check_zeroinflation(sub_mac, - group_by = "combined_id", - samples = c("DMSO_0","Staurosporine_10"), - batch = 1, - cutoffs = c(0.1, 0.2)) -``` - - -#### View gene-wise metrics for each group - -For DMSO group: - -```{r} -zi_results$gene_metrics_by_group %>% filter(group=="DMSO_0") %>% head() -``` - -For Staurosporine group: -```{r} -zi_results$gene_metrics_by_group %>% filter(group=="Staurosporine_10") %>% head() -``` - -#### View summary statistics for each group - -```{r} -zi_results$summary_by_group -``` - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -#### See below for interpretation of the results - -For DMSO vs staurosporine, we can see that only staurosporine has 14% of median ZI value, the observed zeros number (713) is much higher than the expected zeros number (~355), indicating that there is some level of zero-inflation in the staurosporine group. As staurosporine is a cell death control, which it is expected to see zero-inflated data in this group. In this case, it may be appropriate to consider using a zero-inflated negative binomial (ZINB) model for downstream analyses to account for the excess zeros in the data. - - - - - -### All treatment groups - -Finally, we will check for zero-inflation across all treatment groups in the data. - -```{r} -# Check for zero-inflation -all_conditions <- unique(sub_mac$combined_id) -zi_results <- check_zeroinflation(sub_mac, - group_by = "combined_id", - samples = all_conditions, - batch = 1, - cutoffs = c(0.1, 0.2)) -``` - -#### View gene-wise metrics for each group - -```{r} -zi_results$gene_metrics_by_group %>% filter(group=="Luminespib_10") %>% head() - -``` - - -```{r} -zi_results$summary_by_group %>% head(10) -``` - -```{r} -high_doses_zi_results <- zi_results$summary_by_group %>% filter(grepl("_10$", group)) -#concatenate with DMSO -high_doses_zi_results <- rbind(high_doses_zi_results, - zi_results$summary_by_group %>% filter(group=="DMSO_0")) - - -long_zi_results <- high_doses_zi_results %>% select(group, median_p0_obs, median_p0_nb, median_ZI) %>% - pivot_longer(cols = c(median_p0_obs, median_p0_nb, median_ZI), - names_to = "metric", - values_to = "value") - -library(ggplot2) -ggplot(long_zi_results, aes(x = group, y = value, fill = metric)) + - geom_bar(stat = "identity", position = "dodge") + - labs(title = "Zero-Inflation Metrics by Treatment Groups (10uM) and DMSO", - x = "Groups", - y = "zero-inflation proportion") + - theme_minimal() + - theme(axis.text.x = element_text(angle = 90, hjust = 1)) + - scale_fill_manual(values = macpie_colours$discrete[1:3]) -``` - -#### See below for interpretation of the results - - - median_p0_obs: Median observed zero proportion across genes in the group - - - median_p0_nb: Median expected zero proportion under the NB model across genes in the group - - - median_ZI: Median zero-inflation index (ZI = p0_obs - p0_nb for each gene) across genes in the group - -From the plot, we can see that most treatment groups have small or even negative median ZI values, indicating that there is no significant zero-inflation in these groups. Only Staurosporine_10 shows positive median ZI values, suggesting some level of zero-inflation in the group. As described above, staurosporine is a potent inducer of apoptosis, which can lead to increased cell death and reduced gene expression, resulting in more zero counts in the data. Therefore, it is reasonable to observe excessive zeros in the staurosporine group. For other treatment groups, the small or even negative median ZI values suggest that a standard negative binomial model may be appropriate for downstream analyses. Zero-inflated models (ZINB) may not be necessary for these groups. diff --git a/vignettes/check_zero_inflation.html b/vignettes/check_zero_inflation.html deleted file mode 100644 index cc08ea4..0000000 --- a/vignettes/check_zero_inflation.html +++ /dev/null @@ -1,2051 +0,0 @@ - - - - - - - - - - - - - -Check for zero-inflation in your data - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - -
-
-
-
-
- -
- - - - - - - -
-

Overview

-

This is a quick demonstration of the -check_zeroinflation() function from our -macpie package. This function is a fast diagnostic tool -to help you assess whether your MACseq data exhibits zero-inflation -relative to a negative binomial (NB) model.

-

We will use a lightweight convenience wrapper -subsample_genes() around -seqgendiff::select_counts() to create a -smaller object with a subset of genes for faster computation. This -function is also part of the macpie package.

-

Under the hood, check_zeroinflation() uses -edgeR to estimate gene-wise dispersions and compute -expected zero probabilities under a NB model. It then compares the -observed and expected zero-inflated indexes for each gene within each -group defined in the metadata.

-

This function:

-
    -
  • estimates gene-wise tagwise dispersions with edgeR (using all -selected groups),

  • -
  • builds NB-expected zero probabilities from TMMwsp-scaled means, -and

  • -
  • returns per-gene ZI (observed zeros minus NB-expected zeros) and -per-group summaries (e.g., % genes with ZI > 0.05). ZI-cutoffs are -user-defined.

  • -
-

The output is a list with two elements:

-
    -
  • summary_by_group: A summary table showing the number -and percentage of genes that are zero-inflated for each group at -specified cutoffs.

  • -
  • gene_metrics_by_group: A detailed table with -gene-wise metrics, including observed and expected zero numbers and -proportions, zero-inflation index (ZI), mean count, and dispersion -estimates for each gene in each group.

  • -
-

Note: check_zeroinflation() relies on -edgeR to estimate dispersion. The current implementation requires ≥2 -groups in the design so that edgeR can stabilize gene-wise dispersions -across groups. If you only have a single group and still want a -design-aware baseline for expected zeros, fit a Gamma–Poisson/NB GLM and -compute the expected zero probabilities from its fitted means and -over-dispersion.

-
-


-
-
-

Load data

-

Load data and preprocess before subsampling genes and checking for -zero-inflation.

-
#install.packages("macpie")  # or devtools::install_github("PMCC-BioinformaticsCore/macpie")
-library(macpie)
-
-# Define project variables
-project_name <- "PMMSq033"
-project_metadata <- system.file("extdata/PMMSq033_metadata.csv", package = "macpie")
-# Load metadata
-metadata <- read_metadata(project_metadata)
-
-
# Import raw data
-project_rawdata <- paste0(dir, "/macpieData/PMMSq033/raw_matrix")
-project_name <- "PMMSq033"
-raw_counts <- Read10X(data.dir = project_rawdata)
-# Create tidySeurat object
-mac <- CreateSeuratObject(counts = raw_counts,
-                          project = project_name,
-                          min.cells = 1,
-                          min.features = 1)
-#> Warning: Feature names cannot have underscores ('_'), replacing with dashes
-#> ('-')
-# Join with metadata
-mac <- mac %>%
-  inner_join(metadata, by = c(".cell" = "Barcode"))
-# Add unique identifier
-mac <- mac %>%
-  mutate(combined_id = str_c(Compound_ID, Concentration_1, sep = "_")) %>%
-  mutate(combined_id = gsub(" ", "", .data$combined_id))
-mac <- mac %>%
-  filter(Project == "Current")
-
-

Filter genes with very low counts across all samples. This step is -important because genes with extremely low expression can lead to -unreliable estimates of dispersion and expected zero probabilities, -which can skew the zero-inflation assessment.

-
# Filter by read count per sample group
-mac <- filter_genes_by_expression(mac, 
-                                  group_by = "combined_id", 
-                                  min_counts = 10, 
-                                  min_samples = 2)
-
-
-

Subsample genes for faster computation

-

This is to randomly select a subset of 1000 genes for a quick check. -For a more comprehensive analysis, consider using a larger subset.

-
# Subsample genes for faster computation
-sub_mac <- subsample_genes(mac, ngene = 1000, gselect = "random", seed = 1)
-sub_mac %>% nrow()
-#> [1] 1000
-
-
-

Check for zero-inflation

-

We can now run the zero-inflation check on the subsampled data.

-
-

DMSO vs Camptothecin

-

First, we will compare two groups: “DMSO_0” and -“CAMPTOTHECIN_10”.

-
zi_results <- check_zeroinflation(sub_mac, 
-                                  group_by = "combined_id",
-                                  samples = c("DMSO_0","CAMPTOTHECIN_10"),
-                                  batch = 1,
-                                  cutoffs = c(0.1, 0.2))
-
-
-

View gene-wise metrics for each group

-

For DMSO group:

-
zi_results$gene_metrics_by_group %>% filter(group=="DMSO_0") %>% head()
-#>           group     gene mean_count_group dispersion p0_obs obs_zeros_num
-#> MTND1P23 DMSO_0 MTND1P23             9.68   9.77e-05      0             0
-#> MTATP6P1 DMSO_0 MTATP6P1             7.26   9.77e-05      0             0
-#> ISG15    DMSO_0    ISG15             9.58   9.77e-05      0             0
-#> PUSL1    DMSO_0    PUSL1             8.63   9.77e-05      0             0
-#> DVL1     DMSO_0     DVL1             8.05   9.77e-05      0             0
-#> MRPL20   DMSO_0   MRPL20             4.16   9.77e-05      0             0
-#>             p0_nb expected_zeros_num        ZI
-#> MTND1P23 8.79e-05            0.00167 -8.79e-05
-#> MTATP6P1 8.47e-04            0.01610 -8.47e-04
-#> ISG15    9.69e-05            0.00184 -9.69e-05
-#> PUSL1    2.34e-04            0.00445 -2.34e-04
-#> DVL1     4.03e-04            0.00765 -4.03e-04
-#> MRPL20   1.66e-02            0.31555 -1.66e-02
-

For Camptothecin group:

-
zi_results$gene_metrics_by_group %>% filter(group=="CAMPTOTHECIN_10") %>% head()
-#>                     group     gene mean_count_group dispersion p0_obs
-#> MTND1P231 CAMPTOTHECIN_10 MTND1P23             7.67   9.77e-05      0
-#> MTATP6P11 CAMPTOTHECIN_10 MTATP6P1             5.00   9.77e-05      0
-#> ISG151    CAMPTOTHECIN_10    ISG15             7.33   9.77e-05      0
-#> PUSL11    CAMPTOTHECIN_10    PUSL1             2.00   9.77e-05      0
-#> DVL11     CAMPTOTHECIN_10     DVL1             0.00   9.77e-05      1
-#> MRPL201   CAMPTOTHECIN_10   MRPL20            10.33   9.77e-05      0
-#>           obs_zeros_num    p0_nb expected_zeros_num        ZI
-#> MTND1P231             0 0.001184           0.003553 -0.001184
-#> MTATP6P11             0 0.010444           0.031333 -0.010444
-#> ISG151                0 0.001546           0.004638 -0.001546
-#> PUSL11                0 0.145972           0.437915 -0.145972
-#> DVL11                 3 1.000000           3.000000  0.000000
-#> MRPL201               0 0.000146           0.000438 -0.000146
-
-
-

View summary statistics for each group

-
zi_results$summary_by_group
-#>                           group n_genes n_wells median_p0_obs median_p0_nb
-#> CAMPTOTHECIN_10 CAMPTOTHECIN_10    1000       3             0     0.058845
-#> DMSO_0                   DMSO_0    1000      19             0     0.000491
-#>                 median_ZI observed_zeros_num expected_zeros_num pct_ZI_gt_0.1
-#> CAMPTOTHECIN_10 -0.013835                434              487.9         0.135
-#> DMSO_0          -0.000445                 17               55.2         0.000
-#>                 pct_ZI_gt_0.2
-#> CAMPTOTHECIN_10         0.066
-#> DMSO_0                  0.000
-

From the summary table, we can see the summary statistics for each -group, including the number and percentage of genes that are -zero-inflated at the specified cutoffs.

-

Each of the columns in the summary table are defined as follows:

-
    -
  • group: The treatment group

  • -
  • n_genes: Total number of genes analyzed in the group

  • -
  • n_wells: Total number of wells/samples in the group

  • -
  • median_p0_obs: Median observed zero proportion across genes in -the group

  • -
  • median_p0_nb: Median expected zero proportion under the NB model -across genes in the group

  • -
  • median_ZI: Median zero-inflation index (ZI = p0_obs - p0_nb for -each gene) across genes in the group

  • -
  • observed_zeros_num: Number of data points with observed zeros (it -shouldn’t be more than n_genes*n_wells for each group)

  • -
  • expected_zeros_num: Number of data points with expected zeros -under the NB model (same here, it shouldn’t be more than n_genes*n_wells -for each group)

  • -
  • pct_ZI_gt_0.1: Percentage of genes with ZI greater than -0.1

  • -
  • pct_ZI_gt_0.2: Percentage of genes with ZI greater than -0.2

  • -
-
-


- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
-

See below for interpretation of the results

-

For DMSO vs camptothecin, we can see that both DMSO and camptothecin -groups have negative median ZI values, indicating that there is no -significant zero-inflation in either group. Percentage of genes with ZI -greater than 0.1 or 0.2 are 13.5% and 6.6% for camptothecin. As -camptothecin acts as a DNA topoisomerase inhibitor and induces DNA -damage, it can lead to cell cycle arrest and apoptosis, which may reduce -the overall transcriptional activity in the cells. This reduction in -transcriptional activity can result in genes being expressed at lower -levels or not at all. On the other hand, DMSO is a common solvent used -in biological experiments and is not expected to induce significant -changes in gene expression. Therefore, it is reasonable to observe a -lower level of zero-inflation in the DMSO group compared to the -camptothecin group. Same as ZI results for DMSO, the percentage of genes -with ZI greater than 0.1 or 0.2 are both 0%.

-

Besides, the observed zeros are also lower than the expected zeros, -further supporting the absence of zero-inflation. Overall, these results -suggest that the data does not exhibit significant zero-inflation, and a -standard negative binomial model may be appropriate for downstream -analyses. Zero-inflated models (ZINB) may not be necessary for this data -set.

-
-


-
-
-
-

DMSO vs Staruosporine

-

Next, we will compare two treatment groups: “DMSO_0” and -“Staurosporine_10”. As staurosporine is a potent inducer of apoptosis, -we might expect to see more zero-inflation in this group compared to -DMSO.

-
# Check for zero-inflation
-zi_results <- check_zeroinflation(sub_mac, 
-                                  group_by = "combined_id",
-                                  samples = c("DMSO_0","Staurosporine_10"),
-                                  batch = 1,
-                                  cutoffs = c(0.1, 0.2))
-
-

View gene-wise metrics for each group

-

For DMSO group:

-
zi_results$gene_metrics_by_group %>% filter(group=="DMSO_0") %>% head()
-#>           group     gene mean_count_group dispersion p0_obs obs_zeros_num
-#> MTND1P23 DMSO_0 MTND1P23             9.68   9.77e-05      0             0
-#> MTATP6P1 DMSO_0 MTATP6P1             7.26   4.24e-02      0             0
-#> ISG15    DMSO_0    ISG15             9.58   9.77e-05      0             0
-#> PUSL1    DMSO_0    PUSL1             8.63   9.77e-05      0             0
-#> DVL1     DMSO_0     DVL1             8.05   9.77e-05      0             0
-#> MRPL20   DMSO_0   MRPL20             4.16   5.22e-02      0             0
-#>             p0_nb expected_zeros_num        ZI
-#> MTND1P23 8.79e-05            0.00167 -8.79e-05
-#> MTATP6P1 1.99e-03            0.03784 -1.99e-03
-#> ISG15    9.69e-05            0.00184 -9.69e-05
-#> PUSL1    2.34e-04            0.00445 -2.34e-04
-#> DVL1     4.03e-04            0.00765 -4.03e-04
-#> MRPL20   2.42e-02            0.46018 -2.42e-02
-

For Staurosporine group:

-
zi_results$gene_metrics_by_group %>% filter(group=="Staurosporine_10") %>% head()
-#>                      group     gene mean_count_group dispersion p0_obs
-#> MTND1P231 Staurosporine_10 MTND1P23             4.33   9.77e-05  0.000
-#> MTATP6P11 Staurosporine_10 MTATP6P1            10.67   4.24e-02  0.000
-#> ISG151    Staurosporine_10    ISG15             5.00   9.77e-05  0.000
-#> PUSL11    Staurosporine_10    PUSL1             2.00   9.77e-05  0.667
-#> DVL11     Staurosporine_10     DVL1             2.33   9.77e-05  0.000
-#> MRPL201   Staurosporine_10   MRPL20             5.00   5.22e-02  0.000
-#>           obs_zeros_num   p0_nb expected_zeros_num       ZI
-#> MTND1P231             0 0.07082             0.2125 -0.07082
-#> MTATP6P11             0 0.00657             0.0197 -0.00657
-#> ISG151                0 0.05190             0.1557 -0.05190
-#> PUSL11                2 0.23189             0.6957  0.43478
-#> DVL11                 0 0.19278             0.5784 -0.19278
-#> MRPL201               0 0.05974             0.1792 -0.05974
-
-
-

View summary statistics for each group

-
zi_results$summary_by_group
-#>                             group n_genes n_wells median_p0_obs median_p0_nb
-#> DMSO_0                     DMSO_0    1000      19         0.000     0.000491
-#> Staurosporine_10 Staurosporine_10    1000       3         0.333     0.082998
-#>                  median_ZI observed_zeros_num expected_zeros_num pct_ZI_gt_0.1
-#> DMSO_0           -0.000457                 17               77.4         0.000
-#> Staurosporine_10  0.140550                713              354.9         0.541
-#>                  pct_ZI_gt_0.2
-#> DMSO_0                   0.000
-#> Staurosporine_10         0.409
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
-

See below for interpretation of the results

-

For DMSO vs staurosporine, we can see that only staurosporine has 14% -of median ZI value, the observed zeros number (713) is much higher than -the expected zeros number (~355), indicating that there is some level of -zero-inflation in the staurosporine group. As staurosporine is a cell -death control, which it is expected to see zero-inflated data in this -group. In this case, it may be appropriate to consider using a -zero-inflated negative binomial (ZINB) model for downstream analyses to -account for the excess zeros in the data.

-
-
-
-

All treatment groups

-

Finally, we will check for zero-inflation across all treatment groups -in the data.

-
# Check for zero-inflation
-all_conditions <- unique(sub_mac$combined_id)
-zi_results <- check_zeroinflation(sub_mac, 
-                                  group_by = "combined_id",
-                                  samples = all_conditions,
-                                  batch = 1,
-                                  cutoffs = c(0.1, 0.2))
-
-

View gene-wise metrics for each group

-
zi_results$gene_metrics_by_group %>% filter(group=="Luminespib_10") %>% head()
-#>                    group     gene mean_count_group dispersion p0_obs
-#> MTND1P2364 Luminespib_10 MTND1P23             8.67   9.77e-05      0
-#> MTATP6P164 Luminespib_10 MTATP6P1             5.67   1.64e-02      0
-#> ISG1564    Luminespib_10    ISG15             7.00   9.77e-05      0
-#> PUSL164    Luminespib_10    PUSL1             8.00   9.77e-05      0
-#> DVL164     Luminespib_10     DVL1             5.33   9.77e-05      0
-#> MRPL2064   Luminespib_10   MRPL20             8.67   4.96e-02      0
-#>            obs_zeros_num    p0_nb expected_zeros_num        ZI
-#> MTND1P2364             0 0.000301           0.000904 -0.000301
-#> MTATP6P164             0 0.005591           0.016773 -0.005591
-#> ISG1564                0 0.001346           0.004038 -0.001346
-#> PUSL164                0 0.000547           0.001641 -0.000547
-#> DVL164                 0 0.006143           0.018428 -0.006143
-#> MRPL2064               0 0.001014           0.003043 -0.001014
-
zi_results$summary_by_group %>% head(10)
-#>                               group n_genes n_wells median_p0_obs median_p0_nb
-#> 5-Azacytidine_0.1 5-Azacytidine_0.1    1000       3             0     0.001313
-#> 5-Azacytidine_1     5-Azacytidine_1    1000       3             0     0.000833
-#> 5-Azacytidine_10   5-Azacytidine_10    1000       3             0     0.002611
-#> Abemaciclib_0.1     Abemaciclib_0.1    1000       3             0     0.000542
-#> Abemaciclib_1         Abemaciclib_1    1000       3             0     0.001257
-#> Abemaciclib_10       Abemaciclib_10    1000       3             0     0.011172
-#> Adavosertib_0.1     Adavosertib_0.1    1000       3             0     0.001502
-#> Adavosertib_1         Adavosertib_1    1000       3             0     0.001106
-#> Adavosertib_10       Adavosertib_10    1000       3             0     0.003791
-#> ANASTROZOLE_0.1     ANASTROZOLE_0.1    1000       3             0     0.000475
-#>                   median_ZI observed_zeros_num expected_zeros_num pct_ZI_gt_0.1
-#> 5-Azacytidine_0.1 -0.001313                  8               32.2         0.007
-#> 5-Azacytidine_1   -0.000694                 12               28.7         0.007
-#> 5-Azacytidine_10  -0.001909                 59               87.1         0.030
-#> Abemaciclib_0.1   -0.000542                  5               18.4         0.005
-#> Abemaciclib_1     -0.000974                 16               30.6         0.013
-#> Abemaciclib_10    -0.007505                112               86.7         0.098
-#> Adavosertib_0.1   -0.001502                  7               19.8         0.007
-#> Adavosertib_1     -0.001098                 13               23.5         0.012
-#> Adavosertib_10    -0.002832                 46               65.1         0.037
-#> ANASTROZOLE_0.1   -0.000475                  1               26.2         0.001
-#>                   pct_ZI_gt_0.2
-#> 5-Azacytidine_0.1         0.007
-#> 5-Azacytidine_1           0.004
-#> 5-Azacytidine_10          0.018
-#> Abemaciclib_0.1           0.005
-#> Abemaciclib_1             0.012
-#> Abemaciclib_10            0.084
-#> Adavosertib_0.1           0.006
-#> Adavosertib_1             0.010
-#> Adavosertib_10            0.025
-#> ANASTROZOLE_0.1           0.001
-
high_doses_zi_results <- zi_results$summary_by_group %>% filter(grepl("_10$", group))
-#concatenate with DMSO
-high_doses_zi_results <- rbind(high_doses_zi_results,
-                                zi_results$summary_by_group %>% filter(group=="DMSO_0"))
-
-
-long_zi_results <- high_doses_zi_results %>% select(group, median_p0_obs, median_p0_nb, median_ZI) %>%
-  pivot_longer(cols = c(median_p0_obs, median_p0_nb, median_ZI), 
-               names_to = "metric", 
-               values_to = "value")
-
-library(ggplot2)
-ggplot(long_zi_results, aes(x = group, y = value, fill = metric)) +
-  geom_bar(stat = "identity", position = "dodge") +
-  labs(title = "Zero-Inflation Metrics by Treatment Groups (10uM) and DMSO",
-       x = "Groups",
-       y = "zero-inflation proportion") +
-  theme_minimal() +
-  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
-  scale_fill_manual(values = macpie_colours$discrete[1:3])
-

-
-
-

See below for interpretation of the results

-
    -
  • median_p0_obs: Median observed zero proportion across genes in -the group

  • -
  • median_p0_nb: Median expected zero proportion under the NB model -across genes in the group

  • -
  • median_ZI: Median zero-inflation index (ZI = p0_obs - p0_nb for -each gene) across genes in the group

  • -
-

From the plot, we can see that most treatment groups have small or -even negative median ZI values, indicating that there is no significant -zero-inflation in these groups. Only Staurosporine_10 shows positive -median ZI values, suggesting some level of zero-inflation in the group. -As described above, staurosporine is a potent inducer of apoptosis, -which can lead to increased cell death and reduced gene expression, -resulting in more zero counts in the data. Therefore, it is reasonable -to observe excessive zeros in the staurosporine group. For other -treatment groups, the small or even negative median ZI values suggest -that a standard negative binomial model may be appropriate for -downstream analyses. Zero-inflated models (ZINB) may not be necessary -for these groups.

-
-
-
- - - -
-
- -
- - - - - - - - - - - - - - - - diff --git a/vignettes/macpie_short.Rmd b/vignettes/macpie_short.Rmd deleted file mode 100644 index d71eb15..0000000 --- a/vignettes/macpie_short.Rmd +++ /dev/null @@ -1,408 +0,0 @@ ---- -title: "macpie: overview" -output: - rmarkdown::html_document: - theme: flatly - toc: true - toc_float: true - toc_depth: 5 -vignette: > - %\VignetteIndexEntry{overview} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} - %\VignetteBuild{true} ---- - - -```{r setup, include = FALSE} -# keep vignette light & deterministic on CI -is_ci <- identical(Sys.getenv("CI"), "true") -if (is_ci) { - # Disable parallelization so nothing is exported to workers - if (requireNamespace("future", quietly = TRUE)) { - future::plan("sequential") - } - options(mc.cores = 1) -} -knitr::opts_chunk$set(message = FALSE, warning = FALSE) -``` - - - -```{r, include = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>", - warning = FALSE, - message = FALSE -) -library(dplyr) -library(stringr) # str_c -library(ggplot2) # ggplot, geom_point, etc. -library(tidyr) # pivot_wider -library(tibble) # column_to_rownames -library(ggiraph) # girafe() -library(pheatmap) # pheatmap() -library(enrichR) # enrichr, plotEnrich -library(gridExtra) # grid.arrange -library(Seurat) -library(ggrepel) -``` - -## Introduction - -MAC-seq is a cost-effective, high-throughput transcriptomic platform, developed as a collaboration between Victorian Centre for Functional Genomics (VCFG) and Molecular Genomics Core (MGC) facilities at Peter MacCallum Cancer Centre (Peter Mac), primarily designed for small molecule screening. However, its versatility extends beyond this application, thanks to its integration with high-throughput microscopy and 3D cell culturing techniques. - -macpie is a toolkit designed for researchers, originally with MAC-seq data in mind, but validated for general High-Throughput Transcriptomics (HTTr) data applications. Its primary aim is to deliver the latest tools for quality control (QC), visualization, and analysis. macpie is a result of a collaborative effort by a workgroup at the PeterMac, with a substantial support of the VCFG amd MGC core facilities. - -In this vignette we cover the basic functionality of macpie, from input, quality control to transcriptional and screen-related analyses. For more in-depth workflows, please refer to other vignettes at https://pmcc-bioinformaticscore.github.io/macpie/articles/macpie.html. - -### 1. Metadata import and QC - -**Key points**: - - - metadata has to contain at least columns **Plate_ID**, **Well_ID**, **Row**, **Column**, **Species**, **Sample_type**, **Treatment_1**, **Concentration_1** and **Barcode**. - -Metadata has to be in a tabular format and contain a standardised set of columns to define coordinates of a sample on a plate, provide minimum information about the sample, and allow connection with the transcriptomic data through the sample barcodes. - -To ensure the integrity of metadata for future analyses, we provide the user with a set of tools to verify metadata consistency and visualize the key variables, described in more depth in the [QC vignette](articles/quality_control.html). We will first visually inspect all experimental variables, in order to identify potential artefacts. - -```{r set_wd, include=FALSE} -dir <- "/Users/liuxin/macpie_Dev/" -devtools::load_all(paste0(dir, "macpie/")) -``` - -```{r metadata_plot, fig.width = 8, fig.height = 6} -#install.packages("macpie") # or devtools::install_github("PMCC-BioinformaticsCore/macpie") -library(macpie) -library(enrichR) -library(randomForest) -library(pheatmap) - -# Load metadata -project_metadata <- system.file("extdata/PMMSq033_metadata.csv", package = "macpie") - -# Load metadata -metadata <- read_metadata(project_metadata) -plot_metadata_heatmap(metadata) -``` - -> **Key Lessons for Robust Experimental Design** -> -> - Based on our experience, the specific metadata you need will vary greatly with your experiment's design. Here are some crucial lessons we've learned to help you achieve reliable results: -> -> - Plate Layout Matters: Always place replicate sample wells on the same assay plate, not across multiple plates. -> -> - Minimum Replicates: Aim for a minimum of 3 replicates per condition to ensure statistical robustness. -> -> - Strategic Negative Controls: For negative control wells, we recommend including 10 wells randomized across the same assay plate. This provides a more robust baseline. - - - - -### 2. Sequencing data import and QC - -**Key points**: - - - pay special attention to removal of lowly expressed genes and then: - - use **plot_plate_layout** to check plate-level effects (edge vs centre, between plates) - - use **plot_mds** to check sample grouping (umap/pca is also available using Seurat's functions) - - use **compute_qc_metrics**, **plot_qc_metrics_heatmap**, and **plot_distance** to check sample variability and outliers - - use **plot_rle** to check any row/column/plate effects and compare normalization methods - - more detailed methods avaailable in vignette [Quality control](articles/quality_control.html) - -#### 2.1 Import data to tidySeurat object - -**Data access**: - - - load your raw counts by providing Read10X function with a path to a directory containing matrix.mtx.gz, barcodes.tsv.gz, and features.tsv.gz, commonly "raw_matrix" from CellRanger or StarSolo output - - the full manuscript dataset (>10 MB of `.rds` files) is publicly available at https://zenodo.org/records/15778812. - - use data(mini_mac) to load a subsample (308 samples, 200 genes) of the full dataset, as in the commented code below - -```{r load_read_data} -## 1. Load your own gene counts per sample or 2. data from the publication -#project_rawdata <- "raw_matrix" -#project_name <- "PMMSq033" -#raw_counts <- Read10X(data.dir = project_rawdata) -# -## Create tidySeurat object -#mac <- CreateSeuratObject(counts = raw_counts, -# project = project_name, -# min.cells = 1, -# min.features = 1) - -# 3. Alternatively, load a pre-made example: -data(mini_mac) -mac <- mini_mac - -# Join gene counts per sample with metadata (if not already included) -if (nrow(mac[[]])==0) { - mac <- mac %>% - inner_join(metadata, by = c(".cell" = "Barcode")) -} - -# Create unique identifier for your treatments based on metadata -mac <- mac %>% - mutate(combined_id = str_c(Treatment_1, Concentration_1, sep = "_")) %>% - mutate(combined_id = gsub(" ", "", .data$combined_id)) %>% - mutate(combined_id = make.names(combined_id)) - -# # Filter by read count per sample group -mac <- filter_genes_by_expression(mac, - group_by = "combined_id", - min_counts = 1, - min_samples = 1) -``` - -#### 2.2 Basic QC metrics - -macpie allows you to use the common QC plots from the Seurat package to visualise the number of genes, reads, and percentage of mitochondrial and ribosomal genes per sample. -```{r violin_plot, fig.width = 8, fig.height = 4} -# Calculate the percent of mitochondrial and ribosomal genes -mac[["percent.mt"]] <- PercentageFeatureSet(mac, pattern = "^mt-|^MT-") -mac[["percent.ribo"]] <- PercentageFeatureSet(mac, pattern = "^Rp[slp][[:digit:]]|^Rpsa|^RP[SLP][[:digit:]]|^RPSA") - -# Example of a function from Seurat quality control -VlnPlot(mac, features = c("nFeature_RNA", "nCount_RNA", "percent.mt", "percent.ribo"), - ncol = 4, group.by = "Sample_type") & - scale_fill_manual(values = macpie_colours$discrete) -``` - -In addition, you can apply tidyverse functions to further explore the dataset. For example, let's subset the Seurat object based on the column "Project" in the metadata and visualise the grouping of data on the plate vs on an MDS plot. Plate layout plots provide an interactive way to inspect spatial patterns across a plate, helping to identify anomalies or unexpected trends. When hovering over the plot, sample groups are automatically highlighted to aid interpretation. - -```{r subset_seurat, fig.width = 8, fig.height = 3} -unique(mac$Project) -mac <- mac %>% - filter(Project == "Current") - -# Interactive QC plot plate layout (all metadata columns can be used): -p <- plot_plate_layout(mac, "nCount_RNA", "combined_id") -girafe(ggobj = p, - fonts = list(sans = "sans"), - options = list( - opts_hover(css = "stroke:black; stroke-width:0.8px;") # <- slight darkening - )) -``` - -##### 2.2.1 Sample grouping with MDS plot - -In order to assess sample grouping, we should visualise sample similarity based on the limma's MDS (MultiDimensional Scaling) function. Samples that are treated with a lower concentration of compound will often cluster close to the negative (vehicle) control. Hovering over the individual dots reveals sample identity and grouping. - -```{r mds_plot, fig.width = 8, fig.height = 6} -p <- plot_mds(mac, group_by = "Sample_type", label = "combined_id", n_labels = 30) -girafe(ggobj = p, fonts = list(sans = "sans")) - -``` - -##### 2.2.2 Distribution of read counts - -Visualising distribution of read counts across treatments is an easy way to compare the effects of treatments and estimate sample variability. Read count is commonly directly proportional to the number of cells. - -```{r qc_stats_plot, fig.width = 8, fig.height = 4} -qc_stats <- compute_qc_metrics(mac, group_by = "combined_id", order_by = "median") -``` - -##### 2.2.3 Variability among all replicates - -In relation to the previous plot, we want the user to have the ability to quantify the dispersion of reads between sample replicates. Therefore, we provide access to several statistical metrics such as standard deviation (sd_value), z score (z_score), mad (mad_value) and IQR (IQR) which can be used as a parameter to the function plot_qc_metrics individually, or assessed at once with the function plot_qc_metrics_heatmap. - -```{r qc_metrics, fig.width = 8, fig.height = 4} -plot_qc_metrics_heatmap(qc_stats$stats_summary) -``` - -Identifying outliers and batch effects, especially in the untreated samples, is especially important for downstream analysis. For further statistical methods to quantify variability among sample groups and inter-replicate variability please refer to vignette [Quality control](articles/quality_control.html). - -##### 2.2.4 Correction of the batch effect - -Several methods are available for scaling and normalizing transcriptomic data, with their effects most clearly visualized using RLE (Relative Log Expression) plots. In our case, limma_voom provides the lowest average coefficient of variation, when compared to other methods such as "raw", Seurat "SCT" or "edgeR". - -```{r plot_rle, fig.width = 8, fig.height = 4} -# First we will subset the data to look at control, DMSO samples only -mac_dmso <- mac %>% - filter(Treatment_1 == "DMSO") - -# Run the RLE function -plot_rle(mac_dmso, label_column = "Row", normalisation = "limma_voom") -``` - - -### 3. Differential gene expression - -**Key points**: - - - use **compute_single_de** to perform a differential expression analysis for one treatment group vs control - - use **compute_multi_de** to perform differential expression analyses for all treatment groups vs control - - use volcano plot, box plot and heatmap to show results from the analyses and visualise gene expression levels - - use **enrichr** for pathway enrichment analysis - - more detailed methods available in vignette [Transcriptional analyses](articles/transcriptional_analyses.html) - - -#### 3.1. Single comparison - -Similar to RNA-seq, the quality of differential gene expression analysis in MAC-seq depends on low variability among replicates and the suitability of the statistical model. These aspects are assessed during the quality control stage of the workflow. - -Results of differential expression analysis are classically visualised with a volcano plot. - - -```{r de_analysis, fig.width = 8, fig.height = 6} -# First perform the differential expression analysis -treatment_samples <- "Staurosporine_10" -control_samples <- "DMSO_0" -top_table <- compute_single_de(mac, treatment_samples, control_samples, method = "limma_voom") - -# Let's visualise the results with a volcano plot -plot_volcano(top_table, max.overlaps = 16) -``` - -Based on the results, we can quickly check gene expression levels in counts per million (CPM) for selected genes between treatment and control samples as described below. - -```{r plot_counts, fig.width = 8, fig.height = 6} -genes <- top_table$gene[1:6] -group_by <- "combined_id" -plot_counts(mac, genes, group_by, treatment_samples, control_samples, normalisation = "cpm", color_by = "combined_id") -``` - -#### 3.2. Pathway analysis - -Differential gene expression results for individual comparisons of treatment vs control are usually performed with functions from package enrichR and fgsea. In the following case, the effect of Staurosporine on breast cancer cells through Myc inactivation can be observed through pathway enrichment analyses. - -```{r pathway_analysis_single, fig.width = 8, fig.height = 4} -top_genes <- top_table %>% - filter(p_value_adj < 0.05) %>% - select(gene) %>% - pull() - -enriched <- enrichR::enrichr(top_genes, c("MSigDB_Hallmark_2020")) -p1 <- enrichR::plotEnrich(enriched[[1]]) + - macpie_theme(legend_position_ = 'right') + - scale_fill_gradientn(colors = macpie_colours$divergent) - -gridExtra::grid.arrange(p1, ncol = 1) -``` - -#### 3.3. Differential gene expression - multiple comparisons - -Since MAC-seq is commonly used for high-throughput screening of compound libraries, we often want to compare multiple samples in a screen vs the control. This process can easily be parallelised. First we select a vector of "treatments" as combined_ids that do not contain the word "DMSO". (Warning, due to the limitations of "mclapply", parallelisation speedup currently only works on OSX and Linux machines, and not on Windows.) - -```{r de_multi, fig.width = 8, fig.height = 5} -# Filter out lower concentrations of compounds and untreated samples -treatments <- mac %>% - filter(Concentration_1 == 10) %>% - select(combined_id) %>% - filter(!grepl("DMSO", combined_id)) %>% - pull() %>% - unique() -mac <- compute_multi_de(mac, treatments, control_samples = "DMSO_0", method = "limma_voom", num_cores = 1) -``` - - -We often want to ask which genes are differentially expressed in more than one treatment group. - -Here, we can visualise treatment groups with shared differentially expressed genes, defined as the top 5 DE genes from each single drug comparison (treatment vs control) that are found in at least 2 different treatment groups. - -The heatmap below shows shared differentially expressed genes with corresponding log2FC values. - -```{r plot_multi_de, fig.width=10, fig.height=6} -plot_multi_de(mac, group_by = "combined_id", value = "log2FC", p_value_cutoff = 0.01, direction="up", n_genes = 5, control = "DMSO_0", by="fc") -``` - -#### 3.4. Pathway analysis - multiple comparisons - -The pathway enrichment analysis is done with R package enrichR, and can be summarised with a heatmap, visualising direct and offtarget effects of the perturbations. - -```{r enriched_pathways, fig.width = 8, fig.height = 12} -# Load genesets from enrichr for a specific species or define your own -enrichr_genesets <- download_geneset("human", "MSigDB_Hallmark_2020") -mac <- compute_multi_enrichr(mac, genesets = enrichr_genesets) - -enriched_pathways_mat <- Tool(mini_mac, slot = "pathway_enrichment") %>% - bind_rows() %>% - select(combined_id, Term, Combined.Score) %>% - pivot_wider(names_from = combined_id, values_from = Combined.Score) %>% - column_to_rownames(var = "Term") %>% - mutate(across(everything(), ~ ifelse(is.na(.), 0, log1p(.)))) %>% # Replace NA with 0 across all columns - as.matrix() - -pheatmap(enriched_pathways_mat, color = macpie_colours$continuous_rev) -``` - -### 4. Methods for compound screening - -**Key points**: - - - use **plot_de_umap** to find compounds that behave similarly based on proximity on UMAP maps - - use **compute_single_dose_response** to evaluate impact of compound concentrations on - gene expression or pathway enrichment - - use **compute_multi_screen_profile** to find perturbations similar to your target profile or - a known gene set - - more detailed methods avaailable in vignette [Compound screening](articles/compound_screening.html) - -#### 4.1. UMAP clustering based on DE genes - -Instead of plotting UMAP on individual samples, we can also visualise the samples on UMAP of differential gene expression vs control. This allows us to use batch-corrected data and reduce replicate noise, while showing the grouping of treatments. - -```{r compute_de_umap, fig.width = 8, fig.height = 5} -mac_agg <- aggregate_by_de(mac) -mac_agg <- compute_de_umap(mac_agg) -mac_agg <- FindNeighbors(mac_agg, reduction = "umap_de", dims = 1:2) -mac_agg <- FindClusters(mac_agg, resolution = 1.3) - -cell_coords <- Embeddings(mac_agg, reduction = "umap_de") %>% - as.data.frame() %>% - rownames_to_column("combined_id") %>% - left_join(mac_agg[[]], by = "combined_id") - -# Plot with clusters and labels -ggplot(cell_coords, aes(x = UMAPde_1, y = UMAPde_2, color = seurat_clusters)) + - geom_point(size = 2) + - geom_text_repel(aes(label = combined_id), size = 3, max.overlaps = 10, force_pull = 1) + - theme_minimal() + - guides(color = guide_legend(title = "Cluster")) + - labs(x = "UMAP 1", y = "UMAP 2", title = "UMAP with Cell Names") + - theme(legend.position = "right") -``` - -#### 4.2. Similarity to a treatment profile or phenotype - -Additionally, when performing a screen, sometimes we want to measure similarity to either an existing profile, or to a user-defined gene-set that defines a desired phenotype. - -```{r compute_multi_screen_profile, fig.width = 8, fig.height = 5} -mac <- compute_multi_screen_profile(mac, target = "Staurosporine_10", n_genes_profile = 50, direction = "down", num_cores = 1) -mac_screen_profile <- Tool(mac, slot = "screen_profile") %>% - mutate(logPadj = c(-log10(padj))) %>% - arrange(desc(NES)) %>% - mutate(target_id = factor(target_id, levels = unique(target_id))) - -ggplot(mac_screen_profile, aes(target_id, NES)) + - #geom_point(aes(size = logPadj)) + - geom_point() + - facet_wrap(~pathway, scales = "free") + - macpie_theme(x_labels_angle = 90, show_x_title = FALSE) -``` - -#### 4.3. Estimate of dose-response - -macpie can be used to calculate dose-response curves for individual genes, pathways or any other external measurement such as cell viability that is available in your metadata, based on the R package drc These are also available in a paralelisable format with the function “compute_multiple_dose_response”. - -```{r compute_EC50_curves, fig.width = 6, fig.height = 3} -treatments <- mac %>% - select(combined_id) %>% - filter(!grepl("DMSO", combined_id)) %>% - pull() %>% - unique() -mac <- compute_multi_de(mac, treatments, control_samples = "DMSO_0", method = "limma_voom", num_cores = 1) -mac <- compute_multi_enrichr(mac, genesets = enrichr_genesets) -res <- compute_single_dose_response(data = mac, - gene = "DIP2C", - normalisation = "limma_voom", - treatment_value = "Staurosporine") -res$plot -res <- compute_single_dose_response(data = mac, - pathway = "p53 Pathway", - treatment_value = "Camptothecin") -res$plot -``` - -```{r session-info, echo=FALSE} -sessionInfo() -``` \ No newline at end of file diff --git a/vignettes/model_selection.Rmd b/vignettes/model_selection.Rmd deleted file mode 100644 index 1d78341..0000000 --- a/vignettes/model_selection.Rmd +++ /dev/null @@ -1,177 +0,0 @@ ---- -title: "Model selection " -output: - rmarkdown::html_document: - theme: flatly - toc: true - toc_float: true - toc_depth: 5 -vignette: > - %\VignetteIndexEntry{check_zero_inflation} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} - %\VignetteBuild{true} - ---- - - -```{r, include = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>" -) -``` - - - -```{r set_wd, include=FALSE} -dir <- "/Users/liuxin/macpie_Dev/" -devtools::load_all(paste0(dir, "macpie/")) -``` - -```{r setup} -#install.packages("macpie") # or devtools::install_github("PMCC-BioinformaticsCore/macpie") -library(macpie) - -# Define project variables -project_name <- "PMMSq033" -project_metadata <- system.file("extdata/PMMSq033_metadata.csv", package = "macpie") -# Load metadata -metadata <- read_metadata(project_metadata) - -``` - -```{r load_data} -# Import raw data -project_rawdata <- paste0(dir, "/macpieData/PMMSq033/raw_matrix") -project_name <- "PMMSq033" -raw_counts <- Read10X(data.dir = project_rawdata) -# Create tidySeurat object -mac <- CreateSeuratObject(counts = raw_counts, - project = project_name, - min.cells = 1, - min.features = 1) -# Join with metadata -mac <- mac %>% - inner_join(metadata, by = c(".cell" = "Barcode")) -# Add unique identifier -mac <- mac %>% - mutate(combined_id = str_c(Compound_ID, Concentration_1, sep = "_")) %>% - mutate(combined_id = gsub(" ", "", .data$combined_id)) -mac <- mac %>% - filter(Project == "Current") - -``` - -Filter genes with very low counts across all samples. This step is important because genes with extremely low expression can lead to unreliable estimates of dispersion and expected zero probabilities, which can skew the zero-inflation assessment. - -```{r} -# Filter by read count per sample group -mac <- filter_genes_by_expression(mac, - group_by = "combined_id", - min_counts = 10, - min_samples = 2) -``` - - -## Subsample genes for faster computation - -This is to randomly select a subset of 1000 genes for a quick check. For a more comprehensive analysis, consider using a larger subset. - -```{r} -# Subsample genes for faster computation -sub_mac <- subsample_genes(mac, ngene = 1000, gselect = "random", seed = 1) -sub_mac %>% nrow() -``` - - - - -## DMSO vs Camptothecin - -We can now run the zero-inflation check on the subsampled data. - -First, we will compare two treatment groups: "DMSO_0" and "CAMPTOTHECIN_10". - - -### Check for zero-inflation - - -```{r} -zi_results <- check_zeroinflation(sub_mac, - group_by = "combined_id", - samples = c("DMSO_0","CAMPTOTHECIN_10"), - batch = 1, - cutoffs = c(0.1, 0.20)) -zi_results$summary_by_group -``` - -From the summary table, we can see the summary statistics for each group, including the number and percentage of genes that are zero-inflated at the specified cutoffs. - -Each of the columns in the summary table are defined as follows: - - - group: The treatment group - - - n_genes: Total number of genes analyzed in the group - - - n_wells: Total number of wells/samples in the group - - - median_p0_obs: Median observed zero proportion across genes in the group - - - median_p0_nb: Median expected zero proportion under the NB model across genes in the group - - - median_ZI: Median zero-inflation index (ZI = p0_nb - p0_obs for each gene) across genes in the group - - - observed_zeros_num: Number of data points with observed zeros (it shouldn't be more than n_genes*n_wells for each group) - - - expected_zeros_num: Number of data points with expected zeros under the NB model (same here, it shouldn't be more than n_genes*n_wells for each group) - - - pct_ZI_gt_0.1: Percentage of genes with ZI greater than 0.1 - - - pct_ZI_gt_0.2: Percentage of genes with ZI greater than 0.2 - - - - -### RLE plots for visualisation - -RLE (Relative Log Expression) plots can be used here to visualize the distribution of gene expression levels across samples within each treatment/control group. It helps to identify any systematic biases or differences in expression distributions that may indicate RUV method s are needed for normalization. - -```{r} -de_groups <- subset(sub_mac, subset = combined_id %in% c("DMSO_0", "CAMPTOTHECIN_10")) -plot_rle(de_groups, label_column = "combined_id") -plot_rle(de_groups, label_column = "combined_id", normalisation = "limma_voom") -plot_rle(de_groups, label_column = "combined_id", normalisation = "DESeq2") -``` - - - - - -### DMSO vs Staruosporine - -Next, we will compare two treatment groups: "DMSO_0" and "Staurosporine_10". As staurosporine is a potent inducer of apoptosis, we might expect to see more zero-inflation in this group compared to DMSO. - - -```{r} -# Check for zero-inflation -zi_results <- check_zeroinflation(sub_mac, - group_by = "combined_id", - samples = c("DMSO_0","Staurosporine_10"), - batch = 1, - cutoffs = c(0.1, 0.20)) -``` - -```{r} -zi_results$summary_by_group -``` - - -```{r} -#select DMSO and Staurosporine groups -de_groups <- subset(sub_mac, subset = combined_id %in% c("DMSO_0", "Staurosporine_10")) -plot_rle(de_groups, label_column = "combined_id") -plot_rle(de_groups, label_column = "combined_id", normalisation = "limma_voom") -plot_rle(de_groups, label_column = "combined_id", normalisation = "DESeq2") -# plot_rle(de_groups, label_column = "combined_id", normalisation = "zinb") -``` \ No newline at end of file From 4b845b2d86793b30711ae8c86a76f325a0e2ea31 Mon Sep 17 00:00:00 2001 From: Xin Liu Date: Wed, 22 Oct 2025 16:42:22 +1100 Subject: [PATCH 10/16] finish up select_robust_controls --- R/select_robust_controls.R | 38 +++++++++++++++++--- man/select_robust_controls.Rd | 8 ++--- tests/testthat/test-select_robust_controls.R | 7 ++++ 3 files changed, 44 insertions(+), 9 deletions(-) create mode 100644 tests/testthat/test-select_robust_controls.R diff --git a/R/select_robust_controls.R b/R/select_robust_controls.R index b250c93..0bf62f8 100644 --- a/R/select_robust_controls.R +++ b/R/select_robust_controls.R @@ -7,7 +7,7 @@ #' plots per-sample expression distributions and sample–sample correlation heatmaps. #' #' @param data A tidyseurat object containing an RNA assay with a **counts** layer. -#' @param combined_id Character scalar: the control/treatment label to keep +#' @param samples the control/treatment label to keep in column samples #' (e.g., `"CB_43_EP73_0"`). Only cells/samples with this label are considered. #' @param orig_ident Character scalar: the plate/batch identifier to keep #' (e.g., `"VH02012942"`). Only cells/samples from this batch are considered. @@ -24,7 +24,7 @@ #' #' @details #' Workflow: -#' 1) Subset to the specified `combined_id` **and** `orig_ident` (plate/batch). +#' 1) Subset to the specified `samples` **and** `orig_ident` (plate/batch). #' 2) Build an `edgeR::DGEList`, filter lowly expressed genes using CPM and `min_samps`. #' 3) Normalize with **TMMwsp** and compute log2-CPM. #' 4) Rank samples by mean Fisher z–transformed correlation to all *other* samples @@ -49,7 +49,7 @@ #' #' @examples #' data(mini_mac) -#' res <- select_robust_controls(mini_mac,combined_id = "DMSO_0", orig_ident = "PMMSq033_mini") +#' res <- select_robust_controls(mini_mac,samples = "DMSO_0", orig_ident = "PMMSq033_mini") #' #' #' @@ -60,7 +60,7 @@ select_robust_controls <- function( data, - combined_id, # e.g. "CB_43_EP73_0" + samples, # e.g. "CB_43_EP73_0" orig_ident, # e.g. "VH02012942" cpm_filter = 1, # CPM threshold for gene filtering min_samps = 16, # number of samples a gene must be expressed in @@ -68,8 +68,36 @@ select_robust_controls <- function( top_n = 5, make_plots = TRUE ){ + validate_inputs <- function(data, samples, orig_ident) { + if (!inherits(data, "Seurat")) { + stop("argument 'data' must be a Seurat or TidySeurat object.") + } + + # check samples and orig_ident columns + if (colnames(data@meta.data)%in% c("combined_id","orig.ident") %>% sum() < 2) { + stop("The 'data' object must contain 'combined_id' and 'orig.ident' columns in its metadata.") + } + # check samples in samples column + if (is.null(samples)){ + stop("Please provide a value for 'samples'.") + } else if (!all(samples %in% unique(data$combined_id))) { + stop("Some values in 'samples' are not found in the 'combined_id' column of 'data'.") + } + # check orig.ident in the orig.ident column + if (is.null(orig_ident)){ + stop("Please provide a value for 'orig_ident'.") + } else if (!orig_ident %in% unique(data$orig.ident)) { + stop("The value of 'orig_ident' is not found in the 'orig.ident' column of 'data'.") + } + return(list(data = data, samples = samples, orig_ident = orig_ident)) + } + validated <- validate_inputs(data = data, samples = samples, orig_ident = orig_ident) + data <- validated$data + group_by <- validated$orig_ident + samples <- validated$samples + corr_method <- match.arg(corr_method) - sel_cells <- colnames(data)[data$combined_id == combined_id & + sel_cells <- colnames(data)[data$combined_id == samples & data$orig.ident == orig_ident] if (length(sel_cells) == 0L) { stop("No cells/samples match the specified 'combined_id' and 'orig_ident'.") diff --git a/man/select_robust_controls.Rd b/man/select_robust_controls.Rd index 0d4de26..e05a997 100644 --- a/man/select_robust_controls.Rd +++ b/man/select_robust_controls.Rd @@ -6,7 +6,7 @@ \usage{ select_robust_controls( data, - combined_id, + samples, orig_ident, cpm_filter = 1, min_samps = 16, @@ -18,7 +18,7 @@ select_robust_controls( \arguments{ \item{data}{A tidyseurat object containing an RNA assay with a \strong{counts} layer.} -\item{combined_id}{Character scalar: the control/treatment label to keep +\item{samples}{the control/treatment label to keep in column samples (e.g., \code{"CB_43_EP73_0"}). Only cells/samples with this label are considered.} \item{orig_ident}{Character scalar: the plate/batch identifier to keep @@ -61,7 +61,7 @@ plots per-sample expression distributions and sample–sample correlation heatma \details{ Workflow: \enumerate{ -\item Subset to the specified \code{combined_id} \strong{and} \code{orig_ident} (plate/batch). +\item Subset to the specified \code{samples} \strong{and} \code{orig_ident} (plate/batch). \item Build an \code{edgeR::DGEList}, filter lowly expressed genes using CPM and \code{min_samps}. \item Normalize with \strong{TMMwsp} and compute log2-CPM. \item Rank samples by mean Fisher z–transformed correlation to all \emph{other} samples @@ -75,7 +75,7 @@ for easier visual inspection in plots. } \examples{ data(mini_mac) -res <- select_robust_controls(mini_mac,combined_id = "DMSO_0", orig_ident = "PMMSq033_mini") +res <- select_robust_controls(mini_mac,samples = "DMSO_0", orig_ident = "PMMSq033_mini") diff --git a/tests/testthat/test-select_robust_controls.R b/tests/testthat/test-select_robust_controls.R new file mode 100644 index 0000000..018484e --- /dev/null +++ b/tests/testthat/test-select_robust_controls.R @@ -0,0 +1,7 @@ +test_that("multiplication works", { + # Load test dataset from tests/testthat/testdata + testdata <- get_testdata() + expect_error(select_robust_controls(testdata, samples=NULL, orig_ident="PLATE")) + res <- select_robust_controls(testdata, samples="DMSO_0", orig_ident="testdata", make_plots = FALSE) + expect_true(is.list(res)) +}) From d1f0aa6478c1d26d14fc60061c81391cb3e09c5c Mon Sep 17 00:00:00 2001 From: Xin Liu Date: Wed, 22 Oct 2025 16:44:15 +1100 Subject: [PATCH 11/16] update vignettes/cross_platform_compatibility.Rmd --- NAMESPACE | 5 + _pkgdown.yaml | 4 +- vignettes/cross_platform_compatibility.Rmd | 972 ++++++++++++++++++++- 3 files changed, 969 insertions(+), 12 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 131faf9..e0b53ce 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -35,6 +35,7 @@ export(plot_qc_metrics_heatmap) export(plot_rle) export(plot_volcano) export(read_metadata) +export(select_robust_controls) export(subsample_genes) export(summarise_de) export(validate_metadata) @@ -66,6 +67,9 @@ importFrom(dplyr,reframe) importFrom(dplyr,rename) importFrom(dplyr,select) importFrom(dplyr,where) +importFrom(edgeR,DGEList) +importFrom(edgeR,calcNormFactors) +importFrom(edgeR,cpm) importFrom(ggiraph,geom_point_interactive) importFrom(ggiraph,girafe) importFrom(ggplot2,aes) @@ -124,6 +128,7 @@ importFrom(stringr,str_trim) importFrom(tibble,column_to_rownames) importFrom(tibble,rownames_to_column) importFrom(tidyr,drop_na) +importFrom(tidyr,pivot_longer) importFrom(tidyr,pivot_wider) importFrom(tidyr,unnest) importFrom(utils,capture.output) diff --git a/_pkgdown.yaml b/_pkgdown.yaml index 3fc5d00..e9510fd 100644 --- a/_pkgdown.yaml +++ b/_pkgdown.yaml @@ -31,8 +31,8 @@ navbar: href: articles/integration_across_modalities.html - text: "Working with Bioconductor classes" href: articles/macpie_bioc.html - - text: "Chck zero-inflation" - href: articles/check_zero_inflation.html + - text: "Assessing zero-inflation" + href: articles/assessing_zero_inflation.html - text: "Reference" href: reference/index.html # Links to your function reference docs. right: diff --git a/vignettes/cross_platform_compatibility.Rmd b/vignettes/cross_platform_compatibility.Rmd index 6ec1a2a..61c8f02 100644 --- a/vignettes/cross_platform_compatibility.Rmd +++ b/vignettes/cross_platform_compatibility.Rmd @@ -40,6 +40,10 @@ This vignette will cover the following two parts: - Performing differential expression and pathway enrichment tests on merged data with limma-voom correction. +**3. Benchmark** + + - Benchmarking the performance of macpie functions on DRUGseq data. It includes both filtering robust DMSO controls and DEG concordance. + The DRUGseq dataset is a large-scale drug screening dataset that includes a large set of small molecules (N = 4,343) tested on U2OS cells. This dataset was retrieved from [Zenodo](https://doi.org/10.5281/zenodo.14291446) (Ozer et al., 2024). @@ -64,15 +68,19 @@ devtools::load_all(paste0(dir, "macpie/")) ```{r setup} -library(macpie) -library(tibble) -library(stringr) -library(pheatmap) -library(ggiraph) -library(tidyseurat) -library(purrr) +suppressMessages(library(macpie)) +suppressMessages(library(tibble)) +suppressMessages(library(stringr)) +suppressMessages(library(pheatmap)) +suppressMessages(library(ggiraph)) +suppressMessages(library(tidyseurat)) +suppressMessages(library(purrr)) +suppressMessages(library(ggrepel)) ``` +```{r} +options(scipen=999, digits=3) +``` ## Converting a DRUGseq plate to a macpie object @@ -318,7 +326,7 @@ Now, we convert the metadata format to macpie metadata format. The idea is to make a combined metadata and a combined count matrix for three plates with the `plate_ID` labelled. -```{r metdata_for_batch24} +```{r} #make a combined metadata for three plates batch24_metadata <- batch24 %>% map_dfr(~ { @@ -353,7 +361,7 @@ batch24_metadata <- batch24_metadata%>%select(-c(batch_id, plate_barcode,plate_i -```{r count_matrix_for_batch24} +```{r} # create a combined UMI matrix for 3 plates batch24_counts <- batch24 %>% map(~ { @@ -382,7 +390,7 @@ binded_counts <- do.call(cbind, batch24_counts) Then, we can read in the combined count matrix and metadata. -```{r load_data_batch24} +```{r} as_mac <- CreateSeuratObject(counts = binded_counts, min.cells = 1, min.features = 1) @@ -626,4 +634,948 @@ pheatmap(enriched_pathways_mat, color = macpie_colours$continuous_rev) ``` +## Benchmark + +### Selecting robust DMSO controls + +We noticed that DRUGseq authors implemented a permutation strategy to identify robust DMSO wells in their analytics pipeline. We agree that robust control selection is critical for comparative benchmarking. The Novartis DRUGseq workflow selects DMSO controls via a 500-permutation procedure that retains wells minimising spurious DMSO–DMSO differentially expressed genes. To make this step practical for routine QC and for researchers with fewer DMSO wells, we implemented an alternative method in our macpie package `select_robust_controls` function that uses correlation-based selection of control wells. + +This function ranks control wells based on their average Fisher-z-transformed correlation coefficients to all other control wells, selecting those with the highest average correlation scores as the 'robust' controls. + + +The DRUGseq data contains three replicate plates with 48 DMSO controls (CB-43-EP73). We applied our `select_robust_controls` function to these DMSO wells. It filters genes, normalises with TMMwsp, computes log-CPM, and ranks wells by mean Fisher-z–transformed correlation to all other replicate wells. The top 5 wells with the highest transformed correlation were then compared to DRUGseq results. + +We benchmark the performance of macpie's `select_robust_controls` function on DRUGseq data to select robust DMSO controls from three replicate plates. + + +#### DRUGseq: 6 robust DMSO wells + +First we load the file of robust DMSO control wells identified by the DRUGseq authors in their github repo. + +Here, we show the robust DMSO wells for batch 24, which contains the three plates we are interested in (VH02012942, VH02012944, VH02012956). + +These 6 robust DMSO wells are used as the benchmark standard to compare with macpie selected robust DMSO wells. + +```{r} +robust_DMSO_DRUGseq <- read.csv(paste0(dir, "DRUGseqData/robust_RC_ReferenceControl_DMSO_wells.txt"), sep="") +robust_DMSO_DRUGseq %>% filter(batch_id==24) %>% select(batch_id, plate_barcode, well_id) +``` + +As from their results of robust DMSO controls, these are the robust DMSO wells for batch 24: + + VH02012942: I23, M23 + + VH02012944: D23, H23 + + VH02012956: I23, J23 + + +*** +
+ + +#### macpie: select robust DMSO wells + + + +Now we will use a function `select_robust_controls` to identify robust DMSO controls from batch 24. + +This function: + + - applies CPM filtering, + + - performs TMMwsp normalisation and computes log-CPM, + + - ranks wells by their mean Fisher-z–transformed sample–sample correlation to all other wells (Pearson or Spearman) + + - selects the top n wells (user-defined) as robust controls. + +```{r} +mac_filtered <- readRDS(paste0(dir, "/DRUGseqData/macpie_filtered_batch24.Rds")) +mac_filtered$combined_id <- str_replace_all(mac_filtered$combined_id, "-","_") +``` + + + + +Now we will apply the `select_robust_controls` function to each of the three plates in batch 24 to identify robust DMSO controls. + +This function generates three plots: + + - Boxplot of log2-CPM (TMMwsp) + + - Sample–sample correlation (Pearson, log2-CPM) + + - Sample–sample correlation (Spearman, log2-CPM) + + These plots help to visualize the distribution of gene expression and the correlation between samples, aiding in the assessment of DMSO control quality. + + +```{r} +#to use lapply on three plates +plates <- c("VH02012942", "VH02012944", "VH02012956") +results <- lapply(plates, function(plate) { + select_robust_controls( + mac_filtered, + combined_id = "CB_43_EP73_0", + orig_ident = plate, + cpm_filter = 1, + min_samps = 8, + corr_method = "spearman", + top_n = 5, + make_plots = FALSE + ) +}) + +names(results) <- plates +``` + + +As make_plots = FALSE in the above function, we can now visualise the plots for each plate separately. If you set make_plots = TRUE, the function will automatically generate the plots for you. + +##### Plate VH02012942 + +```{r} +pheatmap::pheatmap(results$VH02012942$cor_pearson, + main = "Sample-sample Pearson correlation (log2-CPM)", + fontsize_row = 8, fontsize_col = 8) +pheatmap::pheatmap(results$VH02012942$cor_spearman, + main = "Sample-sample Spearman correlation (log2-CPM)", + fontsize_row = 8, fontsize_col = 8) +``` + + +Apart from correlation heatmaps, the function also returns a ranking of wells by their mean correlation to all other wells. + +```{r} +results$VH02012942$scores_mean_to_others +``` + +Finally, it returns the top N wells as robust DMSO controls. + +```{r} +results$VH02012942$topN +``` + +Now we can see for plate VH02012942, the 2 of top 5 DMSO wells selected by macpie are I23 and J23, which are exactly the same as the robust DMSO wells identified by the DRUGseq authors. + + +Let's repeat the same process for the other two plates in batch 24. + +##### Plate VH02012944 + +```{r} +pheatmap::pheatmap(results$VH02012944$cor_pearson, + main = "Sample-sample Pearson correlation (log2-CPM)", + fontsize_row = 8, fontsize_col = 8) +pheatmap::pheatmap(results$VH02012944$cor_spearman, + main = "Sample-sample Spearman correlation (log2-CPM)", + fontsize_row = 8, fontsize_col = 8) +``` + + +Apart from correlation heatmaps, the function also returns a ranking of wells by their mean correlation to all other wells. + +```{r} +results$VH02012944$scores_mean_to_others +``` + +Finally, it returns the top N wells as robust DMSO controls. + +```{r} +results$VH02012944$topN +``` + +For plate VH02012944, the DRUGseq selected D23 and H23 DMSO wells are not in our top 5. However, H23 is ranked 6th by macpie, which is very close to the top 5. + + +##### Plate VH02012956 +```{r} +pheatmap::pheatmap(results$VH02012956$cor_pearson, + main = "Sample-sample Pearson correlation (log2-CPM)", + fontsize_row = 8, fontsize_col = 8) +pheatmap::pheatmap(results$VH02012956$cor_spearman, + main = "Sample-sample Spearman correlation (log2-CPM)", + fontsize_row = 8, fontsize_col = 8) +``` + + +Apart from correlation heatmaps, the function also returns a ranking of wells by their mean correlation to all other wells. + +```{r} +results$VH02012956$scores_mean_to_others +``` + +Finally, it returns the top N wells as robust DMSO controls. + +```{r} +results$VH02012942$topN +``` + +For plate VH02012956, the DRUGseq selected I23 and J23 DMSO wells are exactly the same as our top DMSO wells selected by macpie. + + +### Summary + +To summarise the performance of macpie in selecting robust DMSO wells, we compare our selected top 5 DMSO wells with the DRUGseq authors' selected robust DMSO wells for each plate in batch 24. + +```{r} +# DRUG-seq authors' robust DMSO wells (batch 24) +expected_df <- as_tibble(data.frame( + plate = c("VH02012942", "VH02012942", "VH02012944", "VH02012944", "VH02012956", "VH02012956"), + well = c("I23", "M23", "D23", "H23", "I23", "J23"), + source = "expected")) + + +# Helper: extract topN wells (names are like "VH02012942_I23") +extract_top_wells <- function(res_per_plate) { + tibble(sample = names(res_per_plate$topN), + score = as.numeric(res_per_plate$topN)) |> + mutate( + plate = sub("_.*$", "", sample), + well = sub("^.*_", "", sample), + rank = row_number() + ) |> + select(plate, well, rank, score) +} + +top_df <- map_df(names(results), ~{ + extract_top_wells(results[[.x]]) +}) + + +matched_df <- expected_df |> + left_join(top_df, by = c("plate", "well")) |> + mutate(found = !is.na(rank)) + + +plate_summary <- matched_df |> + group_by(plate) |> + summarise( + expected = n(), + recovered = sum(found), + recovery_rate = recovered / expected + ) + +plate_summary + +overall <- summarise(plate_summary, + total_expected = sum(expected), + total_recovered = sum(recovered), + overall_rate = total_recovered / total_expected) +overall + +``` +```{r} +ggplot(plate_summary, aes(x = plate, y = recovered, fill = plate)) + + geom_col(width = 0.6, show.legend = FALSE) + + geom_text(aes(label = sprintf("%d / %d (%.0f%%)", + recovered, expected, 100*recovery_rate)), + vjust = -0.6) + + ylim(0, max(plate_summary$expected) + 0.8) + + labs(title = "Recovered DRUG-seq robust DMSO wells by plate", + x = "Plate", y = "Recovered wells") + + theme_classic() + +``` + + +In summary, for the three plates in batch 24, macpie successfully identified 4 out of 6 robust DMSO wells (with ~66.7% overall recovery rate) that were also selected by the DRUGseq authors. Only for plate VH02012944, one of the DRUGseq selected DMSO wells (D23) was not in our top 5, but the other well (H23) was ranked 6th by macpie, which is very close to the top 5. This demonstrates that macpie is effective in selecting high-quality DMSO controls for downstream analysis without running permutation tests, making it a computationally efficient choice. + +This function runs for each plate, it does not take into account any batches or plates. If it's a cross-plates design, it is recommended to either compute within-plate or remove plate effects (e.g. using ComBat, limma removeBatchEffect functions) first. + +*** +
+ + +### DEG concordance + +Now we compare macpie implementations of limma-voom, edgeR, DEseq2 and limma-trend against DRUGseq limma-trend result on the FF_86_NH56 (10uM) vs DMSO control from batch 24. We evaluated three control settings: (i) all DMSO wells (48 wells), (ii) top 15 DMSO selected by our correlation-based approach, and (iii) 6 DMSO wells identified by DRUGseq 500 permutation-based method. + +We only focus on up-regulated genes in FF_86_NH56 (genes were called DEG if BH-adjusted p < 0.05 and log2FC > 0). + + + + +#### DRUGseq DEG results + +For the purpose of this vignette, we only load the DRUGseq DEG results for FF_86_NH56 (10uM) vs DMSO control from batch 24. + +```{r} +batch24_de <- readRDS(paste0(dir,"DRUGseqData/DE_batch24.Rds")) +FF86_res <- batch24_de %>% filter(cmpd_sample_id=="FF-86-NH56") +ff86_res_toptable <- FF86_res[,13:18] +ff86_res_toptable <- ff86_res_toptable %>% + separate(gene.ID, into = c("gene", "chrom"), sep = ",") %>% + select(-chrom) %>% mutate(combined_id ="FF_86_NH56_10") %>% + rename(log2FC=logFC, p_value_adj = adj.P.Val) + +``` + +```{r, fig.height=6, fig.width=8} +plot_volcano(ff86_res_toptable, max.overlaps =3) +``` + +```{r} +ff86_res_toptable %>% filter(p_value_adj <0.05 & log2FC >0) %>% nrow() + +drugseq_deg <- ff86_res_toptable %>% filter(p_value_adj <0.05 & log2FC >0) %>% select(gene) %>% pull() +``` + +There are 1423 up-regulated DEGs identified by DRUGseq limma-trend method for FF_86_NH56 (10uM) vs DMSO control from batch 24. + + + +#### macpie DEG results with all DMSO wells + + +Load filered data + +```{r} +mac_filtered <- readRDS(paste0(dir, "/DRUGseqData/macpie_filtered_batch24.Rds")) +mac_filtered$combined_id <- str_replace_all(mac_filtered$combined_id, "-","_") +``` + + + + +##### Differential gene expression + +In here, you can specify a single condition in the combined_id column and compare with DMSO (i.e.CB_43_EP73_0). By using the plate IDs in the column of orig.ident as the input for batch parameter, `compute_singe_de` function can perform differential expression analysis using the preferred method (limma voom in this example) with batch information. + + + +```{r} +methods <- c("limma_voom", "DESeq2", "edgeR", "limma_trend") + +methods_res <- lapply(methods, function(m){ + + message("\n","Processing method: ", m,"\n") + # m<-"limma_voom" + treatment_samples <- "FF_86_NH56_10" + control_samples <- "CB_43_EP73_0" + subset <- mac_filtered%>%filter(combined_id%in%c(treatment_samples,control_samples)) + + batch <- subset$orig.ident + + + top_table <- compute_single_de(subset, treatment_samples, control_samples, method = m, batch = batch) + + # plot(plot_volcano(top_table, max.overlaps = 5)) + alldmso_degs <- top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull() + length(intersect(alldmso_degs, drugseq_deg)) + + top_table <- top_table %>% + arrange(p_value_adj, desc(log2FC)) %>% + mutate(gene = factor(gene, levels = unique(gene))) + # add a column if there are in the intersect(alldmso_degs, drugseq_deg) + top_table <- top_table %>% + mutate(in_drugseq_deg = ifelse(gene %in% intersect(alldmso_degs, drugseq_deg), "yes", "no")) + + # label a few top overlapping genes + lab_genes <- top_table[top_table$in_drugseq_deg=="yes", ] |> + dplyr::arrange(p_value_adj, dplyr::desc(log2FC)) + + volcano_overlap <- ggplot(top_table, aes(x = log2FC, y = -log10(p_value_adj), color = in_drugseq_deg)) + + geom_point(alpha = 0.6, size = 1.2) + + geom_text_repel(data = lab_genes, aes(label = gene), size = 3, max.overlaps = 50) + + scale_color_manual(values = c("no"="#bdbdbd","yes"="#2b8cbe"))+ + theme_classic() + + + #return + result_list <- list(top_table = top_table, + num_degs_macpie = length(alldmso_degs), + n_overlap = length(intersect(alldmso_degs, drugseq_deg)), + volcano_plot = volcano_overlap) + + return(result_list) + +}) + +names(methods_res) <- methods + +``` + + +##### Summary table + + +```{r} +#get a table to show number of DEGs and number of overlapping genes with DRUGseq for each method +deg_summary <- map_df(methods_res, function(x) { + data.frame( + num_degs_macpie = x$num_degs_macpie, + n_overlap = x$n_overlap, + num_degs_DRUGseq = length(drugseq_deg) + ) +}, .id = paste0("macpie_methods")) + +deg_summary +``` + + + +##### Overlapping volcano plot + +```{r} +methods_res$limma_voom$volcano_plot +methods_res$DESeq2$volcano_plot +methods_res$edgeR$volcano_plot +methods_res$limma_trend$volcano_plot +``` + + + + +#### macpie DEG results with 15 robust DMSO wells + +From our select_robust_controls function above, we identified the following top 15 DMSO wells from three plates in batch 24 using `select_robust_controls`: + + +```{r} +batch24 <- readRDS(paste0(dir,"DRUGseqData/Exp_batch24.Rds")) +names(batch24) +``` + +```{r metdata} +#make a combined metadata for three plates +batch24_metadata <- batch24 %>% + map_dfr(~ { + .x$Annotation %>% + mutate( + Plate_ID = plate_barcode, + Well_ID = well_id, + Barcode = paste0(plate_barcode, "_", well_index), + Row = LETTERS[row], + Column = as.integer(col), + Species = "human", + Cell_type = "U2OS", + Model_type = "2D_adherent", + Time = as.factor(hours_post_treatment), + Unit = "h", + Treatment_1 = cmpd_sample_id, + Concentration_1 = as.numeric(concentration), + Unit_1 = unit, + Sample_type = if_else(well_type == "SA" & col == 24, + "Positive Control", + well_type) + ) + }) + + +batch24_metadata <- batch24_metadata%>%select(-c(batch_id, plate_barcode,plate_index, well_id, + well_index, col, row, biosample_id, external_biosample_id, + cmpd_sample_id, well_type, cell_line_name, cell_line_ncn, concentration, unit, hours_post_treatment, Sample)) +``` + + + +```{r count_matrix} +# create a combined UMI matrix for 3 plates +batch24_counts <- batch24 %>% + map(~ { + .x$UMI.counts %>% + as.data.frame() %>% + rownames_to_column("gene_id") %>% + separate(col = gene_id, into = c("gene_name", "chrom"), sep = ",") %>% + mutate(gene_name = make.unique(gene_name)) %>% + select(-chrom) %>% + tibble::column_to_rownames(var = "gene_name") %>% + as.matrix() + }) + +binded_counts <- do.call(cbind, batch24_counts) + + + +``` + + + +```{r load_data} +as_mac <- CreateSeuratObject(counts = binded_counts, + min.cells = 1, + min.features = 1) + +as_mac<- as_mac%>% inner_join(batch24_metadata, by = c(".cell"="Barcode")) +``` + + +```{r} + +as_mac$combined_id <- paste0(as_mac$Treatment_1,"_", as_mac$Concentration_1) + +badDMSO <- as_mac@meta.data %>% filter(Treatment_1 == "CB-43-EP73") %>% + filter((Plate_ID == "VH02012942" & !(Well_ID %in% c("I23", "M23", "K23", "J23","C23"))) | + (Plate_ID == "VH02012944" & !(Well_ID %in% c("I23", "M23", "J23", "G23", "K23")))| + (Plate_ID == "VH02012956" & ! (Well_ID %in% c("I23", "J23", "O23","M23","K23")))) + + + +keep_wells <- setdiff(rownames(as_mac@meta.data), rownames(badDMSO)) + + +mac_badDSMOremoved <- as_mac[,keep_wells] + +mac_badDSMOremoved$combined_id <- str_replace_all(mac_badDSMOremoved$combined_id, "-","_") + + +``` + + +```{r} +min_sample_num <- min(table(mac_badDSMOremoved$combined_id)) +mac_badDSMOremoved <- filter_genes_by_expression(mac_badDSMOremoved, + group_by = "combined_id", min_counts =10, + min_samples = min_sample_num) + +``` + + + + + +##### Differential gene expression + +```{r} +methods <- c("limma_voom", "DESeq2", "edgeR", "limma_trend") + +five_dmso_methods_res <- lapply(methods, function(m){ + message("Processing method: ", m) + # m<-"limma_voom" + treatment_samples <- "FF_86_NH56_10" + control_samples <- "CB_43_EP73_0" + subset <- mac_badDSMOremoved%>%filter(combined_id%in%c(treatment_samples,control_samples)) + + batch <- subset$orig.ident + + badDMSO_out_top_table <- compute_single_de(subset, treatment_samples, control_samples, method = m, batch = batch) + + + # plot(plot_volcano(top_table, max.overlaps = 5)) + badDMSO_out_degs <- badDMSO_out_top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull() + length(intersect(badDMSO_out_degs, drugseq_deg)) + + badDMSO_out_top_table <- badDMSO_out_top_table %>% + arrange(p_value_adj, desc(log2FC)) %>% + mutate(gene = factor(gene, levels = unique(gene))) + # add a column if there are in the intersect(alldmso_degs, drugseq_deg) + badDMSO_out_top_table <- badDMSO_out_top_table %>% + mutate(in_drugseq_deg = ifelse(gene %in% intersect(badDMSO_out_degs, drugseq_deg), "yes", "no")) + + # label a few top overlapping genes + lab_genes <- badDMSO_out_top_table[badDMSO_out_top_table$in_drugseq_deg=="yes", ] |> + dplyr::arrange(p_value_adj, dplyr::desc(log2FC)) + + volcano_overlap <- ggplot(badDMSO_out_top_table, aes(x = log2FC, y = -log10(p_value_adj), color = in_drugseq_deg)) + + geom_point(alpha = 0.6, size = 1.2) + + geom_text_repel(data = lab_genes, aes(label = gene), size = 3, max.overlaps = 10) + + scale_color_manual(values = c("no"="#bdbdbd","yes"="#2b8cbe"))+ + theme_classic() + + #return + result_list <- list(top_table = badDMSO_out_top_table, + num_degs_macpie = length(badDMSO_out_degs), + n_overlap = length(intersect(badDMSO_out_degs, drugseq_deg)), + volcano_plot = volcano_overlap) + return(result_list) + +}) + +names(five_dmso_methods_res) <- methods + +``` + +##### Summary table + +```{r} +#get a table to show number of DEGs and number of overlapping genes with DRUGseq for each method +deg_summary <- map_df(five_dmso_methods_res, function(x) { + data.frame( + num_degs_macpie = x$num_degs_macpie, + n_overlap = x$n_overlap, + num_degs_DRUGseq = length(drugseq_deg) + ) +}, .id = paste0("macpie_methods")) + +deg_summary +``` + + +##### Overlapping volcano plot + +```{r} +five_dmso_methods_res$limma_voom$volcano_plot +five_dmso_methods_res$DESeq2$volcano_plot +five_dmso_methods_res$edgeR$volcano_plot +five_dmso_methods_res$limma_trend$volcano_plot +``` + + + + +#### macpie DEG results with 6 robust DMSO wells from DRUGseq + +From public DRUG-seq analysis pipeline, authors identified two reference controls: all DMSO wells and the ‘robust DMSO’ wells. + +We know these robust DMSO wells for batch 24 from their published data: + + - VH02012942: I23, M23 + + - VH02012944: D23, H23 + + - VH02012956: I23, J23 + + +```{r} +batch24 <- readRDS(paste0(dir,"DRUGseqData/Exp_batch24.Rds")) +names(batch24) +``` + +```{r metdata_for_batch24} +#make a combined metadata for three plates +batch24_metadata <- batch24 %>% + map_dfr(~ { + .x$Annotation %>% + mutate( + Plate_ID = plate_barcode, + Well_ID = well_id, + Barcode = paste0(plate_barcode, "_", well_index), + Row = LETTERS[row], + Column = as.integer(col), + Species = "human", + Cell_type = "U2OS", + Model_type = "2D_adherent", + Time = as.factor(hours_post_treatment), + Unit = "h", + Treatment_1 = cmpd_sample_id, + Concentration_1 = as.numeric(concentration), + Unit_1 = unit, + Sample_type = if_else(well_type == "SA" & col == 24, + "Positive Control", + well_type) + ) + }) + + +batch24_metadata <- batch24_metadata%>%select(-c(batch_id, plate_barcode,plate_index, well_id, + well_index, col, row, biosample_id, external_biosample_id, + cmpd_sample_id, well_type, cell_line_name, cell_line_ncn, concentration, unit, hours_post_treatment, Sample)) +``` + + + +```{r count_matrix_for_batch24} +# create a combined UMI matrix for 3 plates +batch24_counts <- batch24 %>% + map(~ { + .x$UMI.counts %>% + as.data.frame() %>% + rownames_to_column("gene_id") %>% + separate(col = gene_id, into = c("gene_name", "chrom"), sep = ",") %>% + mutate(gene_name = make.unique(gene_name)) %>% + select(-chrom) %>% + tibble::column_to_rownames(var = "gene_name") %>% + as.matrix() + }) + +binded_counts <- do.call(cbind, batch24_counts) + + + +``` + + + +```{r load_data_batch24} +as_mac <- CreateSeuratObject(counts = binded_counts, + min.cells = 1, + min.features = 1) + +as_mac<- as_mac%>% inner_join(batch24_metadata, by = c(".cell"="Barcode")) +``` + + + + +```{r} + +as_mac$combined_id <- paste0(as_mac$Treatment_1,"_", as_mac$Concentration_1) + +badDMSO <- as_mac@meta.data %>% filter(Treatment_1 == "CB-43-EP73") %>% + filter((Plate_ID == "VH02012942" & !(Well_ID %in% c("I23", "M23"))) | + (Plate_ID == "VH02012944" & !(Well_ID %in% c("D23", "H23")))| + (Plate_ID == "VH02012956" & ! (Well_ID %in% c("I23", "J23")))) + + + +keep_wells <- setdiff(rownames(as_mac@meta.data), rownames(badDMSO)) + + +mac_badDSMOremoved <- as_mac[,keep_wells] + +mac_badDSMOremoved$combined_id <- str_replace_all(mac_badDSMOremoved$combined_id, "-","_") + + +``` + + +```{r} +min_sample_num <- min(table(mac_badDSMOremoved$combined_id)) +mac_badDSMOremoved <- filter_genes_by_expression(mac_badDSMOremoved, + group_by = "combined_id", min_counts =10, + min_samples = min_sample_num) + +``` + + + + + + +##### Differential gene expression + + +```{r} +methods <- c("limma_voom", "DESeq2", "edgeR", "limma_trend") + +robust_dmso_methods_res <- lapply(methods, function(m){ + message("Processing method: ", m) + # m<-"limma_voom" + treatment_samples <- "FF_86_NH56_10" + control_samples <- "CB_43_EP73_0" + subset <- mac_badDSMOremoved%>%filter(combined_id%in%c(treatment_samples,control_samples)) + + batch <- subset$orig.ident + + badDMSO_out_top_table <- compute_single_de(subset, treatment_samples, control_samples, method = m, batch = batch) + + # plot(plot_volcano(top_table, max.overlaps = 5)) + badDMSO_out_degs <- badDMSO_out_top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull() + length(intersect(badDMSO_out_degs, drugseq_deg)) + + badDMSO_out_top_table <- badDMSO_out_top_table %>% + arrange(p_value_adj, desc(log2FC)) %>% + mutate(gene = factor(gene, levels = unique(gene))) + # add a column if there are in the intersect(alldmso_degs, drugseq_deg) + badDMSO_out_top_table <- badDMSO_out_top_table %>% + mutate(in_drugseq_deg = ifelse(gene %in% intersect(badDMSO_out_degs, drugseq_deg), "yes", "no")) + + # label a few top overlapping genes + lab_genes <- badDMSO_out_top_table[badDMSO_out_top_table$in_drugseq_deg=="yes", ] |> + dplyr::arrange(p_value_adj, dplyr::desc(log2FC)) + + volcano_overlap <- ggplot(badDMSO_out_top_table, aes(x = log2FC, y = -log10(p_value_adj), color = in_drugseq_deg)) + + geom_point(alpha = 0.6, size = 1.2) + + geom_text_repel(data = lab_genes, aes(label = gene), size = 3, max.overlaps = 10) + + scale_color_manual(values = c("no"="#bdbdbd","yes"="#2b8cbe"))+ + theme_classic() + + #return + result_list <- list(top_table = badDMSO_out_top_table, + num_degs_macpie = length(badDMSO_out_degs), + n_overlap = length(intersect(badDMSO_out_degs, drugseq_deg)), + volcano_plot = volcano_overlap) + return(result_list) + +}) + +names(robust_dmso_methods_res) <- methods + +``` + +##### Summary table + +```{r} +#get a table to show number of DEGs and number of overlapping genes with DRUGseq for each method +deg_summary <- map_df(robust_dmso_methods_res, function(x) { + data.frame( + num_degs_macpie = x$num_degs_macpie, + n_overlap = x$n_overlap, + num_degs_DRUGseq = length(drugseq_deg) + ) +}, .id = paste0("macpie_methods")) + +deg_summary +``` + + +##### Overlapping volcano plot + +```{r} +robust_dmso_methods_res$limma_voom$volcano_plot +robust_dmso_methods_res$DESeq2$volcano_plot +robust_dmso_methods_res$edgeR$volcano_plot +robust_dmso_methods_res$limma_trend$volcano_plot +``` + + + + +### Summary of DEG concordance + +To compare DEGs with different replicate numbers and different methods + +```{r} +methods <- c("limma_voom", "DESeq2", "edgeR", "limma_trend") + +get_jaccard <- function(deg_set, drugseq_deg){ + intersection <- length(intersect(deg_set, drugseq_deg)) + union <- length(union(deg_set, drugseq_deg)) + jaccard_index <- intersection / union + return(jaccard_index) +} + +jaccard_index <- lapply(methods, function(m){ + # all dmso + degs <- methods_res[[m]]$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull() + jaccard_all <- get_jaccard(degs, drugseq_deg) + # five dmso + degs <- five_dmso_methods_res[[m]]$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull() + jaccard_five <- get_jaccard(degs, drugseq_deg) + # three dmso + degs <- robust_dmso_methods_res[[m]]$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull() + jaccard_three <- get_jaccard(degs, drugseq_deg) + jaccard_index <- data.frame( + method = m, + jaccard_all = jaccard_all, + jaccard_five = jaccard_five, + jaccard_three = jaccard_three + ) + return(jaccard_index) +}) + +df <- as.data.frame(do.call(rbind, jaccard_index)) +rownames(df) <- df$method +df <- df %>% select(-method) +colnames(df) <- c("All DMSO", "macpie: 15 DMSO", "DRUGseq: 6 DMSO") +pheatmap::pheatmap(df, + cluster_rows = FALSE, + cluster_cols = FALSE, + display_numbers = TRUE, + main = "Jaccard Index between macpie DEGs and DRUGseq DEGs") + +``` + +#### Overlap of DEGs using all DMSO wells + + +```{r} +library(UpSetR) +all_dmso <- list( + limma_voom = methods_res$limma_voom$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), + DESeq2 = methods_res$DESeq2$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), + edgeR = methods_res$edgeR$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), + limma_trend = methods_res$limma_trend$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), + DRUGseq = drugseq_deg +) +upset(fromList(all_dmso), + nsets = 5, + order.by = "freq", + main.bar.color = "black", + sets.bar.color = "gray23", + text.scale = c(2, 2, 2, 1.5, 2, 1.5), + mainbar.y.label = "Number of common DEGs", + sets.x.label = "Number of DEGs") +``` + +#### Overlap of DEGs using 15 DMSO wells + +```{r} +five_dmso <- list( + limma_voom = five_dmso_methods_res$limma_voom$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), + DESeq2 = five_dmso_methods_res$DESeq2$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), + edgeR = five_dmso_methods_res$edgeR$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), + limma_trend = five_dmso_methods_res$limma_trend$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), + DRUGseq = drugseq_deg +) +upset(fromList(five_dmso), + nsets = 5, + order.by = "freq", + main.bar.color = "black", + sets.bar.color = "gray23", + text.scale = c(2, 2, 2, 1.5, 2, 1.5), + mainbar.y.label = "Number of common DEGs", + sets.x.label = "Number of DEGs") +``` + + +#### Overlap of DEGs using 6 DMSO wells + +```{r} +robust_dmso <- list( + limma_voom = robust_dmso_methods_res$limma_voom$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), + DESeq2 = robust_dmso_methods_res$DESeq2$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), + edgeR = robust_dmso_methods_res$edgeR$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), + limma_trend = robust_dmso_methods_res$limma_trend$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), + DRUGseq = drugseq_deg +) +upset(fromList(robust_dmso), + nsets = 5, + order.by = "freq", + main.bar.color = "black", + sets.bar.color = "gray23", + text.scale = c(2, 2, 2, 1.5, 2, 1.5), + mainbar.y.label = "Number of common DEGs", + sets.x.label = "Number of DEGs") +``` + +From Jaccard heatmap and UpSet plots, with all 48 DMSO controls, DESeq2 and edgeR show the largest overlaps with DRGseq (Jaccard = 0.29 and 0.24; UpSet intersections in the hundreds). Limma-voom shows moderate similarity (J=0.15), and limma-trend returns a smaller set (J=0.18). Using 15 macpie-selected DMSO reduces totals and overlaps for most methods; DEseq2 and edgeR remains relatively stable (J = 0.28 and 0.24). With only 6 DRUGseq controls, the DEG sets shrink and pair-wise intersection numbers drop. Especially DEseq2 drops down to J=0.01 and limma_voom reduces to J=0.08. While edgeR remains relatively stable with J=0.23. limma-trend run yields very few DEGs under this setting. + + +*** +
+ + + + +## Conclusion + +### Why robust DMSO subsets can reduce concordance with DRUGseq DEGs? + +Possible reasons are that with fewer control samples, our group-aware filter (>= 10 counts in at least 3 samples) becomes more stringent, leading to fewer genes being tested in the DE analysis. This reduction in the number of tested genes can impact the identification of DEGs and their overlap with DRUGseq results. Additionally, having fewer control samples can increase variability in the estimates of dispersion, which can affect the statistical power to detect true DEGs. This increased variability may lead to less consistent results across different methods, thereby reducing concordance with DRUGseq DEGs. + +Robust DMSO are selected based on their similarity in overall expression profiles, which may slightly shift normalisation and the mean-variance trend when estimating by TMM/TMMwsp. + + +### Minimal workflow we recommend for macpie DEG analysis + +1. QC and filtering: + +- Use group-aware filtering `filter_genes_by_expression` to retain genes with consistent expression within each treatment. + +- Examine any batch/unwanted variation, or potential outliers in the data. + +2. Check zero-inflation: + +- Run `check_zeroinflation` before and after filtering to ensure that zero-inflation is minimized. + +- Decision heuristic: for example, if observed zero proportions are significantly higher than expected under NB after filtering, consider using methods that account for zero-inflation. + +3. Pick controls consciously: + +- Start with all available controls and assess any potential outliers. + +4. Choose a DE method: + +- If zero-inflation is present, use a method that accounts for it (`compute_single_de` edgeR with ZINB-WaVE weights). + +- If zero-inflation is not a concern, + + - edgeR (QLFit) and limma-voom (with TMMwsp): both methods are designed to account for gene-specific variability and handle heteroscedasticity effectively. + + - limma-trend: remains suitable for uniformly sequenced experiments as it assumes similar library sizes/sequencing depth. + + - DESeq2: strong shrinkage & automatic independent filtering. + +- If batch/unwanted variation, + + - include them in the design (~ batch + condition) or adjust with limma’s removeBatchEffect / edgeR design. + + - if strong, apply RUVseq (e.g. RUVg with empirical negative controls or RUVs with replicate samples) before DE. Re-compute normalization after RUV and re-fit your DE model. + + +*** +
From 8d1c8882488297fe5f929171ea4408cc922a1bd5 Mon Sep 17 00:00:00 2001 From: Xin Liu Date: Wed, 22 Oct 2025 17:30:49 +1100 Subject: [PATCH 12/16] fix non-ASCII in select_robust_controls.R --- R/select_robust_controls.R | 70 ++++++++++++++++++----------------- R/utils.R | 3 +- man/select_robust_controls.Rd | 59 ++++++++++++++--------------- 3 files changed, 68 insertions(+), 64 deletions(-) diff --git a/R/select_robust_controls.R b/R/select_robust_controls.R index 0bf62f8..1a8d3eb 100644 --- a/R/select_robust_controls.R +++ b/R/select_robust_controls.R @@ -2,57 +2,59 @@ #' #' @description #' For a given control group (e.g., DMSO) on a specific plate/batch, this function -#' ranks samples by their average correlation (Fisher z–averaged) to all *other* +#' ranks samples by their average correlation (Fisher z-averaged) to all *other* #' samples using edgeR's TMMwsp-normalized log2-CPM. It returns the ranking and (optionally) -#' plots per-sample expression distributions and sample–sample correlation heatmaps. +#' plots per-sample expression distributions and sample-sample correlation heatmaps. #' #' @param data A tidyseurat object containing an RNA assay with a **counts** layer. #' @param samples the control/treatment label to keep in column samples -#' (e.g., `"CB_43_EP73_0"`). Only cells/samples with this label are considered. +#' (e.g., "CB_43_EP73_0"). Only cells/samples with this label are considered. #' @param orig_ident Character scalar: the plate/batch identifier to keep -#' (e.g., `"VH02012942"`). Only cells/samples from this batch are considered. +#' (e.g., "VH02012942"). Only cells/samples from this batch are considered. #' @param cpm_filter Numeric scalar; CPM threshold used for gene filtering prior to -#' normalization (default `1`). -#' @param min_samps Integer; a gene must be expressed (CPM > `cpm_filter`) in at least -#' this many samples to be retained (default `16`). +#' normalization (default 1). +#' @param min_samps Integer; a gene must be expressed (CPM > cpm_filter) in at least +#' this many samples to be retained (default 16). #' @param corr_method Correlation type used for ranking; one of -#' `c("spearman","pearson")` (default `"spearman"`). -#' @param top_n Integer; the number of top-ranked samples to report in `topN`. -#' Ties at the cutoff are kept (default `5`). -#' @param make_plots Logical; if `TRUE`, print a log2-CPM boxplot and Pearson/Spearman -#' correlation heatmaps (default `TRUE`). +#' c("spearman","pearson") (default "spearman"). +#' @param top_n Integer; the number of top-ranked samples to report in topN. +#' Ties at the cutoff are kept (default 5). +#' @param make_plots Logical; if TRUE, print a log2-CPM boxplot and Pearson/Spearman +#' correlation heatmaps (default TRUE). #' #' @details #' Workflow: -#' 1) Subset to the specified `samples` **and** `orig_ident` (plate/batch). -#' 2) Build an `edgeR::DGEList`, filter lowly expressed genes using CPM and `min_samps`. -#' 3) Normalize with **TMMwsp** and compute log2-CPM. -#' 4) Rank samples by mean Fisher z–transformed correlation to all *other* samples -#' (according to `corr_method`). +#' 1) Subset to the specified samples and orig_ident (plate/batch). +#' 2) Build an edgeR::DGEList, filter lowly expressed genes using CPM and min_samps. +#' 3) Normalize with TMMwsp and compute log2-CPM. +#' 4) Rank samples by mean Fisher z transformed correlation to all other samples +#' (according to corr_method). #' 5) Return the ranking, correlation matrices, the normalized matrix, and (optionally) #' plots for QC. #' -#' Column names of the counts matrix are rewritten to `"_"` +#' Column names of the counts matrix are rewritten to "_" #' for easier visual inspection in plots. #' #' @return A list with elements: -#' \item{subset_obj}{The Seurat object subset used for analysis.} -#' \item{dge}{The filtered `edgeR::DGEList`.} -#' \item{log_cpm_tmm}{Matrix of TMMwsp log2-CPM (genes × samples).} -#' \item{boxplot_df}{Long-format data frame used for the boxplot (`gene`, `sample`, `log_cpm`).} -#' \item{cor_pearson}{Sample–sample Pearson correlation matrix.} -#' \item{cor_spearman}{Sample–sample Spearman correlation matrix.} -#' \item{ranking_method}{The correlation method used for ranking.} -#' \item{scores_mean_to_others}{Named numeric vector of mean Fisher-z back-transformed -#' correlations (higher = better), sorted decreasing.} -#' \item{topN}{Named numeric vector of the top-ranked samples (ties at the cutoff kept).} -#' +#' * subset_obj: The Seurat object subset used for analysis. +#' * dge: The filtered edgeR::DGEList +#' * log_cpm_tmm: Matrix of TMMwsp log2-CPM. +#' * boxplot_df: Long-format data frame used for the boxplot (gene, sample, log_cpm). +#' * cor_pearson: Sample-sample Pearson correlation matrix. +#' * cor_spearman: Sample-sample Spearman correlation matrix. +#' * ranking_method: The correlation method used for ranking. +#' * scores_mean_to_others: Named numeric vector of mean Fisher-z back-transformed +#' correlations (higher = better), sorted decreasing. +#' * topN: Named numeric vector of the top-ranked samples (ties at the cutoff kept). + +#' +#' #' @examples #' data(mini_mac) #' res <- select_robust_controls(mini_mac,samples = "DMSO_0", orig_ident = "PMMSq033_mini") #' #' -#' +#' @importFrom rlang .data #' @importFrom edgeR DGEList calcNormFactors cpm #' @importFrom tibble rownames_to_column #' @importFrom tidyr pivot_longer @@ -72,7 +74,7 @@ select_robust_controls <- function( if (!inherits(data, "Seurat")) { stop("argument 'data' must be a Seurat or TidySeurat object.") } - + # check samples and orig_ident columns if (colnames(data@meta.data)%in% c("combined_id","orig.ident") %>% sum() < 2) { stop("The 'data' object must contain 'combined_id' and 'orig.ident' columns in its metadata.") @@ -123,7 +125,7 @@ select_robust_controls <- function( warning("Package 'ggplot2' not available; skipping boxplot.") } else { print( - ggplot2::ggplot(df_long, ggplot2::aes(x = sample, y = log_cpm)) + + ggplot2::ggplot(df_long, ggplot2::aes(x = .data$sample, y = .data$log_cpm)) + ggplot2::geom_boxplot(outlier.size = 0.5) + ggplot2::labs(x = "Sample", y = "log2 CPM", title = "Boxplot of log2-CPM (TMMwsp)") + @@ -139,8 +141,8 @@ select_robust_controls <- function( if (!requireNamespace("pheatmap", quietly = TRUE)) { warning("Package 'pheatmap' not available; skipping heatmaps.") } else { - pheatmap::pheatmap(cors_pear, main = "Sample–sample correlation (Pearson, log2-CPM)") - pheatmap::pheatmap(cors_spear, main = "Sample–sample correlation (Spearman, log2-CPM)") + pheatmap::pheatmap(cors_pear, main = "Pearson correlation") + pheatmap::pheatmap(cors_spear, main = "Spearman correlation") } } # Ranking by mean Fisher-z correlation to all *other* samples diff --git a/R/utils.R b/R/utils.R index 0718920..3036c14 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,3 +1,4 @@ utils::globalVariables(c("diff_expressed", "gene_labels", ".cell", "UMAPde_1", "UMAPde_2", - "padj", "target", "target_id")) \ No newline at end of file + "padj", "target", "target_id", + "log_cpm")) \ No newline at end of file diff --git a/man/select_robust_controls.Rd b/man/select_robust_controls.Rd index e05a997..fb4254f 100644 --- a/man/select_robust_controls.Rd +++ b/man/select_robust_controls.Rd @@ -19,58 +19,60 @@ select_robust_controls( \item{data}{A tidyseurat object containing an RNA assay with a \strong{counts} layer.} \item{samples}{the control/treatment label to keep in column samples -(e.g., \code{"CB_43_EP73_0"}). Only cells/samples with this label are considered.} +(e.g., "CB_43_EP73_0"). Only cells/samples with this label are considered.} \item{orig_ident}{Character scalar: the plate/batch identifier to keep -(e.g., \code{"VH02012942"}). Only cells/samples from this batch are considered.} +(e.g., "VH02012942"). Only cells/samples from this batch are considered.} \item{cpm_filter}{Numeric scalar; CPM threshold used for gene filtering prior to -normalization (default \code{1}).} +normalization (default 1).} -\item{min_samps}{Integer; a gene must be expressed (CPM > \code{cpm_filter}) in at least -this many samples to be retained (default \code{16}).} +\item{min_samps}{Integer; a gene must be expressed (CPM > cpm_filter) in at least +this many samples to be retained (default 16).} \item{corr_method}{Correlation type used for ranking; one of -\code{c("spearman","pearson")} (default \code{"spearman"}).} +c("spearman","pearson") (default "spearman").} -\item{top_n}{Integer; the number of top-ranked samples to report in \code{topN}. -Ties at the cutoff are kept (default \code{5}).} +\item{top_n}{Integer; the number of top-ranked samples to report in topN. +Ties at the cutoff are kept (default 5).} -\item{make_plots}{Logical; if \code{TRUE}, print a log2-CPM boxplot and Pearson/Spearman -correlation heatmaps (default \code{TRUE}).} +\item{make_plots}{Logical; if TRUE, print a log2-CPM boxplot and Pearson/Spearman +correlation heatmaps (default TRUE).} } \value{ A list with elements: -\item{subset_obj}{The Seurat object subset used for analysis.} -\item{dge}{The filtered \code{edgeR::DGEList}.} -\item{log_cpm_tmm}{Matrix of TMMwsp log2-CPM (genes × samples).} -\item{boxplot_df}{Long-format data frame used for the boxplot (\code{gene}, \code{sample}, \code{log_cpm}).} -\item{cor_pearson}{Sample–sample Pearson correlation matrix.} -\item{cor_spearman}{Sample–sample Spearman correlation matrix.} -\item{ranking_method}{The correlation method used for ranking.} -\item{scores_mean_to_others}{Named numeric vector of mean Fisher-z back-transformed -correlations (higher = better), sorted decreasing.} -\item{topN}{Named numeric vector of the top-ranked samples (ties at the cutoff kept).} +\itemize{ +\item subset_obj: The Seurat object subset used for analysis. +\item dge: The filtered edgeR::DGEList +\item log_cpm_tmm: Matrix of TMMwsp log2-CPM. +\item boxplot_df: Long-format data frame used for the boxplot (gene, sample, log_cpm). +\item cor_pearson: Sample-sample Pearson correlation matrix. +\item cor_spearman: Sample-sample Spearman correlation matrix. +\item ranking_method: The correlation method used for ranking. +\item scores_mean_to_others: Named numeric vector of mean Fisher-z back-transformed +correlations (higher = better), sorted decreasing. +\item topN: Named numeric vector of the top-ranked samples (ties at the cutoff kept). +} } \description{ For a given control group (e.g., DMSO) on a specific plate/batch, this function -ranks samples by their average correlation (Fisher z–averaged) to all \emph{other} +ranks samples by their average correlation (Fisher z-averaged) to all \emph{other} samples using edgeR's TMMwsp-normalized log2-CPM. It returns the ranking and (optionally) -plots per-sample expression distributions and sample–sample correlation heatmaps. +plots per-sample expression distributions and sample-sample correlation heatmaps. } \details{ Workflow: \enumerate{ -\item Subset to the specified \code{samples} \strong{and} \code{orig_ident} (plate/batch). -\item Build an \code{edgeR::DGEList}, filter lowly expressed genes using CPM and \code{min_samps}. -\item Normalize with \strong{TMMwsp} and compute log2-CPM. -\item Rank samples by mean Fisher z–transformed correlation to all \emph{other} samples -(according to \code{corr_method}). +\item Subset to the specified samples and orig_ident (plate/batch). +\item Build an edgeR::DGEList, filter lowly expressed genes using CPM and min_samps. +\item Normalize with TMMwsp and compute log2-CPM. +\item Rank samples by mean Fisher z transformed correlation to all other samples +(according to corr_method). \item Return the ranking, correlation matrices, the normalized matrix, and (optionally) plots for QC. } -Column names of the counts matrix are rewritten to \code{"_"} +Column names of the counts matrix are rewritten to "_" for easier visual inspection in plots. } \examples{ @@ -78,5 +80,4 @@ data(mini_mac) res <- select_robust_controls(mini_mac,samples = "DMSO_0", orig_ident = "PMMSq033_mini") - } From e35a142cbabeb162a0c8f88b962e6d619f9b824c Mon Sep 17 00:00:00 2001 From: nbartonicek Date: Tue, 21 Oct 2025 13:50:07 +1100 Subject: [PATCH 13/16] Fix single_de Seurat --- R/compute_single_de.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/compute_single_de.R b/R/compute_single_de.R index 3d7043b..2758cd8 100644 --- a/R/compute_single_de.R +++ b/R/compute_single_de.R @@ -179,7 +179,7 @@ compute_single_de <- function(data = NULL, latent.vars = "batch", test.use = "wilcox") %>% select("avg_log2FC", "p_val", "p_val_adj") %>% - rename("log2FC" = "avg_log2FC", "p_val" = "p_val", "p_value_adj" = "p_val_adj") %>% + rename("log2FC" = "avg_log2FC", "p_val" = "p_value", "p_value_adj" = "p_val_adj") %>% mutate(metric = 1) %>% rownames_to_column("gene") @@ -191,7 +191,7 @@ compute_single_de <- function(data = NULL, model_matrix <- if (length(batch) == 1) model.matrix(~0 + combined_id) else model.matrix(~0 + combined_id + batch) if (length(spikes) == 0) { - warning("List of control genes not provided for RUVg, using default.") + warning("List of control genes not provided for RUVg, using default human housekeeping genes.") spikes <- c( "ACTB", "GAPDH", From 11ef97014f815b5411a8a67bbfdd3cd8abacb840 Mon Sep 17 00:00:00 2001 From: nbartonicek Date: Tue, 21 Oct 2025 14:00:08 +1100 Subject: [PATCH 14/16] Fix single_de Seurat --- R/compute_single_de.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/compute_single_de.R b/R/compute_single_de.R index 2758cd8..5f85405 100644 --- a/R/compute_single_de.R +++ b/R/compute_single_de.R @@ -179,7 +179,7 @@ compute_single_de <- function(data = NULL, latent.vars = "batch", test.use = "wilcox") %>% select("avg_log2FC", "p_val", "p_val_adj") %>% - rename("log2FC" = "avg_log2FC", "p_val" = "p_value", "p_value_adj" = "p_val_adj") %>% + rename("log2FC" = "avg_log2FC", "p_value" = "p_val", "p_value_adj" = "p_val_adj") %>% mutate(metric = 1) %>% rownames_to_column("gene") From f03d5a03e3631c119d35b8eb4cff41cd941cb1e0 Mon Sep 17 00:00:00 2001 From: nbartonicek Date: Tue, 21 Oct 2025 21:58:33 +1100 Subject: [PATCH 15/16] Fix limma_zinb --- R/compute_single_de.R | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/R/compute_single_de.R b/R/compute_single_de.R index 5f85405..2a3af1a 100644 --- a/R/compute_single_de.R +++ b/R/compute_single_de.R @@ -12,6 +12,7 @@ utils::globalVariables(c("model_matrix")) #' number of samples #' @param k Parameter k for RUVSeq methods, check RUVSeq tutorial #' @param spikes List of genes to use as spike controls +#' @param num_cores Number of cores for parallel processing #' @importFrom limma makeContrasts eBayes contrasts.fit topTable #' @importFrom tibble rownames_to_column #' @import DESeq2 @@ -34,7 +35,8 @@ compute_single_de <- function(data = NULL, method = NULL, batch = 1, k = 2, - spikes = NULL) { + spikes = NULL, + num_cores = 1) { if (!requireNamespace("SummarizedExperiment", quietly = TRUE)) { stop( "compute_single_de(): the following package is required but not installed: SummarizedExperiment", @@ -48,7 +50,7 @@ compute_single_de <- function(data = NULL, method <- if (is.null(method)) "limma_voom" else method if (!method %in% c("Seurat_wilcox", "DESeq2", "edgeR", "RUVg", "RUVs", "RUVr", - "limma_voom", "limma_trend")) { + "limma_voom", "limma_trend","edgeR_zinb", "limma_zinb")) { stop("Your normalization method is not available.") } if (is.null(treatment_samples) || is.null(control_samples)) { @@ -317,20 +319,25 @@ compute_single_de <- function(data = NULL, return(as.data.frame(top_table)) } - de_zinb <- function(data, pheno_data, treatment_samples, control_samples, batch, k) { + de_edgeR_zinb <- function(data, pheno_data, treatment_samples, control_samples, batch, k) { data_sce<-as.SingleCellExperiment(data) - filtered_sce <- data_sce[rowSums(counts(data_sce)) > 50, ] - num_cores <- 8 # Change this based on your system + filtered_sce <- data_sce[rowSums(counts(data_sce)) > 0, ] cl <- makeCluster(num_cores) doParallel::registerDoParallel(cl) p <- BiocParallel::DoparParam() system.time(zinb <- zinbwave::zinbwave(filtered_sce, K = 2, - epsilon=1000, + epsilon=12000, BPPARAM = p, observationalWeights = TRUE)) weights <- SummarizedExperiment::assay(zinb, "weights") + + #perform_edgeR + combined_id <- data$combined_id + model_matrix <- if (length(batch) == 1) model.matrix(~0 + combined_id) else + model.matrix(~0 + combined_id + batch) + dge <- DGEList(SummarizedExperiment::assay(zinb)) dge <- calcNormFactors(dge, method = "TMMwsp") @@ -376,7 +383,8 @@ compute_single_de <- function(data = NULL, RUVg = de_ruvg(data, pheno_data, treatment_samples, control_samples, batch, spikes, k), RUVs = de_ruvs(data, pheno_data, treatment_samples, control_samples, batch, k), RUVr = de_ruvr(data, pheno_data, treatment_samples, control_samples, batch, k), - zinb = de_zinb(data, pheno_data, treatment_samples, control_samples, batch, k), + edgeR_zinb = de_edgeR_zinb(data, pheno_data, treatment_samples, control_samples, batch, k), + limma_zinb = limma_zinb(data, pheno_data, treatment_samples, control_samples, batch, k), stop("Unsupported DE method.") ) return(de_data) From cacfdb26201da61b46287b0b68b3586ad99398c8 Mon Sep 17 00:00:00 2001 From: nbartonicek Date: Sat, 25 Oct 2025 12:33:33 +1100 Subject: [PATCH 16/16] Fix limma trend method --- R/compute_multi_de.R | 2 +- compute_single_de.R | 425 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 426 insertions(+), 1 deletion(-) create mode 100644 compute_single_de.R diff --git a/R/compute_multi_de.R b/R/compute_multi_de.R index e98ddef..0f01061 100644 --- a/R/compute_multi_de.R +++ b/R/compute_multi_de.R @@ -58,7 +58,7 @@ compute_multi_de <- function(data = NULL, method <- if (is.null(method)) "edgeR" else method if (!method %in% c("Seurat_wilcox", "DESeq2", "edgeR", "RUVg", "RUVs", "RUVr", - "limma_voom")) { + "limma_voom","limma_trend")) { stop("Your normalization method is not available.") } if (is.null(control_samples)) { diff --git a/compute_single_de.R b/compute_single_de.R new file mode 100644 index 0000000..f47983c --- /dev/null +++ b/compute_single_de.R @@ -0,0 +1,425 @@ +utils::globalVariables(c("model_matrix")) +#' Retrieve normalised counts of MAC-seq data +#' +#' This function retrieves counts from a number of methods that are available +#' for normalisation, with the default of limma-voomLmFit. +#' @param data A tidyseurat object merged with metadata. Must contain columns +#' "Well_ID", "Row", "Column" +#' @param treatment_samples Value in the column "combined_id" representing replicates of treatment samples in the data +#' @param control_samples Value in the column "combined_id" representing replicates of control samples in the data +#' @param method One of "Seurat_wilcox", "DESeq2", "edgeR", "RUVg", "RUVs", "RUVr", "limma_voom", "limma_trend" +#' @param batch Either empty, a single value, or a vector corresponding to the +#' number of samples +#' @param k Parameter k for RUVSeq methods, check RUVSeq tutorial +#' @param spikes List of genes to use as spike controls +#' @param num_cores Number of cores for parallel processing +#' @importFrom limma makeContrasts eBayes contrasts.fit topTable +#' @importFrom tibble rownames_to_column +#' @import DESeq2 +#' @import RUVSeq +#' @importFrom stats model.matrix +#' @importFrom dplyr rename select +#' +#' @returns Data frame of DE counts +#' @export +#' +#' @examples +#' data(mini_mac) +#' treatment_samples="Staurosporine_0.1" +#' control_samples<-"DMSO_0" +#' top_table <- compute_single_de(mini_mac, treatment_samples, control_samples, method = "limma_voom") + +compute_single_de <- function(data = NULL, + treatment_samples = NULL, + control_samples = NULL, + method = NULL, + batch = 1, + k = 2, + spikes = NULL, + num_cores = 1) { + if (!requireNamespace("SummarizedExperiment", quietly = TRUE)) { + stop( + "compute_single_de(): the following package is required but not installed: SummarizedExperiment", + "\nPlease install via `install.packages()`.") + } + # Helper function to validate input data + validate_inputs <- function(data, method, treatment_samples, control_samples) { + if (!inherits(data, "Seurat")) { + stop("argument 'data' must be a Seurat or TidySeurat object.") + } + method <- if (is.null(method)) "limma_voom" else method + if (!method %in% c("Seurat_wilcox", "DESeq2", "edgeR", + "RUVg", "RUVs", "RUVr", + "limma_voom", "limma_trend","edgeR_zinb", "limma_zinb")) { + stop("Your normalization method is not available.") + } + if (is.null(treatment_samples) || is.null(control_samples)) { + stop("Missing the vectors of treatment and control samples.") + } + if (!"combined_id" %in% colnames(data@meta.data)) { + data <- data %>% + mutate(combined_id = apply(select(starts_with("Treatment_") | starts_with("Concentration_")), + 1, paste, collapse = "_")) %>% + mutate(combined_id = gsub(" ", "", .data$combined_id)) + } + if (length(treatment_samples) == 1 && length(control_samples) == 1) { + treatment_samples_list <- grepl(treatment_samples, data$combined_id) + control_samples_list <- grepl(control_samples, data$combined_id) + if (any(sum(treatment_samples_list) == 0, sum(control_samples_list) == 0)) { + stop("Your treatment and control samples are not in your combined_id column.") + } + } + } + + # Helper: Prepare data and pheno_data + prepare_data <- function(data, treatment_samples, control_samples, batch) { + data <- data[, grepl(paste0(treatment_samples, "|", control_samples), data$combined_id)] + if (length(unique(data$combined_id)) < 2) { + stop("Insufficient factors for differential expression analysis.") + } + pheno_data <- data.frame(batch = as.factor(batch), condition = as.factor(data$combined_id)) + return(list(data = data, pheno_data = pheno_data)) + } + + de_limma_voom <- function(data, pheno_data, treatment_samples, control_samples) { + combined_id <- data$combined_id + model_matrix <- if (length(batch) == 1) model.matrix(~0 + combined_id) else + model.matrix(~0 + combined_id + batch) + dge <- DGEList(counts = data@assays$RNA$counts, + samples = pheno_data$condition, + group = pheno_data$condition) + dge <- estimateDisp(dge, model_matrix) + dge <- calcNormFactors(dge, method = "TMMwsp") + fit <- voomLmFit(dge, model_matrix) + myargs <- list(paste0("combined_id", + treatment_samples, "-", + paste0("combined_id", control_samples)), + levels = model_matrix) + contrasts <- do.call(makeContrasts, myargs) + tmp <- contrasts.fit(fit, contrasts) + tmp <- eBayes(tmp, robust = TRUE) + top_table <- topTable(tmp, number = Inf, sort.by = "P") %>% + select("logFC", "t", "P.Value", "adj.P.Val") %>% + rename("log2FC" = "logFC", "metric" = "t", "p_value" = "P.Value", "p_value_adj" = "adj.P.Val") %>% + rownames_to_column("gene") + return(as.data.frame(top_table)) + } + + de_limma_trend <- function(data, pheno_data, treatment_samples, control_samples) { + combined_id <- data$combined_id + model_matrix <- if (length(batch) == 1) model.matrix(~0 + combined_id) else + model.matrix(~0 + combined_id + batch) + dge <- DGEList(counts = data@assays$RNA$counts, + samples = pheno_data$condition, + group = pheno_data$condition) + dge <- estimateDisp(dge, model_matrix) + dge <- calcNormFactors(dge, method = "TMMwsp") + logCPM <- log2(edgeR::cpm(dge, normalized.lib.sizes=T,log=F)+1) + fit <- lmFit(logCPM, model_matrix) + myargs <- list(paste0("combined_id", + treatment_samples, "-", + paste0("combined_id", control_samples)), + levels = model_matrix) + contrasts <- do.call(makeContrasts, myargs) + tmp <- contrasts.fit(fit, contrasts) + tmp <- eBayes(tmp, trend = TRUE) + top_table <- topTable(tmp, number = Inf, sort.by = "P") %>% + select("logFC", "t", "P.Value", "adj.P.Val") %>% + rename("log2FC" = "logFC", "metric" = "t", "p_value" = "P.Value", "p_value_adj" = "adj.P.Val") %>% + rownames_to_column("gene") + return(as.data.frame(top_table)) + } + + de_edger <- function(data, pheno_data, treatment_samples, control_samples) { + combined_id <- data$combined_id + model_matrix <- if (length(batch) == 1) model.matrix(~0 + combined_id) else + model.matrix(~0 + combined_id + batch) + dge <- DGEList(counts = data@assays$RNA$counts, + samples = pheno_data$condition, + group = pheno_data$condition) + dge <- calcNormFactors(dge, method = "TMMwsp") + dge <- estimateDisp(dge, model_matrix) + fit <- glmQLFit(dge, model_matrix) + myargs <- list(paste0("combined_id", + treatment_samples, "-", + paste0("combined_id", control_samples)), + levels = model_matrix) + contrasts <- do.call(makeContrasts, myargs) + qlf <- glmQLFTest(fit, contrast = contrasts) + top_table <- topTags(qlf, n = nrow(data)) %>% + as.data.frame() %>% + select("logFC", "F", "PValue", "FDR") %>% + rename("log2FC" = "logFC", "metric" = "F", "p_value" = "PValue", "p_value_adj" = "FDR") %>% + rownames_to_column("gene") + return(as.data.frame(top_table)) + } + + #DEseq produces NA adjusted p-values if + de_deseq2 <- function(data, pheno_data, treatment_samples, control_samples) { + combined_id <- data$combined_id + dds <- DESeqDataSetFromMatrix(countData = data@assays$RNA$counts, + colData = pheno_data, + design = ~ condition) + dds <- DESeq(dds) + res <- results(dds, contrast = c("condition", treatment_samples, control_samples)) + top_table <- as.data.frame(res) %>% + select("log2FoldChange", "stat", "pvalue", "padj") %>% + rename("log2FC" = "log2FoldChange", "metric" = "stat", "p_value" = "pvalue", "p_value_adj" = "padj") %>% + rownames_to_column("gene") + return(as.data.frame(top_table)) + } + + de_seurat <- function(data, pheno_data, treatment_samples, control_samples) { + combined_id <- data$combined_id + data <- NormalizeData(data) + Idents(data) <- combined_id + data$batch <- batch + top_table <- FindMarkers(data, + ident.1 = treatment_samples, + ident.2 = control_samples, + latent.vars = "batch", + test.use = "wilcox") %>% + select("avg_log2FC", "p_val", "p_val_adj") %>% + rename("log2FC" = "avg_log2FC", "p_value" = "p_val", "p_value_adj" = "p_val_adj") %>% + mutate(metric = 1) %>% + rownames_to_column("gene") + + return(as.data.frame(top_table)) + } + + de_ruvg <- function(data, pheno_data, treatment_samples, control_samples, batch, spikes, k) { + combined_id <- data$combined_id + model_matrix <- if (length(batch) == 1) model.matrix(~0 + combined_id) else + model.matrix(~0 + combined_id + batch) + if (length(spikes) == 0) { + warning("List of control genes not provided for RUVg, using default human housekeeping genes.") + spikes <- c( + "ACTB", + "GAPDH", + "RPLP0", + "B2M", + "HPRT1", + "PGK1", + "TBP", + "UBC", + "YWHAZ", + "PPIA", + "RPL19", + "EEF1A1", + "RPS18", + "TFRC" + ) + } + if (!all(spikes %in% row.names(data@assays$RNA$counts))) { + warning("Some or all of your control genes are not present in the dataset.") + spikes <- spikes[spikes %in% row.names(data@assays$RNA$counts)] + } + #k defines number of sources of variation, two have been chosen for row and column + set <- EDASeq::newSeqExpressionSet(counts = as.matrix(data@assays$RNA$counts), + phenoData = pheno_data) + set <- RUVg(set, cIdx = spikes, k = k) + + dge <- DGEList(counts = EDASeq::normCounts(set), + samples = pheno_data$condition, + group = pheno_data$condition) + dge <- calcNormFactors(dge, method = "upperquartile") + dge <- estimateGLMCommonDisp(dge, model_matrix) + dge <- estimateGLMTagwiseDisp(dge, model_matrix) + fit <- glmFit(dge, model_matrix) + myargs <- list(paste0("combined_id", + treatment_samples, "-", + paste0("combined_id", control_samples)), + levels = model_matrix) + contrasts <- do.call(makeContrasts, myargs) + lrt <- glmLRT(fit, contrast = contrasts) + top_table <- topTags(lrt, n = nrow(data)) %>% + as.data.frame() %>% + select("logFC", "PValue", "FDR") %>% + rename("log2FC" = "logFC", "p_value" = "PValue", "p_value_adj" = "FDR") %>% + mutate(metric = 1) %>% + rownames_to_column("gene") + return(as.data.frame(top_table)) + } + + de_ruvs <- function(data, pheno_data, treatment_samples, control_samples, batch, k) { + combined_id <- data$combined_id + genes <- row.names(data@assays$RNA$counts) + model_matrix <- if (length(batch) == 1) model.matrix(~0 + combined_id) else + model.matrix(~0 + combined_id + batch) + + #k defines number of sources of variation, two have been chosen for row and column + set <- EDASeq::newSeqExpressionSet(counts = as.matrix(data@assays$RNA$counts), + phenoData = pheno_data) + differences <- makeGroups(combined_id) + set <- RUVs(set, cIdx = genes, k = k, scIdx = differences) + + dge <- DGEList(counts = EDASeq::normCounts(set), + samples = pheno_data$condition, + group = pheno_data$condition) + dge <- calcNormFactors(dge, method = "upperquartile") + dge <- estimateGLMCommonDisp(dge, model_matrix) + dge <- estimateGLMTagwiseDisp(dge, model_matrix) + fit <- glmFit(dge, model_matrix) + myargs <- list(paste0("combined_id", + treatment_samples, "-", + paste0("combined_id", control_samples)), + levels = model_matrix) + contrasts <- do.call(makeContrasts, myargs) + lrt <- glmLRT(fit, contrast = contrasts) + top_table <- topTags(lrt, n = nrow(data)) %>% + as.data.frame() %>% + select("logFC", "PValue", "FDR") %>% + rename("log2FC" = "logFC", "p_value" = "PValue", "p_value_adj" = "FDR") %>% + mutate(metric = 1) %>% + rownames_to_column("gene") + return(as.data.frame(top_table)) + } + + de_ruvr <- function(data, pheno_data, treatment_samples, control_samples, batch, k) { + if (ncol(data) > 100) { + message("EdgeR with over 100 samples takes very long time. Consider reducing the number of samples.") + } + combined_id <- data$combined_id + genes <- row.names(data@assays$RNA$counts) + model_matrix <- if (length(batch) == 1) model.matrix(~0 + combined_id) else + model.matrix(~0 + combined_id + batch) + + #k defines number of sources of variation, two have been chosen for row and column + set <- EDASeq::newSeqExpressionSet(counts = as.matrix(data@assays$RNA$counts), + phenoData = pheno_data) + dge <- DGEList(counts = data@assays$RNA$counts, + samples = pheno_data$condition, + group = pheno_data$condition) + dge <- calcNormFactors(dge, method = "TMMwsp") + dge <- estimateGLMCommonDisp(dge, model_matrix) + dge <- estimateGLMTagwiseDisp(dge, model_matrix) + fit <- glmFit(dge, model_matrix) + res <- residuals(fit, type = "deviance") + set <- RUVr(set, genes, k = k, res) + + dge <- DGEList(counts = EDASeq::normCounts(set), + samples = pheno_data$condition, + group = pheno_data$condition) + dge <- calcNormFactors(dge, method = "upperquartile") + dge <- estimateGLMCommonDisp(dge, model_matrix) + dge <- estimateGLMTagwiseDisp(dge, model_matrix) + fit <- glmFit(dge, model_matrix) + myargs <- list(paste0("combined_id", + treatment_samples, "-", + paste0("combined_id", control_samples)), + levels = model_matrix) + contrasts <- do.call(makeContrasts, myargs) + lrt <- glmLRT(fit, contrast = contrasts) + top_table <- topTags(lrt, n = nrow(data)) %>% + as.data.frame() %>% + select("logFC", "PValue", "FDR") %>% + rename("log2FC" = "logFC", "p_value" = "PValue", "p_value_adj" = "FDR") %>% + mutate(metric = 1) %>% + rownames_to_column("gene") + return(as.data.frame(top_table)) + } + + de_edgeR_zinb <- function(data, pheno_data, treatment_samples, control_samples, batch, k) { + + data_sce<-as.SingleCellExperiment(data) + filtered_sce <- data_sce[rowSums(counts(data_sce)) > 0, ] + cl <- makeCluster(num_cores) + doParallel::registerDoParallel(cl) + p <- BiocParallel::DoparParam() + system.time(zinb <- zinbwave::zinbwave(filtered_sce, K = 2, + epsilon=12000, + BPPARAM = p, + observationalWeights = TRUE)) + + weights <- SummarizedExperiment::assay(zinb, "weights") + + #perform_edgeR + combined_id <- data$combined_id + model_matrix <- if (length(batch) == 1) model.matrix(~0 + combined_id) else + model.matrix(~0 + combined_id + batch) + + dge <- DGEList(SummarizedExperiment::assay(zinb)) + dge <- calcNormFactors(dge, method = "TMMwsp") + + dge$weights <- weights + combined_id <- data$combined_id + design <- if (length(batch) == 1) model.matrix(~0 + combined_id) else + model.matrix(~0 + combined_id + batch) + + dge <- estimateDisp(dge, design) + fit <- glmQLFit(dge, design) + myargs <- list(paste0("combined_id", + treatment_samples, "-", + paste0("combined_id", control_samples)), + levels = model_matrix) + contrasts <- do.call(makeContrasts, myargs) + qlf <- glmQLFTest(fit, contrast = contrasts) + + top_table <- topTags(qlf, n = nrow(data)) %>% + as.data.frame() %>% + select("logFC", "F", "PValue", "FDR") %>% + rename("log2FC" = "logFC", "metric" = "F", "p_value" = "PValue", "p_value_adj" = "FDR") %>% + rownames_to_column("gene") + stopCluster(cl) + doParallel::registerDoParallel() + return(as.data.frame(top_table)) + } + de_limma_zinb <- function(data, pheno_data, treatment_samples, control_samples, batch, k) { + combined_id <- data$combined_id + model_matrix <- if (length(batch) == 1) model.matrix(~0 + combined_id) else + model.matrix(~0 + combined_id + batch) + + data_sce <- as.SingleCellExperiment(data) + filtered_sce <- subset(data_sce, rowSums(as.data.frame(counts(data_sce))) > 0) + cl <- makeCluster(num_cores) + doParallel::registerDoParallel(cl) + p <- BiocParallel::DoparParam() + suppressWarnings(zinb <- zinbwave::zinbwave(filtered_sce, K = k, + epsilon=12000, + BPPARAM = p, + observationalWeights = TRUE, verbose = F)) + counts <- zinb@assays@data$counts + weights <- zinb@assays@data$weights + + dge <- DGEList(counts = counts(filtered_sce), samples = pheno_data$condition, group = pheno_data$condition) + dge <- calcNormFactors(dge, method = "TMMwsp") + v <- voom(dge, design = model_matrix, plot = FALSE, weights = weights) + fit <- lmFit(v, model_matrix) + myargs <- list(paste0("combined_id", + treatment_samples, "-", + paste0("combined_id", control_samples)), + levels = model_matrix) + contrasts <- do.call(makeContrasts, myargs) + tmp <- contrasts.fit(fit, contrasts) + tmp <- eBayes(tmp, robust = TRUE) + top_table <- topTable(tmp, number = Inf, sort.by = "P") %>% + select("logFC", "t", "P.Value", "adj.P.Val") %>% + rename("log2FC" = "logFC", "metric" = "t", "p_value" = "P.Value", "p_value_adj" = "adj.P.Val") %>% + rownames_to_column("gene") + stopCluster(cl) + doParallel::registerDoParallel() + return(as.data.frame(top_table)) + } + # Main function + validate_inputs(data, method, treatment_samples, control_samples) + prepared <- prepare_data(data, treatment_samples, control_samples, batch) + data <- prepared$data + pheno_data <- prepared$pheno_data + + # Select the appropriate normalization method + de_data <- switch( + method, + limma_voom = de_limma_voom(data, pheno_data, treatment_samples, control_samples), + limma_trend = de_limma_trend(data, pheno_data, treatment_samples, control_samples), + edgeR = de_edger(data, pheno_data, treatment_samples, control_samples), + DESeq2 = de_deseq2(data, pheno_data, treatment_samples, control_samples), + Seurat_wilcox = de_seurat(data, pheno_data, treatment_samples, control_samples), + RUVg = de_ruvg(data, pheno_data, treatment_samples, control_samples, batch, spikes, k), + RUVs = de_ruvs(data, pheno_data, treatment_samples, control_samples, batch, k), + RUVr = de_ruvr(data, pheno_data, treatment_samples, control_samples, batch, k), + edgeR_zinb = de_edgeR_zinb(data, pheno_data, treatment_samples, control_samples, batch, k), + limma_zinb = de_limma_zinb(data, pheno_data, treatment_samples, control_samples, batch, k), + stop("Unsupported DE method.") + ) + return(de_data) +}