diff --git a/NAMESPACE b/NAMESPACE index 131faf9..e0b53ce 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -35,6 +35,7 @@ export(plot_qc_metrics_heatmap) export(plot_rle) export(plot_volcano) export(read_metadata) +export(select_robust_controls) export(subsample_genes) export(summarise_de) export(validate_metadata) @@ -66,6 +67,9 @@ importFrom(dplyr,reframe) importFrom(dplyr,rename) importFrom(dplyr,select) importFrom(dplyr,where) +importFrom(edgeR,DGEList) +importFrom(edgeR,calcNormFactors) +importFrom(edgeR,cpm) importFrom(ggiraph,geom_point_interactive) importFrom(ggiraph,girafe) importFrom(ggplot2,aes) @@ -124,6 +128,7 @@ importFrom(stringr,str_trim) importFrom(tibble,column_to_rownames) importFrom(tibble,rownames_to_column) importFrom(tidyr,drop_na) +importFrom(tidyr,pivot_longer) importFrom(tidyr,pivot_wider) importFrom(tidyr,unnest) importFrom(utils,capture.output) diff --git a/R/check_zeroinflation.R b/R/check_zeroinflation.R index 84ab865..d2f4af5 100644 --- a/R/check_zeroinflation.R +++ b/R/check_zeroinflation.R @@ -157,9 +157,9 @@ check_zeroinflation <- function(data = NULL, group = unique(df$group), n_genes = nrow(df), n_wells = sum(combined_id == unique(df$group)), - median_p0_obs = median(df$p0_obs), - median_p0_nb = median(df$p0_nb), - median_ZI = median(df$ZI), + mean_p0_obs = mean(df$p0_obs), + mean_p0_nb = mean(df$p0_nb), + mean_ZI = mean(df$ZI), observed_zeros_num = sum(df$obs_zeros_num), expected_zeros_num = sum(df$expected_zeros_num) ) diff --git a/R/compute_multi_de.R b/R/compute_multi_de.R index e98ddef..0f01061 100644 --- a/R/compute_multi_de.R +++ b/R/compute_multi_de.R @@ -58,7 +58,7 @@ compute_multi_de <- function(data = NULL, method <- if (is.null(method)) "edgeR" else method if (!method %in% c("Seurat_wilcox", "DESeq2", "edgeR", "RUVg", "RUVs", "RUVr", - "limma_voom")) { + "limma_voom","limma_trend")) { stop("Your normalization method is not available.") } if (is.null(control_samples)) { diff --git a/R/compute_normalised_counts.R b/R/compute_normalised_counts.R index 238f12b..813a045 100644 --- a/R/compute_normalised_counts.R +++ b/R/compute_normalised_counts.R @@ -5,7 +5,8 @@ #' @param data A tidyseurat object merged with metadata. Must contain columns #' "Well_ID", "Row", "Column" #' @param method One of "raw", "logNorm", "cpm", "clr", "SCT", "DESeq2", -#' "edgeR", "RUVg", "RUVs", "RUVr", "limma_voom", "limma_trend", "zinb" +#' "edgeR", "RUVg", "RUVs", "RUVr", "limma_voom", "limma_trend", +#' "limma_voom_zinb", "edgeR_zinb" #' @param batch Either empty, a single value, or a vector corresponding to the #' number of samples #' @param k Parameter k for RUVSeq and zinb methods @@ -53,7 +54,8 @@ compute_normalised_counts <- function(data = NULL, "cpm", "clr", "SCT", "DESeq2", "edgeR", "RUVg", "RUVs", "RUVr", - "limma_voom", "limma_trend", "zinb")) { + "limma_voom", "limma_trend", "limma_voom_zinb", + "edgeR_zinb")) { stop("Your normalization method is not available.") } batch <- if (is.null(batch)) "1" else as.character(batch) @@ -96,24 +98,24 @@ compute_normalised_counts <- function(data = NULL, normalize_lognorm <- function(data) { lognorm <- NormalizeData(data, normalization.method = "LogNormalize", scale.factor = 10000) - lognorm@assays$RNA$data + return(lognorm@assays$RNA$data) } normalize_cpm <- function(data) { cpm <- NormalizeData(data, normalization.method = "RC", scale.factor = 1e6) - cpm@assays$RNA$data + return(cpm@assays$RNA$data) } normalize_clr <- function(data) { clr <- NormalizeData(data, normalization.method = "CLR", margin = 2) - clr@assays$RNA$data + return(clr@assays$RNA$data) } normalize_sct <- function(data, batch) { sct <- SCTransform(data, do.scale = TRUE, return.only.var.genes = FALSE, vars.to.regress = if (length(batch) == 1) NULL else batch, verbose = FALSE) - sct@assays$SCT$data + return(sct@assays$SCT$data) } normalize_deseq2 <- function(data, batch) { @@ -121,7 +123,7 @@ compute_normalised_counts <- function(data = NULL, design <- model_matrix dds <- DESeqDataSetFromMatrix(countData = data@assays$RNA$counts, colData = coldata, design = design) dds <- estimateSizeFactors(dds) - counts(dds, normalized = TRUE) + return(counts(dds, normalized = TRUE)) } normalize_edger <- function(data, batch) { @@ -135,7 +137,7 @@ compute_normalised_counts <- function(data = NULL, dge <- calcNormFactors(dge, methods = "TMM") design <- model_matrix dge <- estimateDisp(dge, design, BPPARAM = p) - cpm(dge, log = FALSE) + return(cpm(dge, log = FALSE)) } normalize_limma_voom <- function(data, batch) { @@ -143,7 +145,7 @@ compute_normalised_counts <- function(data = NULL, dge <- calcNormFactors(dge, methods = "TMMwsp") design <- model_matrix dge <- voom(dge, design) - dge$E + return(dge$E) } normalize_limma_trend <- function(data, batch) { @@ -154,7 +156,7 @@ compute_normalised_counts <- function(data = NULL, fit <- lmFit(logCPM, design) fit <- eBayes(fit, trend=TRUE) dge <- fit - logCPM + return(logCPM) } normalize_ruvg <- function(data, batch, spikes, k) { @@ -180,7 +182,7 @@ compute_normalised_counts <- function(data = NULL, } if (!all(spikes %in% row.names(data@assays$RNA$counts))) { warning("Some or all of your control genes are not present in the dataset.") - spikes <- intersect(spikes, rownames(counts(set))) + spikes <- intersect(spikes, rownames(counts)) } #k defines number of sources of variation, two have been chosen for row and column set <- EDASeq::newSeqExpressionSet(counts = as.matrix(counts), @@ -236,7 +238,7 @@ compute_normalised_counts <- function(data = NULL, EDASeq::normCounts(set) } - normalize_zinb <- function(data, batch) { + normalize_limmavoom_zinb <- function(data, batch) { message("Please allow extra time for zinb mode.") if (ncol(data) > 50) { @@ -247,34 +249,54 @@ compute_normalised_counts <- function(data = NULL, cl <- makeCluster(num_cores) doParallel::registerDoParallel(cl) p <- BiocParallel::DoparParam() - system.time(zinb <- zinbwave::zinbwave(filtered_sce, K = k, + suppressWarnings(zinb <- zinbwave::zinbwave(filtered_sce, K = k, epsilon=12000, BPPARAM = p, - observationalWeights = TRUE)) + observationalWeights = TRUE, verbose = F)) counts <- zinb@assays@data$counts weights <- zinb@assays@data$weights - - # approximate denoised counts (downweighting dropouts) - #dge <- DGEList(counts = data@assays$RNA$counts, samples = coldata$condition, group = coldata$condition) - #dge <- calcNormFactors(dge, methods = "TMM") - #design <- model_matrix - #dge <- estimateDisp(dge, design, BPPARAM = p) - #dge$weights <- assay(zinb, "weights") - #fit <- glmQLFit(dge, design, BPPARAM = p) - #norm_counts <- fitted(fit) - + dge <- DGEList(counts = counts(filtered_sce), samples = coldata$condition, group = coldata$condition) dge <- calcNormFactors(dge, methods = "TMMwsp") design <- model_matrix dge <- voom(dge, design, weights = weights) normalised_values <- dge$E - - #normalised_values <- zinb@assays@data$normalizedValues stopCluster(cl) doParallel::registerDoParallel() return(normalised_values) } + normalize_edger_zinb <- function(data, batch) { + + message("Please allow extra time for zinb mode.") + if (ncol(data) > 50) { + message("zinb with over 50 samples takes a long time. Consider reducing the number of samples or genes.") + } + data_sce <- as.SingleCellExperiment(data) + filtered_sce <- subset(data_sce, rowSums(as.data.frame(counts(data_sce))) > 0) + cl <- makeCluster(num_cores) + doParallel::registerDoParallel(cl) + p <- BiocParallel::DoparParam() + suppressMessages(zinb <- zinbwave::zinbwave(filtered_sce, K = k, + epsilon=12000, + BPPARAM = p, + observationalWeights = TRUE)) + counts <- zinb@assays@data$counts + weights <- zinb@assays@data$weights + + # approximate denoised counts (downweighting dropouts) + dge <- DGEList(counts = counts, samples = coldata$condition, group = coldata$condition) + dge <- calcNormFactors(dge, methods = "TMM") + design <- model_matrix + dge <- estimateDisp(dge, design, BPPARAM = p) + dge$weights <- weights + fit <- glmQLFit(dge, design, BPPARAM = p) + normalised_values <- fitted(fit) + stopCluster(cl) + doParallel::registerDoParallel() + return(normalised_values) + } + # Main function logic validated <- validate_inputs(data, method, batch, k, max_counts, num_cores) data <- validated$data @@ -297,7 +319,8 @@ compute_normalised_counts <- function(data = NULL, RUVg = normalize_ruvg(data, batch, spikes, k), RUVs = normalize_ruvs(data, batch, k), RUVr = normalize_ruvr(data, batch, k), - zinb = normalize_zinb(data, batch), + limma_voom_zinb = normalize_limmavoom_zinb(data, batch), + edgeR_zinb = normalize_edger_zinb(data, batch), stop("Unsupported normalization method.") ) return(norm_data) diff --git a/R/compute_single_de.R b/R/compute_single_de.R index ea1f600..2a3af1a 100644 --- a/R/compute_single_de.R +++ b/R/compute_single_de.R @@ -12,6 +12,7 @@ utils::globalVariables(c("model_matrix")) #' number of samples #' @param k Parameter k for RUVSeq methods, check RUVSeq tutorial #' @param spikes List of genes to use as spike controls +#' @param num_cores Number of cores for parallel processing #' @importFrom limma makeContrasts eBayes contrasts.fit topTable #' @importFrom tibble rownames_to_column #' @import DESeq2 @@ -34,7 +35,8 @@ compute_single_de <- function(data = NULL, method = NULL, batch = 1, k = 2, - spikes = NULL) { + spikes = NULL, + num_cores = 1) { if (!requireNamespace("SummarizedExperiment", quietly = TRUE)) { stop( "compute_single_de(): the following package is required but not installed: SummarizedExperiment", @@ -48,7 +50,7 @@ compute_single_de <- function(data = NULL, method <- if (is.null(method)) "limma_voom" else method if (!method %in% c("Seurat_wilcox", "DESeq2", "edgeR", "RUVg", "RUVs", "RUVr", - "limma_voom", "limma_trend")) { + "limma_voom", "limma_trend","edgeR_zinb", "limma_zinb")) { stop("Your normalization method is not available.") } if (is.null(treatment_samples) || is.null(control_samples)) { @@ -157,7 +159,8 @@ compute_single_de <- function(data = NULL, combined_id <- data$combined_id dds <- DESeqDataSetFromMatrix(countData = data@assays$RNA$counts, colData = pheno_data, - design = ~ condition) + # design = ~ condition) + design = if (length(batch) == 1) ~ condition else ~ condition + batch) dds <- DESeq(dds) res <- results(dds, contrast = c("condition", treatment_samples, control_samples)) top_table <- as.data.frame(res) %>% @@ -178,7 +181,7 @@ compute_single_de <- function(data = NULL, latent.vars = "batch", test.use = "wilcox") %>% select("avg_log2FC", "p_val", "p_val_adj") %>% - rename("log2FC" = "avg_log2FC", "p_val" = "p_val", "p_value_adj" = "p_val_adj") %>% + rename("log2FC" = "avg_log2FC", "p_value" = "p_val", "p_value_adj" = "p_val_adj") %>% mutate(metric = 1) %>% rownames_to_column("gene") @@ -190,7 +193,7 @@ compute_single_de <- function(data = NULL, model_matrix <- if (length(batch) == 1) model.matrix(~0 + combined_id) else model.matrix(~0 + combined_id + batch) if (length(spikes) == 0) { - warning("List of control genes not provided for RUVg, using default.") + warning("List of control genes not provided for RUVg, using default human housekeeping genes.") spikes <- c( "ACTB", "GAPDH", @@ -316,20 +319,25 @@ compute_single_de <- function(data = NULL, return(as.data.frame(top_table)) } - de_zinb <- function(data, pheno_data, treatment_samples, control_samples, batch, k) { + de_edgeR_zinb <- function(data, pheno_data, treatment_samples, control_samples, batch, k) { data_sce<-as.SingleCellExperiment(data) - filtered_sce <- data_sce[rowSums(counts(data_sce)) > 50, ] - num_cores <- 8 # Change this based on your system + filtered_sce <- data_sce[rowSums(counts(data_sce)) > 0, ] cl <- makeCluster(num_cores) doParallel::registerDoParallel(cl) p <- BiocParallel::DoparParam() system.time(zinb <- zinbwave::zinbwave(filtered_sce, K = 2, - epsilon=1000, + epsilon=12000, BPPARAM = p, observationalWeights = TRUE)) weights <- SummarizedExperiment::assay(zinb, "weights") + + #perform_edgeR + combined_id <- data$combined_id + model_matrix <- if (length(batch) == 1) model.matrix(~0 + combined_id) else + model.matrix(~0 + combined_id + batch) + dge <- DGEList(SummarizedExperiment::assay(zinb)) dge <- calcNormFactors(dge, method = "TMMwsp") @@ -375,7 +383,8 @@ compute_single_de <- function(data = NULL, RUVg = de_ruvg(data, pheno_data, treatment_samples, control_samples, batch, spikes, k), RUVs = de_ruvs(data, pheno_data, treatment_samples, control_samples, batch, k), RUVr = de_ruvr(data, pheno_data, treatment_samples, control_samples, batch, k), - zinb = de_zinb(data, pheno_data, treatment_samples, control_samples, batch, k), + edgeR_zinb = de_edgeR_zinb(data, pheno_data, treatment_samples, control_samples, batch, k), + limma_zinb = limma_zinb(data, pheno_data, treatment_samples, control_samples, batch, k), stop("Unsupported DE method.") ) return(de_data) diff --git a/R/plot_rle.R b/R/plot_rle.R index 301a438..3395532 100644 --- a/R/plot_rle.R +++ b/R/plot_rle.R @@ -15,7 +15,8 @@ #' @param batch Either empty, a single value, or a vector corresponding to the #' number of samples. #' @param normalisation One of "raw", "logNorm", "cpm", "clr", "SCT", "DESeq2", -#' "edgeR", "RUVg", "RUVs", "RUVr", "limma_voom", "limma_trend", "zinb". If empty, defaults to raw. +#' "edgeR", "RUVg", "RUVs", "RUVr", "limma_voom", "limma_trend", "limma_voom_zinb", +#' "edgeR_zinb". If empty, defaults to raw. #' @param spikes List of genes to use as spike controls in RUVg #' @param num_cores Number of cores for edgeR and zinb calculations @@ -205,7 +206,8 @@ plot_rle <- function(data, barcodes = NULL, RUVg = compute_normalised_counts(data, method = "RUVg", batch = batch, spikes = spikes, num_cores = num_cores), RUVs = compute_normalised_counts(data, method = "RUVs", batch = batch, num_cores = num_cores), RUVr = compute_normalised_counts(data, method = "RUVr", batch = batch, num_cores = num_cores), - zinb = compute_normalised_counts(data, method = "zinb", batch = batch, num_cores = num_cores), + limma_voom_zinb = compute_normalised_counts(data, method = "limma_voom_zinb", batch = batch, num_cores = num_cores), + edgeR_zinb = compute_normalised_counts(data, method = "edgeR_zinb", batch = batch, num_cores = num_cores), stop("Unsupported normalization method.") ) diff --git a/R/select_robust_controls.R b/R/select_robust_controls.R new file mode 100644 index 0000000..1a8d3eb --- /dev/null +++ b/R/select_robust_controls.R @@ -0,0 +1,177 @@ +#' Select high-quality control replicates via TMMwsp log-CPM correlation +#' +#' @description +#' For a given control group (e.g., DMSO) on a specific plate/batch, this function +#' ranks samples by their average correlation (Fisher z-averaged) to all *other* +#' samples using edgeR's TMMwsp-normalized log2-CPM. It returns the ranking and (optionally) +#' plots per-sample expression distributions and sample-sample correlation heatmaps. +#' +#' @param data A tidyseurat object containing an RNA assay with a **counts** layer. +#' @param samples the control/treatment label to keep in column samples +#' (e.g., "CB_43_EP73_0"). Only cells/samples with this label are considered. +#' @param orig_ident Character scalar: the plate/batch identifier to keep +#' (e.g., "VH02012942"). Only cells/samples from this batch are considered. +#' @param cpm_filter Numeric scalar; CPM threshold used for gene filtering prior to +#' normalization (default 1). +#' @param min_samps Integer; a gene must be expressed (CPM > cpm_filter) in at least +#' this many samples to be retained (default 16). +#' @param corr_method Correlation type used for ranking; one of +#' c("spearman","pearson") (default "spearman"). +#' @param top_n Integer; the number of top-ranked samples to report in topN. +#' Ties at the cutoff are kept (default 5). +#' @param make_plots Logical; if TRUE, print a log2-CPM boxplot and Pearson/Spearman +#' correlation heatmaps (default TRUE). +#' +#' @details +#' Workflow: +#' 1) Subset to the specified samples and orig_ident (plate/batch). +#' 2) Build an edgeR::DGEList, filter lowly expressed genes using CPM and min_samps. +#' 3) Normalize with TMMwsp and compute log2-CPM. +#' 4) Rank samples by mean Fisher z transformed correlation to all other samples +#' (according to corr_method). +#' 5) Return the ranking, correlation matrices, the normalized matrix, and (optionally) +#' plots for QC. +#' +#' Column names of the counts matrix are rewritten to "_" +#' for easier visual inspection in plots. +#' +#' @return A list with elements: +#' * subset_obj: The Seurat object subset used for analysis. +#' * dge: The filtered edgeR::DGEList +#' * log_cpm_tmm: Matrix of TMMwsp log2-CPM. +#' * boxplot_df: Long-format data frame used for the boxplot (gene, sample, log_cpm). +#' * cor_pearson: Sample-sample Pearson correlation matrix. +#' * cor_spearman: Sample-sample Spearman correlation matrix. +#' * ranking_method: The correlation method used for ranking. +#' * scores_mean_to_others: Named numeric vector of mean Fisher-z back-transformed +#' correlations (higher = better), sorted decreasing. +#' * topN: Named numeric vector of the top-ranked samples (ties at the cutoff kept). + +#' +#' +#' @examples +#' data(mini_mac) +#' res <- select_robust_controls(mini_mac,samples = "DMSO_0", orig_ident = "PMMSq033_mini") +#' +#' +#' @importFrom rlang .data +#' @importFrom edgeR DGEList calcNormFactors cpm +#' @importFrom tibble rownames_to_column +#' @importFrom tidyr pivot_longer +#' @export + +select_robust_controls <- function( + data, + samples, # e.g. "CB_43_EP73_0" + orig_ident, # e.g. "VH02012942" + cpm_filter = 1, # CPM threshold for gene filtering + min_samps = 16, # number of samples a gene must be expressed in + corr_method = c("spearman","pearson"), + top_n = 5, + make_plots = TRUE +){ + validate_inputs <- function(data, samples, orig_ident) { + if (!inherits(data, "Seurat")) { + stop("argument 'data' must be a Seurat or TidySeurat object.") + } + + # check samples and orig_ident columns + if (colnames(data@meta.data)%in% c("combined_id","orig.ident") %>% sum() < 2) { + stop("The 'data' object must contain 'combined_id' and 'orig.ident' columns in its metadata.") + } + # check samples in samples column + if (is.null(samples)){ + stop("Please provide a value for 'samples'.") + } else if (!all(samples %in% unique(data$combined_id))) { + stop("Some values in 'samples' are not found in the 'combined_id' column of 'data'.") + } + # check orig.ident in the orig.ident column + if (is.null(orig_ident)){ + stop("Please provide a value for 'orig_ident'.") + } else if (!orig_ident %in% unique(data$orig.ident)) { + stop("The value of 'orig_ident' is not found in the 'orig.ident' column of 'data'.") + } + return(list(data = data, samples = samples, orig_ident = orig_ident)) + } + validated <- validate_inputs(data = data, samples = samples, orig_ident = orig_ident) + data <- validated$data + group_by <- validated$orig_ident + samples <- validated$samples + + corr_method <- match.arg(corr_method) + sel_cells <- colnames(data)[data$combined_id == samples & + data$orig.ident == orig_ident] + if (length(sel_cells) == 0L) { + stop("No cells/samples match the specified 'combined_id' and 'orig_ident'.") + } + subgroup <- subset(data, cells = sel_cells) + # Counts and human-friendly column names + counts_d <- Seurat::GetAssayData(subgroup, assay = "RNA", layer = "counts") + well_colnames <- paste0(subgroup$orig.ident, "_", subgroup$Well_ID) + names(well_colnames) <- rownames(subgroup@meta.data) + colnames(counts_d) <- well_colnames[colnames(counts_d)] + # edgeR container + gene filtering + y <- edgeR::DGEList(counts_d, group = subgroup$orig.ident) + keep <- rowSums(edgeR::cpm(y) > cpm_filter) >= min_samps + y <- y[keep, , keep.lib.sizes = FALSE] + y <- edgeR::calcNormFactors(y, method = "TMMwsp") + log_cpm_tmm <- edgeR::cpm(y, log = TRUE, normalized.lib.sizes = TRUE) + # Long data for boxplot + df_long <- as.data.frame(log_cpm_tmm) |> + tibble::rownames_to_column(var = "gene") |> + tidyr::pivot_longer(-gene, names_to = "sample", values_to = "log_cpm") + if (make_plots) { + if (!requireNamespace("ggplot2", quietly = TRUE)) { + warning("Package 'ggplot2' not available; skipping boxplot.") + } else { + print( + ggplot2::ggplot(df_long, ggplot2::aes(x = .data$sample, y = .data$log_cpm)) + + ggplot2::geom_boxplot(outlier.size = 0.5) + + ggplot2::labs(x = "Sample", y = "log2 CPM", + title = "Boxplot of log2-CPM (TMMwsp)") + + ggplot2::theme_classic() + + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)) + ) + } + } + # Correlation matrices + cors_pear <- stats::cor(log_cpm_tmm, method = "pearson") + cors_spear <- stats::cor(log_cpm_tmm, method = "spearman") + if (make_plots) { + if (!requireNamespace("pheatmap", quietly = TRUE)) { + warning("Package 'pheatmap' not available; skipping heatmaps.") + } else { + pheatmap::pheatmap(cors_pear, main = "Pearson correlation") + pheatmap::pheatmap(cors_spear, main = "Spearman correlation") + } + } + # Ranking by mean Fisher-z correlation to all *other* samples + R <- stats::cor(log_cpm_tmm, method = corr_method, use = "pairwise.complete.obs") + diag(R) <- NA_real_ + # Clip to (-1,1), Fisher z-transform, average, back-transform + Z <- atanh(pmin(pmax(R, -0.999999), 0.999999)) + score_z <- rowMeans(Z, na.rm = TRUE) + score_r <- tanh(score_z) + # Top-N names and scores (keep ties at the cutoff) + ord <- order(score_r, decreasing = TRUE, na.last = NA) + srt <- score_r[ord] + if (length(srt) == 0L) { + stop("No samples available after filtering; adjust 'cpm_filter'/'min_samps'.") + } + k <- min(top_n, length(srt)) + cutoff <- srt[k] + keepN <- srt >= cutoff + topN <- srt[keepN] + # Return everything useful + list( + subset_obj = subgroup, + dge = y, + log_cpm_tmm = log_cpm_tmm, + boxplot_df = df_long, + cor_pearson = cors_pear, + cor_spearman = cors_spear, + ranking_method = corr_method, + scores_mean_to_others = sort(score_r, decreasing = TRUE), + topN = topN + ) +} diff --git a/R/utils.R b/R/utils.R index 0718920..3036c14 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,3 +1,4 @@ utils::globalVariables(c("diff_expressed", "gene_labels", ".cell", "UMAPde_1", "UMAPde_2", - "padj", "target", "target_id")) \ No newline at end of file + "padj", "target", "target_id", + "log_cpm")) \ No newline at end of file diff --git a/_pkgdown.yaml b/_pkgdown.yaml index 3fc5d00..e9510fd 100644 --- a/_pkgdown.yaml +++ b/_pkgdown.yaml @@ -31,8 +31,8 @@ navbar: href: articles/integration_across_modalities.html - text: "Working with Bioconductor classes" href: articles/macpie_bioc.html - - text: "Chck zero-inflation" - href: articles/check_zero_inflation.html + - text: "Assessing zero-inflation" + href: articles/assessing_zero_inflation.html - text: "Reference" href: reference/index.html # Links to your function reference docs. right: diff --git a/compute_single_de.R b/compute_single_de.R new file mode 100644 index 0000000..f47983c --- /dev/null +++ b/compute_single_de.R @@ -0,0 +1,425 @@ +utils::globalVariables(c("model_matrix")) +#' Retrieve normalised counts of MAC-seq data +#' +#' This function retrieves counts from a number of methods that are available +#' for normalisation, with the default of limma-voomLmFit. +#' @param data A tidyseurat object merged with metadata. Must contain columns +#' "Well_ID", "Row", "Column" +#' @param treatment_samples Value in the column "combined_id" representing replicates of treatment samples in the data +#' @param control_samples Value in the column "combined_id" representing replicates of control samples in the data +#' @param method One of "Seurat_wilcox", "DESeq2", "edgeR", "RUVg", "RUVs", "RUVr", "limma_voom", "limma_trend" +#' @param batch Either empty, a single value, or a vector corresponding to the +#' number of samples +#' @param k Parameter k for RUVSeq methods, check RUVSeq tutorial +#' @param spikes List of genes to use as spike controls +#' @param num_cores Number of cores for parallel processing +#' @importFrom limma makeContrasts eBayes contrasts.fit topTable +#' @importFrom tibble rownames_to_column +#' @import DESeq2 +#' @import RUVSeq +#' @importFrom stats model.matrix +#' @importFrom dplyr rename select +#' +#' @returns Data frame of DE counts +#' @export +#' +#' @examples +#' data(mini_mac) +#' treatment_samples="Staurosporine_0.1" +#' control_samples<-"DMSO_0" +#' top_table <- compute_single_de(mini_mac, treatment_samples, control_samples, method = "limma_voom") + +compute_single_de <- function(data = NULL, + treatment_samples = NULL, + control_samples = NULL, + method = NULL, + batch = 1, + k = 2, + spikes = NULL, + num_cores = 1) { + if (!requireNamespace("SummarizedExperiment", quietly = TRUE)) { + stop( + "compute_single_de(): the following package is required but not installed: SummarizedExperiment", + "\nPlease install via `install.packages()`.") + } + # Helper function to validate input data + validate_inputs <- function(data, method, treatment_samples, control_samples) { + if (!inherits(data, "Seurat")) { + stop("argument 'data' must be a Seurat or TidySeurat object.") + } + method <- if (is.null(method)) "limma_voom" else method + if (!method %in% c("Seurat_wilcox", "DESeq2", "edgeR", + "RUVg", "RUVs", "RUVr", + "limma_voom", "limma_trend","edgeR_zinb", "limma_zinb")) { + stop("Your normalization method is not available.") + } + if (is.null(treatment_samples) || is.null(control_samples)) { + stop("Missing the vectors of treatment and control samples.") + } + if (!"combined_id" %in% colnames(data@meta.data)) { + data <- data %>% + mutate(combined_id = apply(select(starts_with("Treatment_") | starts_with("Concentration_")), + 1, paste, collapse = "_")) %>% + mutate(combined_id = gsub(" ", "", .data$combined_id)) + } + if (length(treatment_samples) == 1 && length(control_samples) == 1) { + treatment_samples_list <- grepl(treatment_samples, data$combined_id) + control_samples_list <- grepl(control_samples, data$combined_id) + if (any(sum(treatment_samples_list) == 0, sum(control_samples_list) == 0)) { + stop("Your treatment and control samples are not in your combined_id column.") + } + } + } + + # Helper: Prepare data and pheno_data + prepare_data <- function(data, treatment_samples, control_samples, batch) { + data <- data[, grepl(paste0(treatment_samples, "|", control_samples), data$combined_id)] + if (length(unique(data$combined_id)) < 2) { + stop("Insufficient factors for differential expression analysis.") + } + pheno_data <- data.frame(batch = as.factor(batch), condition = as.factor(data$combined_id)) + return(list(data = data, pheno_data = pheno_data)) + } + + de_limma_voom <- function(data, pheno_data, treatment_samples, control_samples) { + combined_id <- data$combined_id + model_matrix <- if (length(batch) == 1) model.matrix(~0 + combined_id) else + model.matrix(~0 + combined_id + batch) + dge <- DGEList(counts = data@assays$RNA$counts, + samples = pheno_data$condition, + group = pheno_data$condition) + dge <- estimateDisp(dge, model_matrix) + dge <- calcNormFactors(dge, method = "TMMwsp") + fit <- voomLmFit(dge, model_matrix) + myargs <- list(paste0("combined_id", + treatment_samples, "-", + paste0("combined_id", control_samples)), + levels = model_matrix) + contrasts <- do.call(makeContrasts, myargs) + tmp <- contrasts.fit(fit, contrasts) + tmp <- eBayes(tmp, robust = TRUE) + top_table <- topTable(tmp, number = Inf, sort.by = "P") %>% + select("logFC", "t", "P.Value", "adj.P.Val") %>% + rename("log2FC" = "logFC", "metric" = "t", "p_value" = "P.Value", "p_value_adj" = "adj.P.Val") %>% + rownames_to_column("gene") + return(as.data.frame(top_table)) + } + + de_limma_trend <- function(data, pheno_data, treatment_samples, control_samples) { + combined_id <- data$combined_id + model_matrix <- if (length(batch) == 1) model.matrix(~0 + combined_id) else + model.matrix(~0 + combined_id + batch) + dge <- DGEList(counts = data@assays$RNA$counts, + samples = pheno_data$condition, + group = pheno_data$condition) + dge <- estimateDisp(dge, model_matrix) + dge <- calcNormFactors(dge, method = "TMMwsp") + logCPM <- log2(edgeR::cpm(dge, normalized.lib.sizes=T,log=F)+1) + fit <- lmFit(logCPM, model_matrix) + myargs <- list(paste0("combined_id", + treatment_samples, "-", + paste0("combined_id", control_samples)), + levels = model_matrix) + contrasts <- do.call(makeContrasts, myargs) + tmp <- contrasts.fit(fit, contrasts) + tmp <- eBayes(tmp, trend = TRUE) + top_table <- topTable(tmp, number = Inf, sort.by = "P") %>% + select("logFC", "t", "P.Value", "adj.P.Val") %>% + rename("log2FC" = "logFC", "metric" = "t", "p_value" = "P.Value", "p_value_adj" = "adj.P.Val") %>% + rownames_to_column("gene") + return(as.data.frame(top_table)) + } + + de_edger <- function(data, pheno_data, treatment_samples, control_samples) { + combined_id <- data$combined_id + model_matrix <- if (length(batch) == 1) model.matrix(~0 + combined_id) else + model.matrix(~0 + combined_id + batch) + dge <- DGEList(counts = data@assays$RNA$counts, + samples = pheno_data$condition, + group = pheno_data$condition) + dge <- calcNormFactors(dge, method = "TMMwsp") + dge <- estimateDisp(dge, model_matrix) + fit <- glmQLFit(dge, model_matrix) + myargs <- list(paste0("combined_id", + treatment_samples, "-", + paste0("combined_id", control_samples)), + levels = model_matrix) + contrasts <- do.call(makeContrasts, myargs) + qlf <- glmQLFTest(fit, contrast = contrasts) + top_table <- topTags(qlf, n = nrow(data)) %>% + as.data.frame() %>% + select("logFC", "F", "PValue", "FDR") %>% + rename("log2FC" = "logFC", "metric" = "F", "p_value" = "PValue", "p_value_adj" = "FDR") %>% + rownames_to_column("gene") + return(as.data.frame(top_table)) + } + + #DEseq produces NA adjusted p-values if + de_deseq2 <- function(data, pheno_data, treatment_samples, control_samples) { + combined_id <- data$combined_id + dds <- DESeqDataSetFromMatrix(countData = data@assays$RNA$counts, + colData = pheno_data, + design = ~ condition) + dds <- DESeq(dds) + res <- results(dds, contrast = c("condition", treatment_samples, control_samples)) + top_table <- as.data.frame(res) %>% + select("log2FoldChange", "stat", "pvalue", "padj") %>% + rename("log2FC" = "log2FoldChange", "metric" = "stat", "p_value" = "pvalue", "p_value_adj" = "padj") %>% + rownames_to_column("gene") + return(as.data.frame(top_table)) + } + + de_seurat <- function(data, pheno_data, treatment_samples, control_samples) { + combined_id <- data$combined_id + data <- NormalizeData(data) + Idents(data) <- combined_id + data$batch <- batch + top_table <- FindMarkers(data, + ident.1 = treatment_samples, + ident.2 = control_samples, + latent.vars = "batch", + test.use = "wilcox") %>% + select("avg_log2FC", "p_val", "p_val_adj") %>% + rename("log2FC" = "avg_log2FC", "p_value" = "p_val", "p_value_adj" = "p_val_adj") %>% + mutate(metric = 1) %>% + rownames_to_column("gene") + + return(as.data.frame(top_table)) + } + + de_ruvg <- function(data, pheno_data, treatment_samples, control_samples, batch, spikes, k) { + combined_id <- data$combined_id + model_matrix <- if (length(batch) == 1) model.matrix(~0 + combined_id) else + model.matrix(~0 + combined_id + batch) + if (length(spikes) == 0) { + warning("List of control genes not provided for RUVg, using default human housekeeping genes.") + spikes <- c( + "ACTB", + "GAPDH", + "RPLP0", + "B2M", + "HPRT1", + "PGK1", + "TBP", + "UBC", + "YWHAZ", + "PPIA", + "RPL19", + "EEF1A1", + "RPS18", + "TFRC" + ) + } + if (!all(spikes %in% row.names(data@assays$RNA$counts))) { + warning("Some or all of your control genes are not present in the dataset.") + spikes <- spikes[spikes %in% row.names(data@assays$RNA$counts)] + } + #k defines number of sources of variation, two have been chosen for row and column + set <- EDASeq::newSeqExpressionSet(counts = as.matrix(data@assays$RNA$counts), + phenoData = pheno_data) + set <- RUVg(set, cIdx = spikes, k = k) + + dge <- DGEList(counts = EDASeq::normCounts(set), + samples = pheno_data$condition, + group = pheno_data$condition) + dge <- calcNormFactors(dge, method = "upperquartile") + dge <- estimateGLMCommonDisp(dge, model_matrix) + dge <- estimateGLMTagwiseDisp(dge, model_matrix) + fit <- glmFit(dge, model_matrix) + myargs <- list(paste0("combined_id", + treatment_samples, "-", + paste0("combined_id", control_samples)), + levels = model_matrix) + contrasts <- do.call(makeContrasts, myargs) + lrt <- glmLRT(fit, contrast = contrasts) + top_table <- topTags(lrt, n = nrow(data)) %>% + as.data.frame() %>% + select("logFC", "PValue", "FDR") %>% + rename("log2FC" = "logFC", "p_value" = "PValue", "p_value_adj" = "FDR") %>% + mutate(metric = 1) %>% + rownames_to_column("gene") + return(as.data.frame(top_table)) + } + + de_ruvs <- function(data, pheno_data, treatment_samples, control_samples, batch, k) { + combined_id <- data$combined_id + genes <- row.names(data@assays$RNA$counts) + model_matrix <- if (length(batch) == 1) model.matrix(~0 + combined_id) else + model.matrix(~0 + combined_id + batch) + + #k defines number of sources of variation, two have been chosen for row and column + set <- EDASeq::newSeqExpressionSet(counts = as.matrix(data@assays$RNA$counts), + phenoData = pheno_data) + differences <- makeGroups(combined_id) + set <- RUVs(set, cIdx = genes, k = k, scIdx = differences) + + dge <- DGEList(counts = EDASeq::normCounts(set), + samples = pheno_data$condition, + group = pheno_data$condition) + dge <- calcNormFactors(dge, method = "upperquartile") + dge <- estimateGLMCommonDisp(dge, model_matrix) + dge <- estimateGLMTagwiseDisp(dge, model_matrix) + fit <- glmFit(dge, model_matrix) + myargs <- list(paste0("combined_id", + treatment_samples, "-", + paste0("combined_id", control_samples)), + levels = model_matrix) + contrasts <- do.call(makeContrasts, myargs) + lrt <- glmLRT(fit, contrast = contrasts) + top_table <- topTags(lrt, n = nrow(data)) %>% + as.data.frame() %>% + select("logFC", "PValue", "FDR") %>% + rename("log2FC" = "logFC", "p_value" = "PValue", "p_value_adj" = "FDR") %>% + mutate(metric = 1) %>% + rownames_to_column("gene") + return(as.data.frame(top_table)) + } + + de_ruvr <- function(data, pheno_data, treatment_samples, control_samples, batch, k) { + if (ncol(data) > 100) { + message("EdgeR with over 100 samples takes very long time. Consider reducing the number of samples.") + } + combined_id <- data$combined_id + genes <- row.names(data@assays$RNA$counts) + model_matrix <- if (length(batch) == 1) model.matrix(~0 + combined_id) else + model.matrix(~0 + combined_id + batch) + + #k defines number of sources of variation, two have been chosen for row and column + set <- EDASeq::newSeqExpressionSet(counts = as.matrix(data@assays$RNA$counts), + phenoData = pheno_data) + dge <- DGEList(counts = data@assays$RNA$counts, + samples = pheno_data$condition, + group = pheno_data$condition) + dge <- calcNormFactors(dge, method = "TMMwsp") + dge <- estimateGLMCommonDisp(dge, model_matrix) + dge <- estimateGLMTagwiseDisp(dge, model_matrix) + fit <- glmFit(dge, model_matrix) + res <- residuals(fit, type = "deviance") + set <- RUVr(set, genes, k = k, res) + + dge <- DGEList(counts = EDASeq::normCounts(set), + samples = pheno_data$condition, + group = pheno_data$condition) + dge <- calcNormFactors(dge, method = "upperquartile") + dge <- estimateGLMCommonDisp(dge, model_matrix) + dge <- estimateGLMTagwiseDisp(dge, model_matrix) + fit <- glmFit(dge, model_matrix) + myargs <- list(paste0("combined_id", + treatment_samples, "-", + paste0("combined_id", control_samples)), + levels = model_matrix) + contrasts <- do.call(makeContrasts, myargs) + lrt <- glmLRT(fit, contrast = contrasts) + top_table <- topTags(lrt, n = nrow(data)) %>% + as.data.frame() %>% + select("logFC", "PValue", "FDR") %>% + rename("log2FC" = "logFC", "p_value" = "PValue", "p_value_adj" = "FDR") %>% + mutate(metric = 1) %>% + rownames_to_column("gene") + return(as.data.frame(top_table)) + } + + de_edgeR_zinb <- function(data, pheno_data, treatment_samples, control_samples, batch, k) { + + data_sce<-as.SingleCellExperiment(data) + filtered_sce <- data_sce[rowSums(counts(data_sce)) > 0, ] + cl <- makeCluster(num_cores) + doParallel::registerDoParallel(cl) + p <- BiocParallel::DoparParam() + system.time(zinb <- zinbwave::zinbwave(filtered_sce, K = 2, + epsilon=12000, + BPPARAM = p, + observationalWeights = TRUE)) + + weights <- SummarizedExperiment::assay(zinb, "weights") + + #perform_edgeR + combined_id <- data$combined_id + model_matrix <- if (length(batch) == 1) model.matrix(~0 + combined_id) else + model.matrix(~0 + combined_id + batch) + + dge <- DGEList(SummarizedExperiment::assay(zinb)) + dge <- calcNormFactors(dge, method = "TMMwsp") + + dge$weights <- weights + combined_id <- data$combined_id + design <- if (length(batch) == 1) model.matrix(~0 + combined_id) else + model.matrix(~0 + combined_id + batch) + + dge <- estimateDisp(dge, design) + fit <- glmQLFit(dge, design) + myargs <- list(paste0("combined_id", + treatment_samples, "-", + paste0("combined_id", control_samples)), + levels = model_matrix) + contrasts <- do.call(makeContrasts, myargs) + qlf <- glmQLFTest(fit, contrast = contrasts) + + top_table <- topTags(qlf, n = nrow(data)) %>% + as.data.frame() %>% + select("logFC", "F", "PValue", "FDR") %>% + rename("log2FC" = "logFC", "metric" = "F", "p_value" = "PValue", "p_value_adj" = "FDR") %>% + rownames_to_column("gene") + stopCluster(cl) + doParallel::registerDoParallel() + return(as.data.frame(top_table)) + } + de_limma_zinb <- function(data, pheno_data, treatment_samples, control_samples, batch, k) { + combined_id <- data$combined_id + model_matrix <- if (length(batch) == 1) model.matrix(~0 + combined_id) else + model.matrix(~0 + combined_id + batch) + + data_sce <- as.SingleCellExperiment(data) + filtered_sce <- subset(data_sce, rowSums(as.data.frame(counts(data_sce))) > 0) + cl <- makeCluster(num_cores) + doParallel::registerDoParallel(cl) + p <- BiocParallel::DoparParam() + suppressWarnings(zinb <- zinbwave::zinbwave(filtered_sce, K = k, + epsilon=12000, + BPPARAM = p, + observationalWeights = TRUE, verbose = F)) + counts <- zinb@assays@data$counts + weights <- zinb@assays@data$weights + + dge <- DGEList(counts = counts(filtered_sce), samples = pheno_data$condition, group = pheno_data$condition) + dge <- calcNormFactors(dge, method = "TMMwsp") + v <- voom(dge, design = model_matrix, plot = FALSE, weights = weights) + fit <- lmFit(v, model_matrix) + myargs <- list(paste0("combined_id", + treatment_samples, "-", + paste0("combined_id", control_samples)), + levels = model_matrix) + contrasts <- do.call(makeContrasts, myargs) + tmp <- contrasts.fit(fit, contrasts) + tmp <- eBayes(tmp, robust = TRUE) + top_table <- topTable(tmp, number = Inf, sort.by = "P") %>% + select("logFC", "t", "P.Value", "adj.P.Val") %>% + rename("log2FC" = "logFC", "metric" = "t", "p_value" = "P.Value", "p_value_adj" = "adj.P.Val") %>% + rownames_to_column("gene") + stopCluster(cl) + doParallel::registerDoParallel() + return(as.data.frame(top_table)) + } + # Main function + validate_inputs(data, method, treatment_samples, control_samples) + prepared <- prepare_data(data, treatment_samples, control_samples, batch) + data <- prepared$data + pheno_data <- prepared$pheno_data + + # Select the appropriate normalization method + de_data <- switch( + method, + limma_voom = de_limma_voom(data, pheno_data, treatment_samples, control_samples), + limma_trend = de_limma_trend(data, pheno_data, treatment_samples, control_samples), + edgeR = de_edger(data, pheno_data, treatment_samples, control_samples), + DESeq2 = de_deseq2(data, pheno_data, treatment_samples, control_samples), + Seurat_wilcox = de_seurat(data, pheno_data, treatment_samples, control_samples), + RUVg = de_ruvg(data, pheno_data, treatment_samples, control_samples, batch, spikes, k), + RUVs = de_ruvs(data, pheno_data, treatment_samples, control_samples, batch, k), + RUVr = de_ruvr(data, pheno_data, treatment_samples, control_samples, batch, k), + edgeR_zinb = de_edgeR_zinb(data, pheno_data, treatment_samples, control_samples, batch, k), + limma_zinb = de_limma_zinb(data, pheno_data, treatment_samples, control_samples, batch, k), + stop("Unsupported DE method.") + ) + return(de_data) +} diff --git a/man/select_robust_controls.Rd b/man/select_robust_controls.Rd new file mode 100644 index 0000000..fb4254f --- /dev/null +++ b/man/select_robust_controls.Rd @@ -0,0 +1,83 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/select_robust_controls.R +\name{select_robust_controls} +\alias{select_robust_controls} +\title{Select high-quality control replicates via TMMwsp log-CPM correlation} +\usage{ +select_robust_controls( + data, + samples, + orig_ident, + cpm_filter = 1, + min_samps = 16, + corr_method = c("spearman", "pearson"), + top_n = 5, + make_plots = TRUE +) +} +\arguments{ +\item{data}{A tidyseurat object containing an RNA assay with a \strong{counts} layer.} + +\item{samples}{the control/treatment label to keep in column samples +(e.g., "CB_43_EP73_0"). Only cells/samples with this label are considered.} + +\item{orig_ident}{Character scalar: the plate/batch identifier to keep +(e.g., "VH02012942"). Only cells/samples from this batch are considered.} + +\item{cpm_filter}{Numeric scalar; CPM threshold used for gene filtering prior to +normalization (default 1).} + +\item{min_samps}{Integer; a gene must be expressed (CPM > cpm_filter) in at least +this many samples to be retained (default 16).} + +\item{corr_method}{Correlation type used for ranking; one of +c("spearman","pearson") (default "spearman").} + +\item{top_n}{Integer; the number of top-ranked samples to report in topN. +Ties at the cutoff are kept (default 5).} + +\item{make_plots}{Logical; if TRUE, print a log2-CPM boxplot and Pearson/Spearman +correlation heatmaps (default TRUE).} +} +\value{ +A list with elements: +\itemize{ +\item subset_obj: The Seurat object subset used for analysis. +\item dge: The filtered edgeR::DGEList +\item log_cpm_tmm: Matrix of TMMwsp log2-CPM. +\item boxplot_df: Long-format data frame used for the boxplot (gene, sample, log_cpm). +\item cor_pearson: Sample-sample Pearson correlation matrix. +\item cor_spearman: Sample-sample Spearman correlation matrix. +\item ranking_method: The correlation method used for ranking. +\item scores_mean_to_others: Named numeric vector of mean Fisher-z back-transformed +correlations (higher = better), sorted decreasing. +\item topN: Named numeric vector of the top-ranked samples (ties at the cutoff kept). +} +} +\description{ +For a given control group (e.g., DMSO) on a specific plate/batch, this function +ranks samples by their average correlation (Fisher z-averaged) to all \emph{other} +samples using edgeR's TMMwsp-normalized log2-CPM. It returns the ranking and (optionally) +plots per-sample expression distributions and sample-sample correlation heatmaps. +} +\details{ +Workflow: +\enumerate{ +\item Subset to the specified samples and orig_ident (plate/batch). +\item Build an edgeR::DGEList, filter lowly expressed genes using CPM and min_samps. +\item Normalize with TMMwsp and compute log2-CPM. +\item Rank samples by mean Fisher z transformed correlation to all other samples +(according to corr_method). +\item Return the ranking, correlation matrices, the normalized matrix, and (optionally) +plots for QC. +} + +Column names of the counts matrix are rewritten to "_" +for easier visual inspection in plots. +} +\examples{ +data(mini_mac) +res <- select_robust_controls(mini_mac,samples = "DMSO_0", orig_ident = "PMMSq033_mini") + + +} diff --git a/tests/testthat/test-select_robust_controls.R b/tests/testthat/test-select_robust_controls.R new file mode 100644 index 0000000..018484e --- /dev/null +++ b/tests/testthat/test-select_robust_controls.R @@ -0,0 +1,7 @@ +test_that("multiplication works", { + # Load test dataset from tests/testthat/testdata + testdata <- get_testdata() + expect_error(select_robust_controls(testdata, samples=NULL, orig_ident="PLATE")) + res <- select_robust_controls(testdata, samples="DMSO_0", orig_ident="testdata", make_plots = FALSE) + expect_true(is.list(res)) +}) diff --git a/vignettes/assessing_zero_inflation.Rmd b/vignettes/assessing_zero_inflation.Rmd new file mode 100644 index 0000000..36adc17 --- /dev/null +++ b/vignettes/assessing_zero_inflation.Rmd @@ -0,0 +1,328 @@ +--- +title: "Assessing zero-inflation in your data" +output: + rmarkdown::html_document: + theme: flatly + toc: true + toc_float: true + toc_depth: 5 +vignette: > + %\VignetteIndexEntry{assessing_zero_inflation} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} + %\VignetteBuild{true} + +--- + + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + digits = 3 +) +options(digits = 3) +``` + + + +## Overview + +This is a quick demonstration of the `check_zeroinflation()` function from our **macpie** package. This function is a fast diagnostic tool to help you assess whether your MACseq data exhibits zero-inflation relative to a negative binomial (NB) model. + +We use a lightweight convenience wrapper `subsample_genes()` around **`seqgendiff::select_counts()`** to create a smaller object with a subset of genes for faster computation. This function is also part of the **macpie** package. + + +Under the hood, `check_zeroinflation()` uses **edgeR** to estimate gene-wise dispersions and compute expected zero probabilities under a NB model. It then compares the observed and expected zero-inflated indexes for each gene within each group defined in the metadata. + +This function: + + - estimates gene-wise tagwise dispersions with edgeR (using all selected groups), + + - builds NB-expected zero probabilities from TMMwsp-scaled means, and + + - returns per-gene ZI (observed zeros minus NB-expected zeros) and per-group summaries (e.g., % genes with ZI > 0.05). ZI-cutoffs are user-defined. + + +The output is a list with two elements: + + - `summary_by_group`: A summary table showing the number and percentage of genes that are zero-inflated for each group at specified cutoffs. + + - `gene_metrics_by_group`: A detailed table with gene-wise metrics, including observed and expected zero numbers and proportions, zero-inflation index (ZI), mean count, and dispersion estimates for each gene in each group. + +**Note**: `check_zeroinflation()` relies on edgeR to estimate dispersion. The current implementation requires ≥2 groups in the design so that edgeR can stabilize gene-wise dispersions across groups. If you only have a single group and still want a design-aware baseline for expected zeros, fit a Gamma–Poisson/NB GLM and compute the expected zero probabilities from its fitted means and over-dispersion. + + +*** +
+ +```{r set_wd, include=FALSE} +dir <- "/Users/liuxin/macpie_Dev/" +devtools::load_all(paste0(dir, "macpie/")) +``` + + +Load data and preprocess before subsampling genes and checking for zero-inflation. + +```{r setup} +#install.packages("macpie") # or devtools::install_github("PMCC-BioinformaticsCore/macpie") +library(macpie) +library(seqgendiff) +library(ggplot2) +library(cowplot) + +# Define project variables +project_name <- "PMMSq033" +project_metadata <- system.file("extdata/PMMSq033_metadata.csv", package = "macpie") +# Load metadata +metadata <- read_metadata(project_metadata) + +``` + +```{r load_data} +# Import raw data +project_rawdata <- paste0(dir, "/macpieData/PMMSq033/raw_matrix") +project_name <- "PMMSq033" +raw_counts <- Read10X(data.dir = project_rawdata) +# Create tidySeurat object +mac <- CreateSeuratObject(counts = raw_counts, + project = project_name, + min.cells = 1, + min.features = 1) +# Join with metadata +mac <- mac %>% + inner_join(metadata, by = c(".cell" = "Barcode")) +# Add unique identifier +mac <- mac %>% + mutate(combined_id = str_c(Compound_ID, Concentration_1, sep = "_")) %>% + mutate(combined_id = gsub(" ", "", .data$combined_id)) +mac <- mac %>% + filter(Project == "Current") + +``` + + +## Subsample genes without filtering + +We first subsample genes without filtering to see the zero-inflation results before filtering lowly expressed genes. + +This is to randomly select a subset of 1000 genes for a quick check. For a more comprehensive analysis, consider using a larger subset. + +```{r} +# Subsample genes for faster computation +sub_mac_unfiltered <- subsample_genes(mac, ngene = 1000, gselect = "random", seed = 1) +sub_mac_unfiltered %>% nrow() +``` + + +We now look at zero-inflation across high dose treatment groups and DMSO control. + +```{r} +# Check for zero-inflation +high_doses <- grep("_10$", unique(sub_mac_unfiltered$combined_id), value = TRUE) +#add DMSO_0 +high_doses <- c(high_doses, "DMSO_0") +zi_results_unfiltered <- check_zeroinflation(sub_mac_unfiltered, + group_by = "combined_id", + samples = high_doses, + batch = 1, + cutoffs = c(0.1, 0.2)) +``` + +#### View gene-wise metrics for each group + +For example, view the first few rows for DMSO_0 group: + +```{r} +zi_results_unfiltered$gene_metrics_by_group %>% filter(group=="DMSO_0") %>% head() + +``` +And for Staurosporine_10 group: + +```{r} +zi_results_unfiltered$gene_metrics_by_group %>% filter(group=="Staurosporine_10") %>% head() +``` + + + +#### View summary statistics for each group + +```{r} +zi_results_unfiltered$summary_by_group %>% head(10) +``` + + +From the summary table, we can see the summary statistics for each group, including the number and percentage of genes that are zero-inflated at the specified cutoffs. + +Each of the columns in the summary table are defined as follows: + + - group: The treatment group + + - n_genes: Total number of genes analyzed in the group + + - n_wells: Total number of wells/samples in the group + + - mean_p0_obs: mean observed zero proportion across genes in the group + + - mean_p0_nb: mean expected zero proportion under the NB model across genes in the group + + - mean_ZI: mean zero-inflation index (ZI = p0_obs - p0_nb for each gene) across genes in the group + + - observed_zeros_num: Number of data points with observed zeros (it shouldn't be more than n_genes*n_wells for each group) + + - expected_zeros_num: Number of data points with expected zeros under the NB model (same here, it shouldn't be more than n_genes*n_wells for each group) + + - pct_ZI_gt_0.1: Percentage of genes with ZI greater than 0.1 + + - pct_ZI_gt_0.2: Percentage of genes with ZI greater than 0.2 + + +Visualisation of zero-inflation metrics for high dose treatment groups and DMSO control. + +```{r, fig.height=10, fig.width=8} +high_doses_zi_results_unfiltered <- zi_results_unfiltered$summary_by_group %>% filter(grepl("_10$", group)) +#concatenate with DMSO +high_doses_zi_results_unfiltered <- rbind(high_doses_zi_results_unfiltered, + zi_results_unfiltered$summary_by_group %>% filter(group=="DMSO_0")) + + +long_zi_results_unfiltered <- high_doses_zi_results_unfiltered %>% select(group, mean_p0_obs, mean_p0_nb, mean_ZI) %>% + pivot_longer(cols = c(mean_p0_obs, mean_p0_nb, mean_ZI), + names_to = "metric", + values_to = "value") + +# rank groups by mean_p0_obs +long_zi_results_unfiltered$group <- factor(long_zi_results_unfiltered$group, + levels = high_doses_zi_results_unfiltered$group[order(-high_doses_zi_results_unfiltered$mean_p0_obs)]) + +#only show mean_p0_obs and mean_p0_nb +long_zi_results_unfiltered_prop <- long_zi_results_unfiltered %>% filter(metric %in% c("mean_p0_obs", "mean_p0_nb")) + + +p1 <- ggplot(long_zi_results_unfiltered_prop, aes(x = group, y = value, fill = metric)) + + geom_bar(stat = "identity", position = "dodge") + + labs(title = "Zero-Inflation Metrics by Treatment Groups (10uM) and DMSO \nmacseq-unfiltered", + x = "Groups", + y = "zero-inflation proportion") + + theme_minimal() + + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + + scale_fill_manual(values = macpie_colours$discrete[1:3]) + +# show ZI separately +p2 <- ggplot(long_zi_results_unfiltered %>% filter(metric=="mean_ZI"), aes(x = group, y = value, fill = metric)) + + geom_bar(stat = "identity", position = "dodge") + + labs(title = "Mean Zero-Inflation Index (ZI) by Treatment Groups (10uM) and DMSO \nmacseq-unfiltered", + x = "Groups", + y = "Mean Zero-Inflation Index (ZI)") + + theme_minimal() + + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + + scale_fill_manual(values = macpie_colours$discrete[1]) + + +plot_grid(p1, p2, ncol = 1) + +``` + +Ranking of groups by mean observed zero proportion (mean_p0_obs) shows that Staurosporine_10 has the highest observed zero proportion, followed by Camptothecin_10, and paclitaxel_10. We can see staurosporine_10 group has the highest mean ZI value and highest proportion of zeros among all treatment groups, indicating significant zero-inflation in this group. Other treatment groups show small or even negative mean ZI values, suggesting no significant zero-inflation. As staurosporine is a cell death control, which it is expected to see zero-inflated data in this group. + +DMSO control group with ~60% of zeros and small mean ZI value, indicating no significant zero-inflation in the control group. + + + +## Subsample genes after filtering lowly expressed genes + +Filter genes with very low counts across all samples. This step is important because genes with extremely low expression can lead to unreliable estimates of dispersion and expected zero probabilities, which can skew the zero-inflation assessment. + +Here we filter genes that have at least 10 counts in at least 2 samples within each group defined by `combined_id`. + +```{r} +# Filter by read count per sample group +mac <- filter_genes_by_expression(mac, + group_by = "combined_id", + min_counts = 10, + min_samples = 2) +``` + + +### Subsample genes for faster computation + +This is to randomly select a subset of 1000 genes for a quick check. For a more comprehensive analysis, consider using a larger subset. + +```{r} +# Subsample genes for faster computation +sub_mac <- subsample_genes(mac, ngene = 1000, gselect = "random", seed = 1) +sub_mac %>% nrow() +``` + + +```{r} +# Check for zero-inflation +all_conditions <- unique(sub_mac$combined_id) +zi_results <- check_zeroinflation(sub_mac, + group_by = "combined_id", + samples = all_conditions, + batch = 1, + cutoffs = c(0.1, 0.2)) +``` + +#### View gene-wise metrics for each group + +```{r} +zi_results$gene_metrics_by_group %>% filter(group=="Luminespib_10") %>% head() + +``` + + +```{r} +zi_results$summary_by_group %>% head(10) +``` + +```{r, fig.height=10, fig.width=8} +high_doses_zi_results <- zi_results$summary_by_group %>% filter(grepl("_10$", group)) +#concatenate with DMSO +high_doses_zi_results <- rbind(high_doses_zi_results, + zi_results$summary_by_group %>% filter(group=="DMSO_0")) + + +long_zi_results <- high_doses_zi_results %>% select(group, mean_p0_obs, mean_p0_nb, mean_ZI) %>% + pivot_longer(cols = c(mean_p0_obs, mean_p0_nb, mean_ZI), + names_to = "metric", + values_to = "value") + +# rank groups by mean_p0_obs +long_zi_results$group <- factor(long_zi_results$group, + levels = high_doses_zi_results$group[order(-high_doses_zi_results$mean_p0_obs)]) + +#only show mean_p0_obs and mean_p0_nb +long_zi_results_prop <- long_zi_results %>% filter(metric %in% c("mean_p0_obs", "mean_p0_nb")) + + +p1 <- ggplot(long_zi_results_prop, aes(x = group, y = value, fill = metric)) + + geom_bar(stat = "identity", position = "dodge") + + labs(title = "Zero-Inflation Metrics by Treatment Groups (10uM) and DMSO \nmacseq-filtered", + x = "Groups", + y = "zero-inflation proportion") + + theme_minimal() + + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + + scale_fill_manual(values = macpie_colours$discrete[1:3]) + +# show ZI separately +p2 <- ggplot(long_zi_results %>% filter(metric=="mean_ZI"), aes(x = group, y = value, fill = metric)) + + geom_bar(stat = "identity", position = "dodge") + + labs(title = "Mean Zero-Inflation Index (ZI) by Treatment Groups (10uM) and DMSO \nmacseq-filtered", + x = "Groups", + y = "Mean Zero-Inflation Index (ZI)") + + theme_minimal() + + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + + scale_fill_manual(values = macpie_colours$discrete[1]) + + +plot_grid(p1, p2, ncol = 1) +``` + +After filtering lowly expressed genes, we observe proportions of both expected and observed zeros decrease across all treatment groups compared to the unfiltered data. These results suggest that lowly expressed genes contribute significantly to the overall zero counts or sparsity in the data. + +DMSO control group has the lowest observed and expected zero proportions among all groups, and staurosporine_10 has the highest observed and expected zero proportions. These findings are expected as these are vehicle and positive controls. + + + diff --git a/vignettes/check_zero_inflation.Rmd b/vignettes/check_zero_inflation.Rmd deleted file mode 100644 index b74f9e9..0000000 --- a/vignettes/check_zero_inflation.Rmd +++ /dev/null @@ -1,243 +0,0 @@ ---- -title: "Check for zero-inflation in your data" -output: - rmarkdown::html_document: - theme: flatly - toc: true - toc_float: true - toc_depth: 5 -vignette: > - %\VignetteIndexEntry{check_zero_inflation} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} - %\VignetteBuild{true} - ---- - - -```{r, include = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>", - digits = 3 -) -options(digits = 3) -``` - - - -## Overview - -This is a quick demonstration of the `check_zeroinflation()` function from our **macpie** package. This function is a fast diagnostic tool to help you assess whether your MACseq data exhibits zero-inflation relative to a negative binomial (NB) model. - -We will use a lightweight convenience wrapper `subsample_genes()` around **`seqgendiff::select_counts()`** to create a smaller object with a subset of genes for faster computation. This function is also part of the **macpie** package. - -Under the hood, `check_zeroinflation()` uses **edgeR** to estimate gene-wise dispersions and compute expected zero probabilities under a NB model. It then compares the observed and expected zero-inflated indexes for each gene within each group defined in the metadata. - -This function: - - - estimates gene-wise tagwise dispersions with edgeR (using all selected groups), - - - builds NB-expected zero probabilities from TMMwsp-scaled means, and - - - returns per-gene ZI (observed zeros minus NB-expected zeros) and per-group summaries (e.g., % genes with ZI > 0.05). ZI-cutoffs are user-defined. - - -The output is a list with two elements: - - - `summary_by_group`: A summary table showing the number and percentage of genes that are zero-inflated for each group at specified cutoffs. - - - `gene_metrics_by_group`: A detailed table with gene-wise metrics, including observed and expected zero numbers and proportions, zero-inflation index (ZI), mean count, and dispersion estimates for each gene in each group. - -**Note**: `check_zeroinflation()` relies on edgeR to estimate dispersion. The current implementation requires ≥2 groups in the design so that edgeR can stabilize gene-wise dispersions across groups. If you only have a single group and still want a design-aware baseline for expected zeros, fit a Gamma–Poisson/NB GLM and compute the expected zero probabilities from its fitted means and over-dispersion. - -*** -
- -```{r set_wd, include=FALSE} -dir <- "/Users/liuxin/macpie_Dev/" -devtools::load_all(paste0(dir, "macpie/")) -``` - -## Load data - -Load data and preprocess before subsampling genes and checking for zero-inflation. - -```{r setup} -#install.packages("macpie") # or devtools::install_github("PMCC-BioinformaticsCore/macpie") -library(macpie) - -# Define project variables -project_name <- "PMMSq033" -project_metadata <- system.file("extdata/PMMSq033_metadata.csv", package = "macpie") -# Load metadata -metadata <- read_metadata(project_metadata) - -``` - -```{r load_data} -# Import raw data -project_rawdata <- paste0(dir, "/macpieData/PMMSq033/raw_matrix") -project_name <- "PMMSq033" -raw_counts <- Read10X(data.dir = project_rawdata) -# Create tidySeurat object -mac <- CreateSeuratObject(counts = raw_counts, - project = project_name, - min.cells = 1, - min.features = 1) -# Join with metadata -mac <- mac %>% - inner_join(metadata, by = c(".cell" = "Barcode")) -# Add unique identifier -mac <- mac %>% - mutate(combined_id = str_c(Compound_ID, Concentration_1, sep = "_")) %>% - mutate(combined_id = gsub(" ", "", .data$combined_id)) -mac <- mac %>% - filter(Project == "Current") - -``` - -Filter genes with very low counts across all samples. This step is important because genes with extremely low expression can lead to unreliable estimates of dispersion and expected zero probabilities, which can skew the zero-inflation assessment. - -```{r} -# Filter by read count per sample group -mac <- filter_genes_by_expression(mac, - group_by = "combined_id", - min_counts = 10, - min_samples = 2) -``` - - -## Subsample genes for faster computation - -This is to randomly select a subset of 1000 genes for a quick check. For a more comprehensive analysis, consider using a larger subset. - -```{r} -# Subsample genes for faster computation -sub_mac <- subsample_genes(mac, ngene = 1000, gselect = "random", seed = 1) -sub_mac %>% nrow() -``` - - -## Check for zero-inflation - -We can now run the zero-inflation check on the subsampled data. - -### DMSO vs Camptothecin - -First, we will compare two groups: "DMSO_0" and "CAMPTOTHECIN_10". - - - -```{r} -zi_results <- check_zeroinflation(sub_mac, - group_by = "combined_id", - samples = c("DMSO_0","CAMPTOTHECIN_10"), - batch = 1, - cutoffs = c(0.1, 0.2)) - -``` - -#### View gene-wise metrics for each group - -For DMSO group: - -```{r} -zi_results$gene_metrics_by_group %>% filter(group=="DMSO_0") %>% head() -``` - -For Camptothecin group: - -```{r} -zi_results$gene_metrics_by_group %>% filter(group=="CAMPTOTHECIN_10") %>% head() -``` - -#### View summary statistics for each group - -```{r} -zi_results$summary_by_group -``` - - -From the summary table, we can see the summary statistics for each group, including the number and percentage of genes that are zero-inflated at the specified cutoffs. - -Each of the columns in the summary table are defined as follows: - - - group: The treatment group - - - n_genes: Total number of genes analyzed in the group - - - n_wells: Total number of wells/samples in the group - - - median_p0_obs: Median observed zero proportion across genes in the group - - - median_p0_nb: Median expected zero proportion under the NB model across genes in the group - - - median_ZI: Median zero-inflation index (ZI = p0_nb - p0_obs for each gene) across genes in the group - - - observed_zeros_num: Number of data points with observed zeros (it shouldn't be more than n_genes*n_wells for each group) - - - expected_zeros_num: Number of data points with expected zeros under the NB model (same here, it shouldn't be more than n_genes*n_wells for each group) - - - pct_ZI_gt_0.1: Percentage of genes with ZI greater than 0.1 - - - pct_ZI_gt_0.2: Percentage of genes with ZI greater than 0.2 - -*** -
- -#### See below for interpretation of the results - -For DMSO vs camptothecin, we can see that both DMSO and camptothecin groups have negative median ZI values, indicating that there is no significant zero-inflation in either group. Percentage of genes with ZI greater than 0.1 or 0.2 are 13.5% and 6.6% for camptothecin. As camptothecin acts as a DNA topoisomerase inhibitor and induces DNA damage, it can lead to cell cycle arrest and apoptosis, which may reduce the overall transcriptional activity in the cells. This reduction in transcriptional activity can result in genes being expressed at lower levels or not at all. On the other hand, DMSO is a common solvent used in biological experiments and is not expected to induce significant changes in gene expression. Therefore, it is reasonable to observe a lower level of zero-inflation in the DMSO group compared to the camptothecin group. Same as ZI results for DMSO, the percentage of genes with ZI greater than 0.1 or 0.2 are both 0%. - -Besides, the observed zeros are also lower than the expected zeros, further supporting the absence of zero-inflation. Overall, these results suggest that the data does not exhibit significant zero-inflation, and a standard negative binomial model may be appropriate for downstream analyses. Zero-inflated models (ZINB) may not be necessary for this dataset. - - - -```{r, include=FALSE} -rm(zi_results) -``` - -*** -
- - - -### DMSO vs Staruosporine - -Next, we will compare two treatment groups: "DMSO_0" and "Staurosporine_10". As staurosporine is a potent inducer of apoptosis, we might expect to see more zero-inflation in this group compared to DMSO. - - -```{r} -# Check for zero-inflation -zi_results <- check_zeroinflation(sub_mac, - group_by = "combined_id", - samples = c("DMSO_0","Staurosporine_10"), - batch = 1, - cutoffs = c(0.1, 0.2)) -``` - - -#### View gene-wise metrics for each group - -For DMSO group: - -```{r} -zi_results$gene_metrics_by_group %>% filter(group=="DMSO_0") %>% head() -``` - -For Staurosporine group: -```{r} -zi_results$gene_metrics_by_group %>% filter(group=="Staurosporine_10") %>% head() -``` - -#### View summary statistics for each group - -```{r} -zi_results$summary_by_group -``` -#### See below for interpretation of the results - -For DMSO vs staurosporine, we can see that only staurosporine has 14% of median ZI value, the observed zeros number (713) is much higher than the expected zeros number (~355), indicating that there is some level of zero-inflation in the staurosporine group. As staurosporine is a cell death control, which it is expected to see zero-inflated data in this group. In this case, it may be appropriate to consider using a zero-inflated negative binomial (ZINB) model for downstream analyses to account for the excess zeros in the data. - - diff --git a/vignettes/cross_platform_compatibility.Rmd b/vignettes/cross_platform_compatibility.Rmd index 6ec1a2a..61c8f02 100644 --- a/vignettes/cross_platform_compatibility.Rmd +++ b/vignettes/cross_platform_compatibility.Rmd @@ -40,6 +40,10 @@ This vignette will cover the following two parts: - Performing differential expression and pathway enrichment tests on merged data with limma-voom correction. +**3. Benchmark** + + - Benchmarking the performance of macpie functions on DRUGseq data. It includes both filtering robust DMSO controls and DEG concordance. + The DRUGseq dataset is a large-scale drug screening dataset that includes a large set of small molecules (N = 4,343) tested on U2OS cells. This dataset was retrieved from [Zenodo](https://doi.org/10.5281/zenodo.14291446) (Ozer et al., 2024). @@ -64,15 +68,19 @@ devtools::load_all(paste0(dir, "macpie/")) ```{r setup} -library(macpie) -library(tibble) -library(stringr) -library(pheatmap) -library(ggiraph) -library(tidyseurat) -library(purrr) +suppressMessages(library(macpie)) +suppressMessages(library(tibble)) +suppressMessages(library(stringr)) +suppressMessages(library(pheatmap)) +suppressMessages(library(ggiraph)) +suppressMessages(library(tidyseurat)) +suppressMessages(library(purrr)) +suppressMessages(library(ggrepel)) ``` +```{r} +options(scipen=999, digits=3) +``` ## Converting a DRUGseq plate to a macpie object @@ -318,7 +326,7 @@ Now, we convert the metadata format to macpie metadata format. The idea is to make a combined metadata and a combined count matrix for three plates with the `plate_ID` labelled. -```{r metdata_for_batch24} +```{r} #make a combined metadata for three plates batch24_metadata <- batch24 %>% map_dfr(~ { @@ -353,7 +361,7 @@ batch24_metadata <- batch24_metadata%>%select(-c(batch_id, plate_barcode,plate_i -```{r count_matrix_for_batch24} +```{r} # create a combined UMI matrix for 3 plates batch24_counts <- batch24 %>% map(~ { @@ -382,7 +390,7 @@ binded_counts <- do.call(cbind, batch24_counts) Then, we can read in the combined count matrix and metadata. -```{r load_data_batch24} +```{r} as_mac <- CreateSeuratObject(counts = binded_counts, min.cells = 1, min.features = 1) @@ -626,4 +634,948 @@ pheatmap(enriched_pathways_mat, color = macpie_colours$continuous_rev) ``` +## Benchmark + +### Selecting robust DMSO controls + +We noticed that DRUGseq authors implemented a permutation strategy to identify robust DMSO wells in their analytics pipeline. We agree that robust control selection is critical for comparative benchmarking. The Novartis DRUGseq workflow selects DMSO controls via a 500-permutation procedure that retains wells minimising spurious DMSO–DMSO differentially expressed genes. To make this step practical for routine QC and for researchers with fewer DMSO wells, we implemented an alternative method in our macpie package `select_robust_controls` function that uses correlation-based selection of control wells. + +This function ranks control wells based on their average Fisher-z-transformed correlation coefficients to all other control wells, selecting those with the highest average correlation scores as the 'robust' controls. + + +The DRUGseq data contains three replicate plates with 48 DMSO controls (CB-43-EP73). We applied our `select_robust_controls` function to these DMSO wells. It filters genes, normalises with TMMwsp, computes log-CPM, and ranks wells by mean Fisher-z–transformed correlation to all other replicate wells. The top 5 wells with the highest transformed correlation were then compared to DRUGseq results. + +We benchmark the performance of macpie's `select_robust_controls` function on DRUGseq data to select robust DMSO controls from three replicate plates. + + +#### DRUGseq: 6 robust DMSO wells + +First we load the file of robust DMSO control wells identified by the DRUGseq authors in their github repo. + +Here, we show the robust DMSO wells for batch 24, which contains the three plates we are interested in (VH02012942, VH02012944, VH02012956). + +These 6 robust DMSO wells are used as the benchmark standard to compare with macpie selected robust DMSO wells. + +```{r} +robust_DMSO_DRUGseq <- read.csv(paste0(dir, "DRUGseqData/robust_RC_ReferenceControl_DMSO_wells.txt"), sep="") +robust_DMSO_DRUGseq %>% filter(batch_id==24) %>% select(batch_id, plate_barcode, well_id) +``` + +As from their results of robust DMSO controls, these are the robust DMSO wells for batch 24: + + VH02012942: I23, M23 + + VH02012944: D23, H23 + + VH02012956: I23, J23 + + +*** +
+ + +#### macpie: select robust DMSO wells + + + +Now we will use a function `select_robust_controls` to identify robust DMSO controls from batch 24. + +This function: + + - applies CPM filtering, + + - performs TMMwsp normalisation and computes log-CPM, + + - ranks wells by their mean Fisher-z–transformed sample–sample correlation to all other wells (Pearson or Spearman) + + - selects the top n wells (user-defined) as robust controls. + +```{r} +mac_filtered <- readRDS(paste0(dir, "/DRUGseqData/macpie_filtered_batch24.Rds")) +mac_filtered$combined_id <- str_replace_all(mac_filtered$combined_id, "-","_") +``` + + + + +Now we will apply the `select_robust_controls` function to each of the three plates in batch 24 to identify robust DMSO controls. + +This function generates three plots: + + - Boxplot of log2-CPM (TMMwsp) + + - Sample–sample correlation (Pearson, log2-CPM) + + - Sample–sample correlation (Spearman, log2-CPM) + + These plots help to visualize the distribution of gene expression and the correlation between samples, aiding in the assessment of DMSO control quality. + + +```{r} +#to use lapply on three plates +plates <- c("VH02012942", "VH02012944", "VH02012956") +results <- lapply(plates, function(plate) { + select_robust_controls( + mac_filtered, + combined_id = "CB_43_EP73_0", + orig_ident = plate, + cpm_filter = 1, + min_samps = 8, + corr_method = "spearman", + top_n = 5, + make_plots = FALSE + ) +}) + +names(results) <- plates +``` + + +As make_plots = FALSE in the above function, we can now visualise the plots for each plate separately. If you set make_plots = TRUE, the function will automatically generate the plots for you. + +##### Plate VH02012942 + +```{r} +pheatmap::pheatmap(results$VH02012942$cor_pearson, + main = "Sample-sample Pearson correlation (log2-CPM)", + fontsize_row = 8, fontsize_col = 8) +pheatmap::pheatmap(results$VH02012942$cor_spearman, + main = "Sample-sample Spearman correlation (log2-CPM)", + fontsize_row = 8, fontsize_col = 8) +``` + + +Apart from correlation heatmaps, the function also returns a ranking of wells by their mean correlation to all other wells. + +```{r} +results$VH02012942$scores_mean_to_others +``` + +Finally, it returns the top N wells as robust DMSO controls. + +```{r} +results$VH02012942$topN +``` + +Now we can see for plate VH02012942, the 2 of top 5 DMSO wells selected by macpie are I23 and J23, which are exactly the same as the robust DMSO wells identified by the DRUGseq authors. + + +Let's repeat the same process for the other two plates in batch 24. + +##### Plate VH02012944 + +```{r} +pheatmap::pheatmap(results$VH02012944$cor_pearson, + main = "Sample-sample Pearson correlation (log2-CPM)", + fontsize_row = 8, fontsize_col = 8) +pheatmap::pheatmap(results$VH02012944$cor_spearman, + main = "Sample-sample Spearman correlation (log2-CPM)", + fontsize_row = 8, fontsize_col = 8) +``` + + +Apart from correlation heatmaps, the function also returns a ranking of wells by their mean correlation to all other wells. + +```{r} +results$VH02012944$scores_mean_to_others +``` + +Finally, it returns the top N wells as robust DMSO controls. + +```{r} +results$VH02012944$topN +``` + +For plate VH02012944, the DRUGseq selected D23 and H23 DMSO wells are not in our top 5. However, H23 is ranked 6th by macpie, which is very close to the top 5. + + +##### Plate VH02012956 +```{r} +pheatmap::pheatmap(results$VH02012956$cor_pearson, + main = "Sample-sample Pearson correlation (log2-CPM)", + fontsize_row = 8, fontsize_col = 8) +pheatmap::pheatmap(results$VH02012956$cor_spearman, + main = "Sample-sample Spearman correlation (log2-CPM)", + fontsize_row = 8, fontsize_col = 8) +``` + + +Apart from correlation heatmaps, the function also returns a ranking of wells by their mean correlation to all other wells. + +```{r} +results$VH02012956$scores_mean_to_others +``` + +Finally, it returns the top N wells as robust DMSO controls. + +```{r} +results$VH02012942$topN +``` + +For plate VH02012956, the DRUGseq selected I23 and J23 DMSO wells are exactly the same as our top DMSO wells selected by macpie. + + +### Summary + +To summarise the performance of macpie in selecting robust DMSO wells, we compare our selected top 5 DMSO wells with the DRUGseq authors' selected robust DMSO wells for each plate in batch 24. + +```{r} +# DRUG-seq authors' robust DMSO wells (batch 24) +expected_df <- as_tibble(data.frame( + plate = c("VH02012942", "VH02012942", "VH02012944", "VH02012944", "VH02012956", "VH02012956"), + well = c("I23", "M23", "D23", "H23", "I23", "J23"), + source = "expected")) + + +# Helper: extract topN wells (names are like "VH02012942_I23") +extract_top_wells <- function(res_per_plate) { + tibble(sample = names(res_per_plate$topN), + score = as.numeric(res_per_plate$topN)) |> + mutate( + plate = sub("_.*$", "", sample), + well = sub("^.*_", "", sample), + rank = row_number() + ) |> + select(plate, well, rank, score) +} + +top_df <- map_df(names(results), ~{ + extract_top_wells(results[[.x]]) +}) + + +matched_df <- expected_df |> + left_join(top_df, by = c("plate", "well")) |> + mutate(found = !is.na(rank)) + + +plate_summary <- matched_df |> + group_by(plate) |> + summarise( + expected = n(), + recovered = sum(found), + recovery_rate = recovered / expected + ) + +plate_summary + +overall <- summarise(plate_summary, + total_expected = sum(expected), + total_recovered = sum(recovered), + overall_rate = total_recovered / total_expected) +overall + +``` +```{r} +ggplot(plate_summary, aes(x = plate, y = recovered, fill = plate)) + + geom_col(width = 0.6, show.legend = FALSE) + + geom_text(aes(label = sprintf("%d / %d (%.0f%%)", + recovered, expected, 100*recovery_rate)), + vjust = -0.6) + + ylim(0, max(plate_summary$expected) + 0.8) + + labs(title = "Recovered DRUG-seq robust DMSO wells by plate", + x = "Plate", y = "Recovered wells") + + theme_classic() + +``` + + +In summary, for the three plates in batch 24, macpie successfully identified 4 out of 6 robust DMSO wells (with ~66.7% overall recovery rate) that were also selected by the DRUGseq authors. Only for plate VH02012944, one of the DRUGseq selected DMSO wells (D23) was not in our top 5, but the other well (H23) was ranked 6th by macpie, which is very close to the top 5. This demonstrates that macpie is effective in selecting high-quality DMSO controls for downstream analysis without running permutation tests, making it a computationally efficient choice. + +This function runs for each plate, it does not take into account any batches or plates. If it's a cross-plates design, it is recommended to either compute within-plate or remove plate effects (e.g. using ComBat, limma removeBatchEffect functions) first. + +*** +
+ + +### DEG concordance + +Now we compare macpie implementations of limma-voom, edgeR, DEseq2 and limma-trend against DRUGseq limma-trend result on the FF_86_NH56 (10uM) vs DMSO control from batch 24. We evaluated three control settings: (i) all DMSO wells (48 wells), (ii) top 15 DMSO selected by our correlation-based approach, and (iii) 6 DMSO wells identified by DRUGseq 500 permutation-based method. + +We only focus on up-regulated genes in FF_86_NH56 (genes were called DEG if BH-adjusted p < 0.05 and log2FC > 0). + + + + +#### DRUGseq DEG results + +For the purpose of this vignette, we only load the DRUGseq DEG results for FF_86_NH56 (10uM) vs DMSO control from batch 24. + +```{r} +batch24_de <- readRDS(paste0(dir,"DRUGseqData/DE_batch24.Rds")) +FF86_res <- batch24_de %>% filter(cmpd_sample_id=="FF-86-NH56") +ff86_res_toptable <- FF86_res[,13:18] +ff86_res_toptable <- ff86_res_toptable %>% + separate(gene.ID, into = c("gene", "chrom"), sep = ",") %>% + select(-chrom) %>% mutate(combined_id ="FF_86_NH56_10") %>% + rename(log2FC=logFC, p_value_adj = adj.P.Val) + +``` + +```{r, fig.height=6, fig.width=8} +plot_volcano(ff86_res_toptable, max.overlaps =3) +``` + +```{r} +ff86_res_toptable %>% filter(p_value_adj <0.05 & log2FC >0) %>% nrow() + +drugseq_deg <- ff86_res_toptable %>% filter(p_value_adj <0.05 & log2FC >0) %>% select(gene) %>% pull() +``` + +There are 1423 up-regulated DEGs identified by DRUGseq limma-trend method for FF_86_NH56 (10uM) vs DMSO control from batch 24. + + + +#### macpie DEG results with all DMSO wells + + +Load filered data + +```{r} +mac_filtered <- readRDS(paste0(dir, "/DRUGseqData/macpie_filtered_batch24.Rds")) +mac_filtered$combined_id <- str_replace_all(mac_filtered$combined_id, "-","_") +``` + + + + +##### Differential gene expression + +In here, you can specify a single condition in the combined_id column and compare with DMSO (i.e.CB_43_EP73_0). By using the plate IDs in the column of orig.ident as the input for batch parameter, `compute_singe_de` function can perform differential expression analysis using the preferred method (limma voom in this example) with batch information. + + + +```{r} +methods <- c("limma_voom", "DESeq2", "edgeR", "limma_trend") + +methods_res <- lapply(methods, function(m){ + + message("\n","Processing method: ", m,"\n") + # m<-"limma_voom" + treatment_samples <- "FF_86_NH56_10" + control_samples <- "CB_43_EP73_0" + subset <- mac_filtered%>%filter(combined_id%in%c(treatment_samples,control_samples)) + + batch <- subset$orig.ident + + + top_table <- compute_single_de(subset, treatment_samples, control_samples, method = m, batch = batch) + + # plot(plot_volcano(top_table, max.overlaps = 5)) + alldmso_degs <- top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull() + length(intersect(alldmso_degs, drugseq_deg)) + + top_table <- top_table %>% + arrange(p_value_adj, desc(log2FC)) %>% + mutate(gene = factor(gene, levels = unique(gene))) + # add a column if there are in the intersect(alldmso_degs, drugseq_deg) + top_table <- top_table %>% + mutate(in_drugseq_deg = ifelse(gene %in% intersect(alldmso_degs, drugseq_deg), "yes", "no")) + + # label a few top overlapping genes + lab_genes <- top_table[top_table$in_drugseq_deg=="yes", ] |> + dplyr::arrange(p_value_adj, dplyr::desc(log2FC)) + + volcano_overlap <- ggplot(top_table, aes(x = log2FC, y = -log10(p_value_adj), color = in_drugseq_deg)) + + geom_point(alpha = 0.6, size = 1.2) + + geom_text_repel(data = lab_genes, aes(label = gene), size = 3, max.overlaps = 50) + + scale_color_manual(values = c("no"="#bdbdbd","yes"="#2b8cbe"))+ + theme_classic() + + + #return + result_list <- list(top_table = top_table, + num_degs_macpie = length(alldmso_degs), + n_overlap = length(intersect(alldmso_degs, drugseq_deg)), + volcano_plot = volcano_overlap) + + return(result_list) + +}) + +names(methods_res) <- methods + +``` + + +##### Summary table + + +```{r} +#get a table to show number of DEGs and number of overlapping genes with DRUGseq for each method +deg_summary <- map_df(methods_res, function(x) { + data.frame( + num_degs_macpie = x$num_degs_macpie, + n_overlap = x$n_overlap, + num_degs_DRUGseq = length(drugseq_deg) + ) +}, .id = paste0("macpie_methods")) + +deg_summary +``` + + + +##### Overlapping volcano plot + +```{r} +methods_res$limma_voom$volcano_plot +methods_res$DESeq2$volcano_plot +methods_res$edgeR$volcano_plot +methods_res$limma_trend$volcano_plot +``` + + + + +#### macpie DEG results with 15 robust DMSO wells + +From our select_robust_controls function above, we identified the following top 15 DMSO wells from three plates in batch 24 using `select_robust_controls`: + + +```{r} +batch24 <- readRDS(paste0(dir,"DRUGseqData/Exp_batch24.Rds")) +names(batch24) +``` + +```{r metdata} +#make a combined metadata for three plates +batch24_metadata <- batch24 %>% + map_dfr(~ { + .x$Annotation %>% + mutate( + Plate_ID = plate_barcode, + Well_ID = well_id, + Barcode = paste0(plate_barcode, "_", well_index), + Row = LETTERS[row], + Column = as.integer(col), + Species = "human", + Cell_type = "U2OS", + Model_type = "2D_adherent", + Time = as.factor(hours_post_treatment), + Unit = "h", + Treatment_1 = cmpd_sample_id, + Concentration_1 = as.numeric(concentration), + Unit_1 = unit, + Sample_type = if_else(well_type == "SA" & col == 24, + "Positive Control", + well_type) + ) + }) + + +batch24_metadata <- batch24_metadata%>%select(-c(batch_id, plate_barcode,plate_index, well_id, + well_index, col, row, biosample_id, external_biosample_id, + cmpd_sample_id, well_type, cell_line_name, cell_line_ncn, concentration, unit, hours_post_treatment, Sample)) +``` + + + +```{r count_matrix} +# create a combined UMI matrix for 3 plates +batch24_counts <- batch24 %>% + map(~ { + .x$UMI.counts %>% + as.data.frame() %>% + rownames_to_column("gene_id") %>% + separate(col = gene_id, into = c("gene_name", "chrom"), sep = ",") %>% + mutate(gene_name = make.unique(gene_name)) %>% + select(-chrom) %>% + tibble::column_to_rownames(var = "gene_name") %>% + as.matrix() + }) + +binded_counts <- do.call(cbind, batch24_counts) + + + +``` + + + +```{r load_data} +as_mac <- CreateSeuratObject(counts = binded_counts, + min.cells = 1, + min.features = 1) + +as_mac<- as_mac%>% inner_join(batch24_metadata, by = c(".cell"="Barcode")) +``` + + +```{r} + +as_mac$combined_id <- paste0(as_mac$Treatment_1,"_", as_mac$Concentration_1) + +badDMSO <- as_mac@meta.data %>% filter(Treatment_1 == "CB-43-EP73") %>% + filter((Plate_ID == "VH02012942" & !(Well_ID %in% c("I23", "M23", "K23", "J23","C23"))) | + (Plate_ID == "VH02012944" & !(Well_ID %in% c("I23", "M23", "J23", "G23", "K23")))| + (Plate_ID == "VH02012956" & ! (Well_ID %in% c("I23", "J23", "O23","M23","K23")))) + + + +keep_wells <- setdiff(rownames(as_mac@meta.data), rownames(badDMSO)) + + +mac_badDSMOremoved <- as_mac[,keep_wells] + +mac_badDSMOremoved$combined_id <- str_replace_all(mac_badDSMOremoved$combined_id, "-","_") + + +``` + + +```{r} +min_sample_num <- min(table(mac_badDSMOremoved$combined_id)) +mac_badDSMOremoved <- filter_genes_by_expression(mac_badDSMOremoved, + group_by = "combined_id", min_counts =10, + min_samples = min_sample_num) + +``` + + + + + +##### Differential gene expression + +```{r} +methods <- c("limma_voom", "DESeq2", "edgeR", "limma_trend") + +five_dmso_methods_res <- lapply(methods, function(m){ + message("Processing method: ", m) + # m<-"limma_voom" + treatment_samples <- "FF_86_NH56_10" + control_samples <- "CB_43_EP73_0" + subset <- mac_badDSMOremoved%>%filter(combined_id%in%c(treatment_samples,control_samples)) + + batch <- subset$orig.ident + + badDMSO_out_top_table <- compute_single_de(subset, treatment_samples, control_samples, method = m, batch = batch) + + + # plot(plot_volcano(top_table, max.overlaps = 5)) + badDMSO_out_degs <- badDMSO_out_top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull() + length(intersect(badDMSO_out_degs, drugseq_deg)) + + badDMSO_out_top_table <- badDMSO_out_top_table %>% + arrange(p_value_adj, desc(log2FC)) %>% + mutate(gene = factor(gene, levels = unique(gene))) + # add a column if there are in the intersect(alldmso_degs, drugseq_deg) + badDMSO_out_top_table <- badDMSO_out_top_table %>% + mutate(in_drugseq_deg = ifelse(gene %in% intersect(badDMSO_out_degs, drugseq_deg), "yes", "no")) + + # label a few top overlapping genes + lab_genes <- badDMSO_out_top_table[badDMSO_out_top_table$in_drugseq_deg=="yes", ] |> + dplyr::arrange(p_value_adj, dplyr::desc(log2FC)) + + volcano_overlap <- ggplot(badDMSO_out_top_table, aes(x = log2FC, y = -log10(p_value_adj), color = in_drugseq_deg)) + + geom_point(alpha = 0.6, size = 1.2) + + geom_text_repel(data = lab_genes, aes(label = gene), size = 3, max.overlaps = 10) + + scale_color_manual(values = c("no"="#bdbdbd","yes"="#2b8cbe"))+ + theme_classic() + + #return + result_list <- list(top_table = badDMSO_out_top_table, + num_degs_macpie = length(badDMSO_out_degs), + n_overlap = length(intersect(badDMSO_out_degs, drugseq_deg)), + volcano_plot = volcano_overlap) + return(result_list) + +}) + +names(five_dmso_methods_res) <- methods + +``` + +##### Summary table + +```{r} +#get a table to show number of DEGs and number of overlapping genes with DRUGseq for each method +deg_summary <- map_df(five_dmso_methods_res, function(x) { + data.frame( + num_degs_macpie = x$num_degs_macpie, + n_overlap = x$n_overlap, + num_degs_DRUGseq = length(drugseq_deg) + ) +}, .id = paste0("macpie_methods")) + +deg_summary +``` + + +##### Overlapping volcano plot + +```{r} +five_dmso_methods_res$limma_voom$volcano_plot +five_dmso_methods_res$DESeq2$volcano_plot +five_dmso_methods_res$edgeR$volcano_plot +five_dmso_methods_res$limma_trend$volcano_plot +``` + + + + +#### macpie DEG results with 6 robust DMSO wells from DRUGseq + +From public DRUG-seq analysis pipeline, authors identified two reference controls: all DMSO wells and the ‘robust DMSO’ wells. + +We know these robust DMSO wells for batch 24 from their published data: + + - VH02012942: I23, M23 + + - VH02012944: D23, H23 + + - VH02012956: I23, J23 + + +```{r} +batch24 <- readRDS(paste0(dir,"DRUGseqData/Exp_batch24.Rds")) +names(batch24) +``` + +```{r metdata_for_batch24} +#make a combined metadata for three plates +batch24_metadata <- batch24 %>% + map_dfr(~ { + .x$Annotation %>% + mutate( + Plate_ID = plate_barcode, + Well_ID = well_id, + Barcode = paste0(plate_barcode, "_", well_index), + Row = LETTERS[row], + Column = as.integer(col), + Species = "human", + Cell_type = "U2OS", + Model_type = "2D_adherent", + Time = as.factor(hours_post_treatment), + Unit = "h", + Treatment_1 = cmpd_sample_id, + Concentration_1 = as.numeric(concentration), + Unit_1 = unit, + Sample_type = if_else(well_type == "SA" & col == 24, + "Positive Control", + well_type) + ) + }) + + +batch24_metadata <- batch24_metadata%>%select(-c(batch_id, plate_barcode,plate_index, well_id, + well_index, col, row, biosample_id, external_biosample_id, + cmpd_sample_id, well_type, cell_line_name, cell_line_ncn, concentration, unit, hours_post_treatment, Sample)) +``` + + + +```{r count_matrix_for_batch24} +# create a combined UMI matrix for 3 plates +batch24_counts <- batch24 %>% + map(~ { + .x$UMI.counts %>% + as.data.frame() %>% + rownames_to_column("gene_id") %>% + separate(col = gene_id, into = c("gene_name", "chrom"), sep = ",") %>% + mutate(gene_name = make.unique(gene_name)) %>% + select(-chrom) %>% + tibble::column_to_rownames(var = "gene_name") %>% + as.matrix() + }) + +binded_counts <- do.call(cbind, batch24_counts) + + + +``` + + + +```{r load_data_batch24} +as_mac <- CreateSeuratObject(counts = binded_counts, + min.cells = 1, + min.features = 1) + +as_mac<- as_mac%>% inner_join(batch24_metadata, by = c(".cell"="Barcode")) +``` + + + + +```{r} + +as_mac$combined_id <- paste0(as_mac$Treatment_1,"_", as_mac$Concentration_1) + +badDMSO <- as_mac@meta.data %>% filter(Treatment_1 == "CB-43-EP73") %>% + filter((Plate_ID == "VH02012942" & !(Well_ID %in% c("I23", "M23"))) | + (Plate_ID == "VH02012944" & !(Well_ID %in% c("D23", "H23")))| + (Plate_ID == "VH02012956" & ! (Well_ID %in% c("I23", "J23")))) + + + +keep_wells <- setdiff(rownames(as_mac@meta.data), rownames(badDMSO)) + + +mac_badDSMOremoved <- as_mac[,keep_wells] + +mac_badDSMOremoved$combined_id <- str_replace_all(mac_badDSMOremoved$combined_id, "-","_") + + +``` + + +```{r} +min_sample_num <- min(table(mac_badDSMOremoved$combined_id)) +mac_badDSMOremoved <- filter_genes_by_expression(mac_badDSMOremoved, + group_by = "combined_id", min_counts =10, + min_samples = min_sample_num) + +``` + + + + + + +##### Differential gene expression + + +```{r} +methods <- c("limma_voom", "DESeq2", "edgeR", "limma_trend") + +robust_dmso_methods_res <- lapply(methods, function(m){ + message("Processing method: ", m) + # m<-"limma_voom" + treatment_samples <- "FF_86_NH56_10" + control_samples <- "CB_43_EP73_0" + subset <- mac_badDSMOremoved%>%filter(combined_id%in%c(treatment_samples,control_samples)) + + batch <- subset$orig.ident + + badDMSO_out_top_table <- compute_single_de(subset, treatment_samples, control_samples, method = m, batch = batch) + + # plot(plot_volcano(top_table, max.overlaps = 5)) + badDMSO_out_degs <- badDMSO_out_top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull() + length(intersect(badDMSO_out_degs, drugseq_deg)) + + badDMSO_out_top_table <- badDMSO_out_top_table %>% + arrange(p_value_adj, desc(log2FC)) %>% + mutate(gene = factor(gene, levels = unique(gene))) + # add a column if there are in the intersect(alldmso_degs, drugseq_deg) + badDMSO_out_top_table <- badDMSO_out_top_table %>% + mutate(in_drugseq_deg = ifelse(gene %in% intersect(badDMSO_out_degs, drugseq_deg), "yes", "no")) + + # label a few top overlapping genes + lab_genes <- badDMSO_out_top_table[badDMSO_out_top_table$in_drugseq_deg=="yes", ] |> + dplyr::arrange(p_value_adj, dplyr::desc(log2FC)) + + volcano_overlap <- ggplot(badDMSO_out_top_table, aes(x = log2FC, y = -log10(p_value_adj), color = in_drugseq_deg)) + + geom_point(alpha = 0.6, size = 1.2) + + geom_text_repel(data = lab_genes, aes(label = gene), size = 3, max.overlaps = 10) + + scale_color_manual(values = c("no"="#bdbdbd","yes"="#2b8cbe"))+ + theme_classic() + + #return + result_list <- list(top_table = badDMSO_out_top_table, + num_degs_macpie = length(badDMSO_out_degs), + n_overlap = length(intersect(badDMSO_out_degs, drugseq_deg)), + volcano_plot = volcano_overlap) + return(result_list) + +}) + +names(robust_dmso_methods_res) <- methods + +``` + +##### Summary table + +```{r} +#get a table to show number of DEGs and number of overlapping genes with DRUGseq for each method +deg_summary <- map_df(robust_dmso_methods_res, function(x) { + data.frame( + num_degs_macpie = x$num_degs_macpie, + n_overlap = x$n_overlap, + num_degs_DRUGseq = length(drugseq_deg) + ) +}, .id = paste0("macpie_methods")) + +deg_summary +``` + + +##### Overlapping volcano plot + +```{r} +robust_dmso_methods_res$limma_voom$volcano_plot +robust_dmso_methods_res$DESeq2$volcano_plot +robust_dmso_methods_res$edgeR$volcano_plot +robust_dmso_methods_res$limma_trend$volcano_plot +``` + + + + +### Summary of DEG concordance + +To compare DEGs with different replicate numbers and different methods + +```{r} +methods <- c("limma_voom", "DESeq2", "edgeR", "limma_trend") + +get_jaccard <- function(deg_set, drugseq_deg){ + intersection <- length(intersect(deg_set, drugseq_deg)) + union <- length(union(deg_set, drugseq_deg)) + jaccard_index <- intersection / union + return(jaccard_index) +} + +jaccard_index <- lapply(methods, function(m){ + # all dmso + degs <- methods_res[[m]]$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull() + jaccard_all <- get_jaccard(degs, drugseq_deg) + # five dmso + degs <- five_dmso_methods_res[[m]]$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull() + jaccard_five <- get_jaccard(degs, drugseq_deg) + # three dmso + degs <- robust_dmso_methods_res[[m]]$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull() + jaccard_three <- get_jaccard(degs, drugseq_deg) + jaccard_index <- data.frame( + method = m, + jaccard_all = jaccard_all, + jaccard_five = jaccard_five, + jaccard_three = jaccard_three + ) + return(jaccard_index) +}) + +df <- as.data.frame(do.call(rbind, jaccard_index)) +rownames(df) <- df$method +df <- df %>% select(-method) +colnames(df) <- c("All DMSO", "macpie: 15 DMSO", "DRUGseq: 6 DMSO") +pheatmap::pheatmap(df, + cluster_rows = FALSE, + cluster_cols = FALSE, + display_numbers = TRUE, + main = "Jaccard Index between macpie DEGs and DRUGseq DEGs") + +``` + +#### Overlap of DEGs using all DMSO wells + + +```{r} +library(UpSetR) +all_dmso <- list( + limma_voom = methods_res$limma_voom$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), + DESeq2 = methods_res$DESeq2$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), + edgeR = methods_res$edgeR$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), + limma_trend = methods_res$limma_trend$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), + DRUGseq = drugseq_deg +) +upset(fromList(all_dmso), + nsets = 5, + order.by = "freq", + main.bar.color = "black", + sets.bar.color = "gray23", + text.scale = c(2, 2, 2, 1.5, 2, 1.5), + mainbar.y.label = "Number of common DEGs", + sets.x.label = "Number of DEGs") +``` + +#### Overlap of DEGs using 15 DMSO wells + +```{r} +five_dmso <- list( + limma_voom = five_dmso_methods_res$limma_voom$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), + DESeq2 = five_dmso_methods_res$DESeq2$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), + edgeR = five_dmso_methods_res$edgeR$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), + limma_trend = five_dmso_methods_res$limma_trend$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), + DRUGseq = drugseq_deg +) +upset(fromList(five_dmso), + nsets = 5, + order.by = "freq", + main.bar.color = "black", + sets.bar.color = "gray23", + text.scale = c(2, 2, 2, 1.5, 2, 1.5), + mainbar.y.label = "Number of common DEGs", + sets.x.label = "Number of DEGs") +``` + + +#### Overlap of DEGs using 6 DMSO wells + +```{r} +robust_dmso <- list( + limma_voom = robust_dmso_methods_res$limma_voom$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), + DESeq2 = robust_dmso_methods_res$DESeq2$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), + edgeR = robust_dmso_methods_res$edgeR$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), + limma_trend = robust_dmso_methods_res$limma_trend$top_table %>% filter(p_value_adj <0.05 & log2FC>0) %>% select(gene) %>% pull(), + DRUGseq = drugseq_deg +) +upset(fromList(robust_dmso), + nsets = 5, + order.by = "freq", + main.bar.color = "black", + sets.bar.color = "gray23", + text.scale = c(2, 2, 2, 1.5, 2, 1.5), + mainbar.y.label = "Number of common DEGs", + sets.x.label = "Number of DEGs") +``` + +From Jaccard heatmap and UpSet plots, with all 48 DMSO controls, DESeq2 and edgeR show the largest overlaps with DRGseq (Jaccard = 0.29 and 0.24; UpSet intersections in the hundreds). Limma-voom shows moderate similarity (J=0.15), and limma-trend returns a smaller set (J=0.18). Using 15 macpie-selected DMSO reduces totals and overlaps for most methods; DEseq2 and edgeR remains relatively stable (J = 0.28 and 0.24). With only 6 DRUGseq controls, the DEG sets shrink and pair-wise intersection numbers drop. Especially DEseq2 drops down to J=0.01 and limma_voom reduces to J=0.08. While edgeR remains relatively stable with J=0.23. limma-trend run yields very few DEGs under this setting. + + +*** +
+ + + + +## Conclusion + +### Why robust DMSO subsets can reduce concordance with DRUGseq DEGs? + +Possible reasons are that with fewer control samples, our group-aware filter (>= 10 counts in at least 3 samples) becomes more stringent, leading to fewer genes being tested in the DE analysis. This reduction in the number of tested genes can impact the identification of DEGs and their overlap with DRUGseq results. Additionally, having fewer control samples can increase variability in the estimates of dispersion, which can affect the statistical power to detect true DEGs. This increased variability may lead to less consistent results across different methods, thereby reducing concordance with DRUGseq DEGs. + +Robust DMSO are selected based on their similarity in overall expression profiles, which may slightly shift normalisation and the mean-variance trend when estimating by TMM/TMMwsp. + + +### Minimal workflow we recommend for macpie DEG analysis + +1. QC and filtering: + +- Use group-aware filtering `filter_genes_by_expression` to retain genes with consistent expression within each treatment. + +- Examine any batch/unwanted variation, or potential outliers in the data. + +2. Check zero-inflation: + +- Run `check_zeroinflation` before and after filtering to ensure that zero-inflation is minimized. + +- Decision heuristic: for example, if observed zero proportions are significantly higher than expected under NB after filtering, consider using methods that account for zero-inflation. + +3. Pick controls consciously: + +- Start with all available controls and assess any potential outliers. + +4. Choose a DE method: + +- If zero-inflation is present, use a method that accounts for it (`compute_single_de` edgeR with ZINB-WaVE weights). + +- If zero-inflation is not a concern, + + - edgeR (QLFit) and limma-voom (with TMMwsp): both methods are designed to account for gene-specific variability and handle heteroscedasticity effectively. + + - limma-trend: remains suitable for uniformly sequenced experiments as it assumes similar library sizes/sequencing depth. + + - DESeq2: strong shrinkage & automatic independent filtering. + +- If batch/unwanted variation, + + - include them in the design (~ batch + condition) or adjust with limma’s removeBatchEffect / edgeR design. + + - if strong, apply RUVseq (e.g. RUVg with empirical negative controls or RUVs with replicate samples) before DE. Re-compute normalization after RUV and re-fit your DE model. + + +*** +
diff --git a/vignettes/macpie_short.Rmd b/vignettes/macpie_short.Rmd deleted file mode 100644 index d71eb15..0000000 --- a/vignettes/macpie_short.Rmd +++ /dev/null @@ -1,408 +0,0 @@ ---- -title: "macpie: overview" -output: - rmarkdown::html_document: - theme: flatly - toc: true - toc_float: true - toc_depth: 5 -vignette: > - %\VignetteIndexEntry{overview} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} - %\VignetteBuild{true} ---- - - -```{r setup, include = FALSE} -# keep vignette light & deterministic on CI -is_ci <- identical(Sys.getenv("CI"), "true") -if (is_ci) { - # Disable parallelization so nothing is exported to workers - if (requireNamespace("future", quietly = TRUE)) { - future::plan("sequential") - } - options(mc.cores = 1) -} -knitr::opts_chunk$set(message = FALSE, warning = FALSE) -``` - - - -```{r, include = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>", - warning = FALSE, - message = FALSE -) -library(dplyr) -library(stringr) # str_c -library(ggplot2) # ggplot, geom_point, etc. -library(tidyr) # pivot_wider -library(tibble) # column_to_rownames -library(ggiraph) # girafe() -library(pheatmap) # pheatmap() -library(enrichR) # enrichr, plotEnrich -library(gridExtra) # grid.arrange -library(Seurat) -library(ggrepel) -``` - -## Introduction - -MAC-seq is a cost-effective, high-throughput transcriptomic platform, developed as a collaboration between Victorian Centre for Functional Genomics (VCFG) and Molecular Genomics Core (MGC) facilities at Peter MacCallum Cancer Centre (Peter Mac), primarily designed for small molecule screening. However, its versatility extends beyond this application, thanks to its integration with high-throughput microscopy and 3D cell culturing techniques. - -macpie is a toolkit designed for researchers, originally with MAC-seq data in mind, but validated for general High-Throughput Transcriptomics (HTTr) data applications. Its primary aim is to deliver the latest tools for quality control (QC), visualization, and analysis. macpie is a result of a collaborative effort by a workgroup at the PeterMac, with a substantial support of the VCFG amd MGC core facilities. - -In this vignette we cover the basic functionality of macpie, from input, quality control to transcriptional and screen-related analyses. For more in-depth workflows, please refer to other vignettes at https://pmcc-bioinformaticscore.github.io/macpie/articles/macpie.html. - -### 1. Metadata import and QC - -**Key points**: - - - metadata has to contain at least columns **Plate_ID**, **Well_ID**, **Row**, **Column**, **Species**, **Sample_type**, **Treatment_1**, **Concentration_1** and **Barcode**. - -Metadata has to be in a tabular format and contain a standardised set of columns to define coordinates of a sample on a plate, provide minimum information about the sample, and allow connection with the transcriptomic data through the sample barcodes. - -To ensure the integrity of metadata for future analyses, we provide the user with a set of tools to verify metadata consistency and visualize the key variables, described in more depth in the [QC vignette](articles/quality_control.html). We will first visually inspect all experimental variables, in order to identify potential artefacts. - -```{r set_wd, include=FALSE} -dir <- "/Users/liuxin/macpie_Dev/" -devtools::load_all(paste0(dir, "macpie/")) -``` - -```{r metadata_plot, fig.width = 8, fig.height = 6} -#install.packages("macpie") # or devtools::install_github("PMCC-BioinformaticsCore/macpie") -library(macpie) -library(enrichR) -library(randomForest) -library(pheatmap) - -# Load metadata -project_metadata <- system.file("extdata/PMMSq033_metadata.csv", package = "macpie") - -# Load metadata -metadata <- read_metadata(project_metadata) -plot_metadata_heatmap(metadata) -``` - -> **Key Lessons for Robust Experimental Design** -> -> - Based on our experience, the specific metadata you need will vary greatly with your experiment's design. Here are some crucial lessons we've learned to help you achieve reliable results: -> -> - Plate Layout Matters: Always place replicate sample wells on the same assay plate, not across multiple plates. -> -> - Minimum Replicates: Aim for a minimum of 3 replicates per condition to ensure statistical robustness. -> -> - Strategic Negative Controls: For negative control wells, we recommend including 10 wells randomized across the same assay plate. This provides a more robust baseline. - - - - -### 2. Sequencing data import and QC - -**Key points**: - - - pay special attention to removal of lowly expressed genes and then: - - use **plot_plate_layout** to check plate-level effects (edge vs centre, between plates) - - use **plot_mds** to check sample grouping (umap/pca is also available using Seurat's functions) - - use **compute_qc_metrics**, **plot_qc_metrics_heatmap**, and **plot_distance** to check sample variability and outliers - - use **plot_rle** to check any row/column/plate effects and compare normalization methods - - more detailed methods avaailable in vignette [Quality control](articles/quality_control.html) - -#### 2.1 Import data to tidySeurat object - -**Data access**: - - - load your raw counts by providing Read10X function with a path to a directory containing matrix.mtx.gz, barcodes.tsv.gz, and features.tsv.gz, commonly "raw_matrix" from CellRanger or StarSolo output - - the full manuscript dataset (>10 MB of `.rds` files) is publicly available at https://zenodo.org/records/15778812. - - use data(mini_mac) to load a subsample (308 samples, 200 genes) of the full dataset, as in the commented code below - -```{r load_read_data} -## 1. Load your own gene counts per sample or 2. data from the publication -#project_rawdata <- "raw_matrix" -#project_name <- "PMMSq033" -#raw_counts <- Read10X(data.dir = project_rawdata) -# -## Create tidySeurat object -#mac <- CreateSeuratObject(counts = raw_counts, -# project = project_name, -# min.cells = 1, -# min.features = 1) - -# 3. Alternatively, load a pre-made example: -data(mini_mac) -mac <- mini_mac - -# Join gene counts per sample with metadata (if not already included) -if (nrow(mac[[]])==0) { - mac <- mac %>% - inner_join(metadata, by = c(".cell" = "Barcode")) -} - -# Create unique identifier for your treatments based on metadata -mac <- mac %>% - mutate(combined_id = str_c(Treatment_1, Concentration_1, sep = "_")) %>% - mutate(combined_id = gsub(" ", "", .data$combined_id)) %>% - mutate(combined_id = make.names(combined_id)) - -# # Filter by read count per sample group -mac <- filter_genes_by_expression(mac, - group_by = "combined_id", - min_counts = 1, - min_samples = 1) -``` - -#### 2.2 Basic QC metrics - -macpie allows you to use the common QC plots from the Seurat package to visualise the number of genes, reads, and percentage of mitochondrial and ribosomal genes per sample. -```{r violin_plot, fig.width = 8, fig.height = 4} -# Calculate the percent of mitochondrial and ribosomal genes -mac[["percent.mt"]] <- PercentageFeatureSet(mac, pattern = "^mt-|^MT-") -mac[["percent.ribo"]] <- PercentageFeatureSet(mac, pattern = "^Rp[slp][[:digit:]]|^Rpsa|^RP[SLP][[:digit:]]|^RPSA") - -# Example of a function from Seurat quality control -VlnPlot(mac, features = c("nFeature_RNA", "nCount_RNA", "percent.mt", "percent.ribo"), - ncol = 4, group.by = "Sample_type") & - scale_fill_manual(values = macpie_colours$discrete) -``` - -In addition, you can apply tidyverse functions to further explore the dataset. For example, let's subset the Seurat object based on the column "Project" in the metadata and visualise the grouping of data on the plate vs on an MDS plot. Plate layout plots provide an interactive way to inspect spatial patterns across a plate, helping to identify anomalies or unexpected trends. When hovering over the plot, sample groups are automatically highlighted to aid interpretation. - -```{r subset_seurat, fig.width = 8, fig.height = 3} -unique(mac$Project) -mac <- mac %>% - filter(Project == "Current") - -# Interactive QC plot plate layout (all metadata columns can be used): -p <- plot_plate_layout(mac, "nCount_RNA", "combined_id") -girafe(ggobj = p, - fonts = list(sans = "sans"), - options = list( - opts_hover(css = "stroke:black; stroke-width:0.8px;") # <- slight darkening - )) -``` - -##### 2.2.1 Sample grouping with MDS plot - -In order to assess sample grouping, we should visualise sample similarity based on the limma's MDS (MultiDimensional Scaling) function. Samples that are treated with a lower concentration of compound will often cluster close to the negative (vehicle) control. Hovering over the individual dots reveals sample identity and grouping. - -```{r mds_plot, fig.width = 8, fig.height = 6} -p <- plot_mds(mac, group_by = "Sample_type", label = "combined_id", n_labels = 30) -girafe(ggobj = p, fonts = list(sans = "sans")) - -``` - -##### 2.2.2 Distribution of read counts - -Visualising distribution of read counts across treatments is an easy way to compare the effects of treatments and estimate sample variability. Read count is commonly directly proportional to the number of cells. - -```{r qc_stats_plot, fig.width = 8, fig.height = 4} -qc_stats <- compute_qc_metrics(mac, group_by = "combined_id", order_by = "median") -``` - -##### 2.2.3 Variability among all replicates - -In relation to the previous plot, we want the user to have the ability to quantify the dispersion of reads between sample replicates. Therefore, we provide access to several statistical metrics such as standard deviation (sd_value), z score (z_score), mad (mad_value) and IQR (IQR) which can be used as a parameter to the function plot_qc_metrics individually, or assessed at once with the function plot_qc_metrics_heatmap. - -```{r qc_metrics, fig.width = 8, fig.height = 4} -plot_qc_metrics_heatmap(qc_stats$stats_summary) -``` - -Identifying outliers and batch effects, especially in the untreated samples, is especially important for downstream analysis. For further statistical methods to quantify variability among sample groups and inter-replicate variability please refer to vignette [Quality control](articles/quality_control.html). - -##### 2.2.4 Correction of the batch effect - -Several methods are available for scaling and normalizing transcriptomic data, with their effects most clearly visualized using RLE (Relative Log Expression) plots. In our case, limma_voom provides the lowest average coefficient of variation, when compared to other methods such as "raw", Seurat "SCT" or "edgeR". - -```{r plot_rle, fig.width = 8, fig.height = 4} -# First we will subset the data to look at control, DMSO samples only -mac_dmso <- mac %>% - filter(Treatment_1 == "DMSO") - -# Run the RLE function -plot_rle(mac_dmso, label_column = "Row", normalisation = "limma_voom") -``` - - -### 3. Differential gene expression - -**Key points**: - - - use **compute_single_de** to perform a differential expression analysis for one treatment group vs control - - use **compute_multi_de** to perform differential expression analyses for all treatment groups vs control - - use volcano plot, box plot and heatmap to show results from the analyses and visualise gene expression levels - - use **enrichr** for pathway enrichment analysis - - more detailed methods available in vignette [Transcriptional analyses](articles/transcriptional_analyses.html) - - -#### 3.1. Single comparison - -Similar to RNA-seq, the quality of differential gene expression analysis in MAC-seq depends on low variability among replicates and the suitability of the statistical model. These aspects are assessed during the quality control stage of the workflow. - -Results of differential expression analysis are classically visualised with a volcano plot. - - -```{r de_analysis, fig.width = 8, fig.height = 6} -# First perform the differential expression analysis -treatment_samples <- "Staurosporine_10" -control_samples <- "DMSO_0" -top_table <- compute_single_de(mac, treatment_samples, control_samples, method = "limma_voom") - -# Let's visualise the results with a volcano plot -plot_volcano(top_table, max.overlaps = 16) -``` - -Based on the results, we can quickly check gene expression levels in counts per million (CPM) for selected genes between treatment and control samples as described below. - -```{r plot_counts, fig.width = 8, fig.height = 6} -genes <- top_table$gene[1:6] -group_by <- "combined_id" -plot_counts(mac, genes, group_by, treatment_samples, control_samples, normalisation = "cpm", color_by = "combined_id") -``` - -#### 3.2. Pathway analysis - -Differential gene expression results for individual comparisons of treatment vs control are usually performed with functions from package enrichR and fgsea. In the following case, the effect of Staurosporine on breast cancer cells through Myc inactivation can be observed through pathway enrichment analyses. - -```{r pathway_analysis_single, fig.width = 8, fig.height = 4} -top_genes <- top_table %>% - filter(p_value_adj < 0.05) %>% - select(gene) %>% - pull() - -enriched <- enrichR::enrichr(top_genes, c("MSigDB_Hallmark_2020")) -p1 <- enrichR::plotEnrich(enriched[[1]]) + - macpie_theme(legend_position_ = 'right') + - scale_fill_gradientn(colors = macpie_colours$divergent) - -gridExtra::grid.arrange(p1, ncol = 1) -``` - -#### 3.3. Differential gene expression - multiple comparisons - -Since MAC-seq is commonly used for high-throughput screening of compound libraries, we often want to compare multiple samples in a screen vs the control. This process can easily be parallelised. First we select a vector of "treatments" as combined_ids that do not contain the word "DMSO". (Warning, due to the limitations of "mclapply", parallelisation speedup currently only works on OSX and Linux machines, and not on Windows.) - -```{r de_multi, fig.width = 8, fig.height = 5} -# Filter out lower concentrations of compounds and untreated samples -treatments <- mac %>% - filter(Concentration_1 == 10) %>% - select(combined_id) %>% - filter(!grepl("DMSO", combined_id)) %>% - pull() %>% - unique() -mac <- compute_multi_de(mac, treatments, control_samples = "DMSO_0", method = "limma_voom", num_cores = 1) -``` - - -We often want to ask which genes are differentially expressed in more than one treatment group. - -Here, we can visualise treatment groups with shared differentially expressed genes, defined as the top 5 DE genes from each single drug comparison (treatment vs control) that are found in at least 2 different treatment groups. - -The heatmap below shows shared differentially expressed genes with corresponding log2FC values. - -```{r plot_multi_de, fig.width=10, fig.height=6} -plot_multi_de(mac, group_by = "combined_id", value = "log2FC", p_value_cutoff = 0.01, direction="up", n_genes = 5, control = "DMSO_0", by="fc") -``` - -#### 3.4. Pathway analysis - multiple comparisons - -The pathway enrichment analysis is done with R package enrichR, and can be summarised with a heatmap, visualising direct and offtarget effects of the perturbations. - -```{r enriched_pathways, fig.width = 8, fig.height = 12} -# Load genesets from enrichr for a specific species or define your own -enrichr_genesets <- download_geneset("human", "MSigDB_Hallmark_2020") -mac <- compute_multi_enrichr(mac, genesets = enrichr_genesets) - -enriched_pathways_mat <- Tool(mini_mac, slot = "pathway_enrichment") %>% - bind_rows() %>% - select(combined_id, Term, Combined.Score) %>% - pivot_wider(names_from = combined_id, values_from = Combined.Score) %>% - column_to_rownames(var = "Term") %>% - mutate(across(everything(), ~ ifelse(is.na(.), 0, log1p(.)))) %>% # Replace NA with 0 across all columns - as.matrix() - -pheatmap(enriched_pathways_mat, color = macpie_colours$continuous_rev) -``` - -### 4. Methods for compound screening - -**Key points**: - - - use **plot_de_umap** to find compounds that behave similarly based on proximity on UMAP maps - - use **compute_single_dose_response** to evaluate impact of compound concentrations on - gene expression or pathway enrichment - - use **compute_multi_screen_profile** to find perturbations similar to your target profile or - a known gene set - - more detailed methods avaailable in vignette [Compound screening](articles/compound_screening.html) - -#### 4.1. UMAP clustering based on DE genes - -Instead of plotting UMAP on individual samples, we can also visualise the samples on UMAP of differential gene expression vs control. This allows us to use batch-corrected data and reduce replicate noise, while showing the grouping of treatments. - -```{r compute_de_umap, fig.width = 8, fig.height = 5} -mac_agg <- aggregate_by_de(mac) -mac_agg <- compute_de_umap(mac_agg) -mac_agg <- FindNeighbors(mac_agg, reduction = "umap_de", dims = 1:2) -mac_agg <- FindClusters(mac_agg, resolution = 1.3) - -cell_coords <- Embeddings(mac_agg, reduction = "umap_de") %>% - as.data.frame() %>% - rownames_to_column("combined_id") %>% - left_join(mac_agg[[]], by = "combined_id") - -# Plot with clusters and labels -ggplot(cell_coords, aes(x = UMAPde_1, y = UMAPde_2, color = seurat_clusters)) + - geom_point(size = 2) + - geom_text_repel(aes(label = combined_id), size = 3, max.overlaps = 10, force_pull = 1) + - theme_minimal() + - guides(color = guide_legend(title = "Cluster")) + - labs(x = "UMAP 1", y = "UMAP 2", title = "UMAP with Cell Names") + - theme(legend.position = "right") -``` - -#### 4.2. Similarity to a treatment profile or phenotype - -Additionally, when performing a screen, sometimes we want to measure similarity to either an existing profile, or to a user-defined gene-set that defines a desired phenotype. - -```{r compute_multi_screen_profile, fig.width = 8, fig.height = 5} -mac <- compute_multi_screen_profile(mac, target = "Staurosporine_10", n_genes_profile = 50, direction = "down", num_cores = 1) -mac_screen_profile <- Tool(mac, slot = "screen_profile") %>% - mutate(logPadj = c(-log10(padj))) %>% - arrange(desc(NES)) %>% - mutate(target_id = factor(target_id, levels = unique(target_id))) - -ggplot(mac_screen_profile, aes(target_id, NES)) + - #geom_point(aes(size = logPadj)) + - geom_point() + - facet_wrap(~pathway, scales = "free") + - macpie_theme(x_labels_angle = 90, show_x_title = FALSE) -``` - -#### 4.3. Estimate of dose-response - -macpie can be used to calculate dose-response curves for individual genes, pathways or any other external measurement such as cell viability that is available in your metadata, based on the R package drc These are also available in a paralelisable format with the function “compute_multiple_dose_response”. - -```{r compute_EC50_curves, fig.width = 6, fig.height = 3} -treatments <- mac %>% - select(combined_id) %>% - filter(!grepl("DMSO", combined_id)) %>% - pull() %>% - unique() -mac <- compute_multi_de(mac, treatments, control_samples = "DMSO_0", method = "limma_voom", num_cores = 1) -mac <- compute_multi_enrichr(mac, genesets = enrichr_genesets) -res <- compute_single_dose_response(data = mac, - gene = "DIP2C", - normalisation = "limma_voom", - treatment_value = "Staurosporine") -res$plot -res <- compute_single_dose_response(data = mac, - pathway = "p53 Pathway", - treatment_value = "Camptothecin") -res$plot -``` - -```{r session-info, echo=FALSE} -sessionInfo() -``` \ No newline at end of file