## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ----echo = TRUE, results = "hide", message = FALSE--------------------------- ##Install FLASHMM from CRAN. # install.packages("FLASHMM") ##Install FLASHMM from Github. # devtools::install_github("https://github.com/Baderlab/FLASHMM") ##Load the package. library(FLASHMM) ## ----Simulate dataset, echo = TRUE, message = FALSE--------------------------- ##Generate the scRNA-seq data by simuRNAseq function. set.seed(2503) dat <- simuRNAseq(nGenes = 100, nCells = 100000, nsam = 25, ncls = 10, ntrt = 2, nDEgenes = 10) names(dat) dim(dat$counts) head(dat$metadata) ##Samples are nested in one treatment A or B. table(dat$metadata[, c("trt", "sam")]) ## ----Expression matrix, echo = TRUE, message = FALSE, tidy = TRUE, tidy.opts = list(width.cutoff = 90)---- ##Gene expression matrix, Y = log2(1 + counts) Y <- log2(1 + dat$counts) ##Since the simulated data contains no covariates other than libsize (library size) and cls (clsters or cell types), the fixed effects are specified as follows. fixed <- ~ 0 + log(libsize) + cls + cls:trt X <- model.matrix(fixed, data = dat$metadata) ##The single-component random effects are specified by randomS <- ~ 0 + as.factor(sam) Z1 <- model.matrix(randomS, data = dat$metadata) ##Two-component random effects: Suppose the data contains different measurement locations within a sample, denoted as 'location', which are randomly generated. To account for the variability of the locations, we may use the LMM with two-component random effects described below. set.seed(250801) dat$metadata$location <- sample(LETTERS[1:20], nrow(X), replace = TRUE) randomL <- ~ 0 + location Z2 <- model.matrix(randomL, data = dat$metadata) #Za <- cbind(Z, Z2) #two-component random-effects design matrix #da <- c(ncol(Z), ncol(Z2)) #dimension ## ----LMM fitting, echo = TRUE, message = FALSE, warning = FALSE--------------- ##Fit the LMM with one-component random effects. ##Option 1: Fit the LMM using model formulas based on cell-level data. fit1 <- flashmm(Y, fixed, random = randomS, metadata = dat$metadata, is.counts = FALSE) names(fit1) ##Option 2: Fit the LMM using model design matrices based on cell-level data. fit2 <- lmmfit(Y, X, Z = Z1, d = ncol(Z1)) identical(fit1, fit2) ##Option 3: Fit the LMM based on summary-level data. ##Step 1: computing the summary-level data #n <- nrow(X); XX <- t(X)%*%X; XY <- t(Y%*%X) #ZX <- t(Z)%*%X; ZY <- t(Y%*%Z); ZZ <- t(Z)%*%Z #Ynorm <- rowSums(Y*Y) ##or computed by sslmm function: ss <- sslmm(X, Y, Z = Z1) XX <- ss$XX; XY <- ss$XY ZX <- ss$ZX; ZY <- ss$ZY; ZZ <- ss$ZZ n <- ss$n; Ynorm <- ss$Ynorm d1 <- ncol(Z1) ##Step 2: fitting the LMM using lmm function fit3 <- lmm(XX, XY, ZX, ZY, ZZ, Ynorm = Ynorm, n = n, d = d1) identical(fit2, fit3) ##Fit the LMM with two-component random effects. fit2 <- flashmm(Y, fixed, random = list(randomS, randomL), metadata = dat$metadata) fit3 <- lmmfit(Y, X, Z = cbind(Z1, Z2), d = c(ncol(Z1), ncol(Z2))) identical(fit2, fit3) ## ----echo = TRUE, message = FALSE, warning = FALSE---------------------------- ##Check the genes for which LMM fitting converges. fit1$epsilon convg <- (apply(abs(fit1$dlogL), 2, max) < fit1$epsilon) sum(convg) ## ----------------------------------------------------------------------------- ##Testing coefficients (fixed effects) test <- lmmtest(fit1) ##The t-value and p-values are identical with those provided in the LMM fit. range(test - cbind(t(fit1$coef), t(fit1$t), t(fit1$p))) fit1$coef[, 1:4] #fit1$t[, 1:4] #fit1$p[, 1:4] ## ----echo = TRUE, message = FALSE, warning = FALSE---------------------------- ##The DE genes specific to a cell type ##Coefficients, t-values, and p-values for the genes specific to a cell type. index <- grep(":", rownames(fit1$coef)) ce <- fit1$coef[index, ] tv <- fit1$t[index, ] pv <- fit1$p[index, ] out <- data.frame( gene = rep(colnames(ce), nrow(ce)), cluster = rep(rownames(ce), each = ncol(ce)), coef = c(t(ce)), t = c(t(tv)), p = c(t(pv))) ##FDR. out$FDR <- p.adjust(out$p, method = "fdr") ##The DE genes with FDR < 0.05 and abs(logFC) > 0.5 out <- out[order(out$p), ] rownames(out) <- NULL out[(out$FDR < 0.05) & (abs(out$coef) > 0.5) , ] ## ----------------------------------------------------------------------------- ##Construct the contrast. contrast <- cbind("BvsA" = numeric(nrow(fit1$coef))) index <- grep(":", rownames(fit1$coef)) contrast[index, ] <- 1/length(index) length(index) ##Test the contrast. test <- lmmtest(fit1, contrast = contrast) head(test) ## ----------------------------------------------------------------------------- ncls <- 10 #length(index) sumeff <- paste0(paste0("cls", 1:ncls, ":trtB"), collapse = "+") sumeff <- paste0("(", sumeff, ")/", ncls) #sumeff contrast <- contrast.matrix( contrast = c(BvsA = sumeff), model.matrix.names = rownames(fit1$coef)) test <- lmmtest(fit1, contrast = contrast) head(test) ## ----------------------------------------------------------------------------- ##Using z-test for testing variance components i <- grep("var1", rownames(fit1$theta)) #The (first) variance component z <- fit1$theta[i, ]/fit1$se.theta[i, ] #z-statistics ##One-sided z-test p-values for hypotheses: ##H0: theta <= 0 vs H1: theta > 0 p <- pnorm(z, lower.tail = FALSE) ##Number of significant p-values sum(p < 0.05/length(p)) #Bonferroni correction ## ----------------------------------------------------------------------------- ##(1) z-test for testing the second variance component ##Z-statistics for the second variance component i <- grep("var2", rownames(fit2$theta)) z <- fit2$theta[i, ]/fit2$se.theta[i, ] ##One-sided z-test p-values for hypotheses: ##H0: theta <= 0 vs H1: theta > 0 p <- pnorm(z, lower.tail = FALSE) ##number of significant p-values sum(p.adjust(p, method = "fdr") < 0.05) sum(p < 0.05/length(p)) #Bonferroni correction ##(2) LRT for testing the second variance component LRT <- 2*(fit2$logLik - fit1$logLik) pLRT <- pchisq(LRT, df = 1, lower.tail = FALSE) ##number of significant p-values sum(p.adjust(pLRT, method = "fdr") < 0.05) sum(pLRT < 0.05/length(pLRT)) #Bonferroni correction ##QQ-plot qqplot(runif(length(p)), p, xlab = "Uniform quantile", ylab = "Z-test p-value", col = "blue") abline(0, 1, col = "gray") qqplot(runif(length(pLRT)), pLRT, xlab = "Uniform quantile", ylab = "LRT p-value", col = "blue") abline(0, 1, col = "gray") ## ----LMM_ML, echo = TRUE, message = FALSE, warning = FALSE-------------------- ##Fitting LMM using ML method #fit3 <- lmmfit(Y, X, Z, d = d1, method = "ML") fit3 <- lmm(XX, XY, ZX, ZY, ZZ, Ynorm = Ynorm, n = n, d = d1, method = "ML") ##The DE genes specific to a cell type ##Coefficients, t-values, and p-values index <- grep(":", rownames(fit3$coef)) ce <- fit3$coef[index, ] tv <- fit3$t[index, ] pv <- fit3$p[index, ] out <- data.frame( gene = rep(colnames(ce), nrow(ce)), cluster = rep(rownames(ce), each = ncol(ce)), coef = c(t(ce)), t = c(t(tv)), p = c(t(pv))) ##The DE genes with FDR < 0.05 and abs(logFC) > 0.5 out$FDR <- p.adjust(out$p, method = "fdr") #FDR out <- out[order(out$p), ] rownames(out) <- NULL out[(out$FDR < 0.05) & (abs(out$coef) > 0.5) , ] ## ----echo = TRUE, message = TRUE---------------------------------------------- sessionInfo()