library(grid) # grid.text() library(latticeExtra) # useOuterStrips() whereAmI <- "/Users/jenny/teaching/2011/STAT545A/examples/yeastDeletionGrowth/" ## yeast deletion growth data hDat <- read.delim(jPaste(whereAmI, "data/hDat.txt")) str(hDat) # 5521 obs. of 4 variables ## focusing on two chromosomes jChromo <- c(10, 11) kDat <- droplevels(subset(hDat, chromo %in% jChromo)) str(kDat) # 627 obs. of 4 variables: (chromoCounts <- table(kDat$chromo)) ## 10 11 ## 325 302 (n <- nrow(kDat)) # 627 obs ## getting a visual overview of the data ## note: example of placing text in the panel ## commented out code might be useful for understanding what I'm doing ## and seeing alternative ways to get same result densityplot(~ pheno | chromoPretty, kDat, xlab = "Growth phenotype", panel = function(x, ...) { panel.densityplot(x, ...) ## put pre-computed info on the plot ## or sthg that can be computed on-the-fly ## grid.text(paste("packet.number() returns ", packet.number()), ## grid.text(paste(chromoCounts[packet.number()], 'genes'), grid.text(paste(length(x), 'genes'), x = 0.1, y = 0.9, just = c("left", "center"), gp = gpar(fontfamily = "sans")) }) dev.print(pdf, jPaste(whereAmI, "figs/phenoDensityplotPanelChr", jChromo[1], "Chr", jChromo[2],".pdf"), width = 8, height = 6) ## consider the (non-standard) test statistic ## | avg(x) - avg(y) | ## NOTE: I'm writing code to explain the bootstrap now, ## not to showcase elegance or efficiency. ## compute the observed test statistic (chromoMeans <- with(kDat, tapply(pheno, chromo, mean))) ## 10 11 ## 8.943558 9.203379 (obsTestStat <- abs(chromoMeans[1] - chromoMeans[2])) ## 0.2598215 ## note:example of adding a line and text densityplot(~ pheno | chromoPretty, kDat, xlab = "Growth phenotype", layout = c(1,2), panel = function(x, ...) { panel.densityplot(x, ...) mu <- mean(x) panel.abline(v = mu, lty = 'dotted') if(packet.number() == 1) { avgText <- bquote(bar(x) == .(round(mu, 2))) } else { avgText <- bquote(bar(y) == .(round(mu, 2))) } grid.text(avgText, x = 0.1, y = 0.9, just = c("left", "center")) }) dev.print(pdf, jPaste(whereAmI, "figs/phenoDensityplotPanelChr", jChromo[1], "Chr", jChromo[2],"WithMean.pdf"), width = 5, height = 8) ## enter the world of the null hypothesis ## generate one bootstrap sample set.seed(12) z <- kDat$pheno xStar <- sample(z, size = chromoCounts[1], replace = TRUE) yStar <- sample(z, size = chromoCounts[2], replace = TRUE) c(mean(xStar), mean(yStar)) ## [1] 8.980664 9.062216 (bootTestStat <- abs(mean(xStar) - mean(yStar))) ## 11 ## 0.08155161 plot1 <- densityplot(~pheno, kDat) ## easier to plot if I store the bootstrap data set differently, ## i.e. "stack it" into a data.frame bootDat <- with(kDat, data.frame(pheno = c(xStar, yStar), chromoPretty)) plot2 <- densityplot( ~ pheno | chromoPretty, bootDat, xlab = "Bootstrap data", layout = c(1, 2), panel = function(x, ...) { panel.densityplot(x, ...) mu <- mean(x) panel.abline(v = mu, lty = "dotted") if(packet.number() == 1) { avgText <- bquote(bar(x) == .(round(mu, 2))) } else { avgText <- bquote(bar(y) == .(round(mu, 2))) } grid.text(avgText, x = 0.1, y = 0.9, just = c("left", "center")) }) print(plot1, pos = c(0, 0.25, 0.5, 0.75), more = TRUE) print(plot2, pos = c(0.5, 0, 1, 1), more = FALSE) dev.print(pdf, jPaste(whereAmI, "figs/phenoDensityplotBootDataExampleTiny.pdf"), width = 8, height = 8) ## ten bootstrap samples ## NOTE: I do not recommend an explicit for loop in real life! Doing ## here just for clarity! set.seed(23) B <- 10 bootTestStat <- rep(NA, B) for(i in 1:B) { xStar <- sample(z, size = chromoCounts[1], replace = TRUE) yStar <- sample(z, size = chromoCounts[2], replace = TRUE) bootTestStat[i] <- abs(mean(xStar) - mean(yStar)) } bootTestStat ## [1] 0.007943383 0.109329811 0.138009880 0.127795231 0.261677793 0.039274346 ## [7] 0.090984481 0.018088425 0.035747594 0.007489110 mean(bootTestStat >= obsTestStat) ## [1] 0.1 densityplot(~ bootTestStat, xlab = "bootstrap test stats = |avg x* - avg y*|", panel = function(x, ...) { panel.densityplot(x, ...) panel.abline(v = obsTestStat, lty = 'dotted') grid.text(paste("bootstrap p-value =", round(mean(bootTestStat >= obsTestStat), 2)), x = 0.9, y = 0.9, just = c("right", "center")) }) dev.print(pdf, jPaste(whereAmI, "figs/densityplotBootTestStatAbsMeanDiffTiny.pdf"), width = 8, height = 6) ## increasing B, putting more emphasis on how I would implement the ## bootstrap in actual practice set.seed(101) B <- 10000 bootDat <- matrix(sample(kDat$pheno, size = B * n, replace = TRUE), nrow = n, ncol = B) str(bootDat) ## num [1:627, 1:10000] 9.73 6.15 9.96 9.4 9.46 ... ## let's look at the first 3 bootstrap datasets ## good example of data reshaping for the purposes of plotting bootDatToPlot <- stack(as.data.frame(bootDat[ ,1:3])) str(bootDatToPlot) ## 'data.frame': 1881 obs. of 2 variables: ## $ values: num 9.73 6.15 9.96 9.4 9.46 ... ## $ ind : Factor w/ 3 levels "V1","V2","V3": 1 1 1 1 1 1 1 1 1 1 ... bootDatToPlot <- # tidy up data.frame(chromoPretty = kDat$chromoPretty, # exploit recycling bootIter = unclass(bootDatToPlot$ind), pheno = bootDatToPlot$values) peek(bootDatToPlot) jPlot <- densityplot(~ pheno | factor(bootIter) * chromoPretty, bootDatToPlot, xlab = "(Bootstrap) Growth phenotype", layout = c(2,3), panel = function(x, ...) { panel.densityplot(x, ...) mu <- mean(x) panel.abline(v = mu, lty = 'dotted') grid.text(bquote(mu == .(round(mu, 2))), x = 0.1, y = 0.9, just = c("left", "center")) }) useOuterStrips(jPlot) dev.print(pdf, jPaste(whereAmI, "figs/phenoDensityplotBootDataExample.pdf"), width = 8, height = 6) ## function to compute my test statistic computeAbsDiffMeans <- function(jResp, jFact) { groupMeans <- tapply(jResp, jFact, mean) return(abs(groupMeans[1] - groupMeans[2])) } ## applying that function to the observed data to make sure I get same ## value as before obsTestStat # 0.2598215 (obsTestStat <- with(kDat, computeAbsDiffMeans(pheno, chromo))) ## 0.2598215 .... YES, AS BEFORE ## applying that function to each boostrap dataset ## note use of system.time to see how something takes system.time( bootTestStats <- apply(bootDat, 2, computeAbsDiffMeans, jFact = kDat$chromo) ) # 7 - 8 seconds for me bootTestStats[1:3] ## 0.17465224 0.06890944 0.03129661 ## bootstrap p-value mean(bootTestStats >= obsTestStat) ## [1] 0.0172 t.test(pheno ~ chromo, kDat)$p.value ## [1] 0.01940612 wilcox.test(pheno ~ chromo, kDat)$p.value ## [1] 0.001156313 ## empirical distribution of these bootstrap test stats densityplot( ~ bootTestStats, xlab = expression(group("|", bar(x) - bar(y),"|")), main = "Bootstrap test statistics", plot.points = FALSE, n = 200, ref = TRUE, panel = function(x, ...) { panel.densityplot(x, ...) panel.abline(v = obsTestStat, lty = 'dotted') }) dev.print(pdf, jPaste(whereAmI, "figs/densityplotBootTestStatAbsMeanDiff.pdf"), width = 8, height = 6) ## using the bootstrap data to get empirical null dist'ns for other ## test stats ## | median(x) - median(y) | computeAbsDiffMedians <- function(jResp, jFact) { groupMedians <- tapply(jResp, jFact, median) return(abs(groupMedians[1] - groupMedians[2])) } ## | Welch's t statistic | computeAbsWelch <- function(jResp, jFact) { return(abs(t.test(jResp ~ jFact)$statistic)) } ## Welch's t statistic computeWelch <- function(jResp, jFact) { return(t.test(jResp ~ jFact)$statistic) } ## Kolmogorov-Smirnov test stat computeKs <- function(jResp, jFact) { jList <- split(jResp, jFact) return(ks.test(x = jList[[1]], y = jList[[2]])$statistic) } ## Wilcoxon test stat computeWilcoxon <- function(jResp, jFact) { return(wilcox.test(jResp ~ jFact)$statistic) } ## using these various test statistics with the bootstrap data ## each statement might take up to ~10 secs bootAbsDiffMean <- apply(bootDat, 2, computeAbsDiffMeans, jFact = kDat$chromo) bootWelch <- apply(bootDat, 2, computeWelch, jFact = kDat$chromo) bootAbsWelch <- apply(bootDat, 2, computeAbsWelch, jFact = kDat$chromo) bootAbsDiffMed <- apply(bootDat, 2, computeAbsDiffMedians, jFact = kDat$chromo) bootKs <- apply(bootDat, 2, computeKs, jFact = kDat$chromo) ## will get lots of warnings about ties ... so be it bootWilcoxon <- apply(bootDat, 2, computeWilcoxon, jFact = kDat$chromo) ## gather bootstrap test statistics together bootStats <- data.frame(absDiffMean = bootAbsDiffMean, absDiffMedian = bootAbsDiffMed, absWelch = bootAbsWelch, welch = bootWelch, ks = bootKs, wilcoxon = bootWilcoxon) str(bootStats) ## 'data.frame': 10000 obs. of 6 variables: ## $ absDiffMean : num 0.1747 0.0689 0.0313 0.0668 0.0106 ... ## $ absDiffMedian: num 0.2314 0.0349 0.1574 0.1195 0.1023 ... ## $ absWelch : num 1.5645 0.6022 0.2911 0.6073 0.0943 ... ## $ welch : num -1.5645 -0.6022 -0.2911 0.6073 -0.0943 ... ## $ ks : num 0.072 0.0534 0.0741 0.0776 0.062 ... ## $ wilcoxon : num 46193 48418 46866 50942 47513 ... ## now I want to use the bootstrap test stats to get p-values for ## observed test statistics ## recall the chromosome-specific means ... chromoMeans ## 10 11 ## 8.943558 9.203379 ## ... and do same for median (chromoMedians <- with(kDat, tapply(pheno, chromo, median))) ## 10 11 ## 9.482804 9.804809 ## a list version of kDat is useful for the KS test which lacks a ## formula interface kList <- with(kDat, split(pheno, chromo)) str(kList) ## List of 2 ## $ 10: num [1:325] 10.29 10.54 6.93 10.64 10.07 ... ## $ 11: num [1:302] 10.57 10.54 6.83 8.63 9.4 ... ## this data.frame will have 1 row for each test statistic under ## consideration and columns to hold a description of the test ## statistic, the observed value, the classical p-value and the ## bootstrap p-value statSigRes <- data.frame(testStat = I(c("abs diff means", "abs diff medians", "abs welch t", "welch t", "kolmogorov-smirnov", "wilcoxon")), obsStats = c(absDiffMean = abs(chromoMeans[1] - chromoMeans[2]), absDiffMedian = abs(chromoMedians[1] - chromoMedians[2]), absWelch = abs(t.test(pheno ~ chromo, kDat)$statistic), welch = t.test(pheno ~ chromo, kDat)$statistic, ks = ks.test(x = kList[[1]], y = kList[[2]])$statistic, wilcoxon = wilcox.test(pheno ~ chromo, kDat)$statistic), pValClassical = 1, pValBootstrap = 1) rownames(statSigRes) <- statSigRes$testStat statSigRes ## ugly but helpful to get oriented densityplot(~ absDiffMean + absDiffMedian + absWelch + welch + ks + wilcoxon, bootStats, outer = TRUE, scales = "free") ## computing bootstrap p-values ## test stats where p-value is a RIGHT tail probability statSigRes["abs diff means", "pValBootstrap"] <- mean(bootStats$absDiffMean >= statSigRes["abs diff means", "obsStats"]) statSigRes["abs diff medians", "pValBootstrap"] <- mean(bootStats$absDiffMedian >= statSigRes["abs diff medians", "obsStats"]) statSigRes["abs welch t", "pValBootstrap"] <- mean(bootStats$absWelch >= statSigRes["abs welch t", "obsStats"]) statSigRes["kolmogorov-smirnov", "pValBootstrap"] <- mean(bootStats$ks >= statSigRes["kolmogorov-smirnov", "obsStats"]) ## test stats where p-value is LEFT + RIGHT tail probability statSigRes["welch t", "pValBootstrap"] <- mean(abs(bootStats$welch) >= abs(statSigRes["welch t", "obsStats"])) ## hint: the Wilcoxon U statistic is approximately normally ## distributed with mean (n1 * n2) /2 statSigRes["wilcoxon", "pValBootstrap"] <- mean(abs(bootStats$wilcoxon - prod(chromoCounts)/2) >= abs(statSigRes["wilcoxon", "obsStats"] - prod(chromoCounts)/2)) ## computing "classical" p-values ## test stats with no "classical" p-value statSigRes[c("abs diff means", "abs diff medians", "abs welch t"), "pValClassical"] <- NA ## test stats with a "classical" p-value statSigRes[c("welch t", "kolmogorov-smirnov", "wilcoxon"), "pValClassical"] <- round(c(t.test(pheno ~ chromo, kDat)$p.value, ks.test(x = kList[[1]], y = kList[[2]])$p.value, wilcox.test(pheno ~ chromo, kDat)$p.value), 4) statSigRes$obsStats <- round(statSigRes$obsStats, 3) statSigRes ## testStat obsStats pValClassical pValBootstrap ## abs diff means 0.260 NA 0.0172 ## abs diff medians 0.322 NA 0.0140 ## abs welch t 2.344 NA 0.0170 ## welch t -2.344 0.0194 0.0170 ## kolmogorov-smirnov 0.134 0.0073 0.0051 ## wilcoxon 41710.000 0.0012 0.0011 jWriteTable(statSigRes, jPaste(whereAmI, "results/statSigRes.txt"), sep = "\t") showOneBootstrapTest <- function(jBootTestStats, jObsTestStat, jPval, jDesc = "", jXlab = "test statistic", writeToFile = FALSE) { print(densityplot( ~ jBootTestStats, xlab = jXlab, plot.points = FALSE, n = 200, ref = TRUE, panel = function(x, ...) { panel.densityplot(x, ...) panel.abline(v = jObsTestStat, lty = 'dotted') jText <- paste(jDesc, "\n", "obs test stat =", round(jObsTestStat, 2), "\n", "p-value =", round(jPval, 4), "\n") grid.text(jText, x = 0.9, y = 0.9, just = c("right", "center"), gp = gpar(fontfamily = "sans")) })) if (writeToFile) { yo <- deparse(substitute(jBootTestStats)) dev.print(pdf, jPaste(whereAmI, "figs/densityplotBootTest-", yo, ".pdf"), width = 8, height = 6) } } with(bootStats, showOneBootstrapTest(absDiffMean, statSigRes["abs diff means", "obsStats"], statSigRes["abs diff means", "pValBootstrap"], "abs. diff. of means", jXlab = expression(group("|", bar(x) - bar(y),"|")), TRUE)) with(bootStats, showOneBootstrapTest(absDiffMedian, statSigRes["abs diff medians", "obsStats"], statSigRes["abs diff medians", "pValBootstrap"], "| median(x) - median(y) |", jXlab = "Abs. value of difference in medians", TRUE)) with(bootStats, showOneBootstrapTest(absWelch, statSigRes["abs welch t", "obsStats"], statSigRes["abs welch t", "pValBootstrap"], "| Welch's t stat |", jXlab = "Abs. value of Welch's t statistic", TRUE)) with(bootStats, showOneBootstrapTest(welch, c(1, -1) * statSigRes["welch t", "obsStats"], statSigRes["welch t", "pValBootstrap"], "Welch's t stat", jXlab = "Welch's t statistic", TRUE)) with(bootStats, showOneBootstrapTest(ks, statSigRes["kolmogorov-smirnov", "obsStats"], statSigRes["kolmogorov-smirnov", "pValBootstrap"], "KS test stat", jXlab = "Kolmogorov-Smirnov test statistic", TRUE)) with(bootStats, showOneBootstrapTest(wilcoxon, prod(chromoCounts)/2 + c(1, -1) * (statSigRes["wilcoxon", "obsStats"] - prod(chromoCounts)/2), statSigRes["wilcoxon", "pValBootstrap"], "Wilcoxon test stat", jXlab = "Wilcoxon test statistic, i.e. sum of 1 group's ranks", TRUE))