library(RColorBrewer) library(latticeExtra) # ecdfplot (!) whereAmI <- "/Users/jenny/teaching/2011/STAT545A/examples/yeastDeletionGrowth/" ## function that returns result from 4 different "two groups" tests ## for a given pair of chromosomes source(jPaste(whereAmI, "code/10-twoGroupTests.R")) ## yeast deletion growth data hDat <- read.delim(jPaste(whereAmI, "data/hDat.txt")) str(hDat) # 5521 obs. of 4 variables (nC <- length(levels(hDat$chromoPretty))) # no. of chromosomes = 16 ## Goal: perform a two sample t test for each pair of chromosomes ## (or maybe some other kind of test) ## almost a cheap trick ... use the built-in function! ## ttr = t test results ttr1 <- pairwise.t.test(hDat$pheno, hDat$chromoPretty, pool.sd = FALSE, p.adjust.method = "none") class(ttr1) # "pairwise.htest" names(ttr1) # what's in it? ttr1$p.value[1:5, 1:5] # most relevant output ## We will learn more by implementing this ourselves. ## Plus we can easily use a different two-sample test. ## My code here is inspired by a thread on R-help. Search for "form ## pairs variables" and you'll find it. ## 'combn' returns indices we can use to generate all possible pairs ## of chromosomes as a 2 * (16 choose 2) matrix choose(nC, 2) # 120 possible pairs jCombos <- combn(nC, 2) str(jCombos) # two cols, 120 rows jCombos[, 1:7] # first several pairs jCombos <- t(jCombos) # I prefer the transpose head(jCombos) ## use 'apply()' to visit each row of jCombos, i.e. each pair of ## chromosomes, and conduct various "two groups" tests for the ## associated pair of chromosomes tgtRes <- t(apply(jCombos, 1, function(jInd) { foo <- twoGroupTests(chromoA = jInd[1], chromoB = jInd[2], hDat) return(sapply(foo, function(yo) return(yo$p.value))) })) ## get lots of warnings() from the chi-squared test ## proceed anyway .... ## bring in the chromosome pair tgtRes <- data.frame(chromoA = jCombos[ , 1], chromoB = jCombos[ , 2], tgtRes) str(tgtRes) ## 'data.frame': 120 obs. of 6 variables: ## $ chromoA: int 1 1 1 1 1 1 1 1 1 1 ... ## $ chromoB: int 2 3 4 5 6 7 8 9 10 11 ... ## $ t : num 1.04e-05 4.86e-03 2.67e-11 8.66e-01 6.26e-05 ... ## $ wilcox : num 7.29e-07 2.10e-03 5.39e-11 7.77e-01 1.17e-05 ... ## $ ks : num 2.53e-06 1.87e-02 1.26e-08 7.68e-01 2.98e-05 ... ## $ chisq : num 2.19e-07 5.55e-02 3.27e-09 6.26e-03 3.54e-04 ... peek(tgtRes) ## must handle some pesky zeroes returned as p-values from KS test summary(tgtRes$ks) ## Min. 1st Qu. Median Mean 3rd Qu. Max. ## 0.0000000 0.0002148 0.0077410 0.1361000 0.1860000 0.9587000 tgtRes[which(tgtRes$ks == 0), ] ## chromoA chromoB t wilcox ks chisq ## 43 4 5 7.906832e-22 1.154823e-23 0 4.340871e-19 ## A p-value of zero is harmful, recode as a very tiny number. tgtRes$ks[tgtRes$ks == 0] <- 2 * .Machine$double.eps ## comparing these test results to each other round(cor(subset(tgtRes, select = c(t, wilcox, ks, chisq))), 2) splom(~ subset(tgtRes, select = c(t, wilcox, ks, chisq)), xlab = "p-values from two groups tests") dev.print(pdf, jPaste(whereAmI, "figs/splomTwoGroupsPvalues.pdf"), width = 8, height = 8) ## want to look at log(p-values) but must subset and transform 'by ## hand' ## less control over log axis handling here than in xyplot :-( foo <- subset(tgtRes, select = c(t, wilcox, ks, chisq)) foo <- sapply(foo, log10) round(cor(foo), 2) splom(~ foo, xlab = "log10 p-values from two groups tests") dev.print(pdf, jPaste(whereAmI, "figs/splomTwoGroupsLog10Pvalues.pdf"), width = 8, height = 8) head(tgtRes) ## more looks at the overall distribution of the p-values for various ## "two groups" tests ecdfplot( ~ t + wilcox + ks + chisq, tgtRes, auto.key = TRUE) ecdfplot( ~ t + wilcox + ks + chisq, tgtRes, auto.key = TRUE, scales = list(x = list(log = 10))) densityplot( ~ t + wilcox + ks + chisq, tgtRes, auto.key = TRUE, scales = list(x = list(log = 10))) ## did not save these ## to make the heatmap we have in mind, we will need to store our ## p-values just like the return value from the built-in function ## pairwise.t.test ## let's practice! ttr2Pvals <- matrix(NA, nrow = nC - 1, ncol = nC - 1) ttr2Pvals[lower.tri(ttr2Pvals, diag = TRUE)] <- tgtRes$t rownames(ttr2Pvals) <- levels(hDat$chromoPretty)[-1] colnames(ttr2Pvals) <- levels(hDat$chromoPretty)[-nC] ttr2Pvals[1:5, 1:5] # are they the same? ttr1$p.value[1:5, 1:5] all.equal(ttr2Pvals, ttr1$p.value) # Yes, they are the same. ## small examples to illustrate techniques useful for square, ## symmetric matrices (foo <- matrix(NA, nrow = 4, ncol = 4)) (filler <- LETTERS[1:10]) foo[lower.tri(foo, diag = TRUE)] <- filler foo (foo <- matrix(NA, nrow = 4, ncol = 4)) (filler <- LETTERS[1:6]) foo[lower.tri(foo)] <- filler foo foo[upper.tri(foo)] <- t(foo)[upper.tri(foo)] foo ## also, know that R has the 'dist()' class for efficient storage and ## handling of square symmetric matrices ## storing the p-values from each "two groups" test in a nC by nC ## matrix and populating all cells sensibly ... will facilitate making ## figures ## first, practice with p-values from just one test jP <- matrix(NA, nrow = nC, ncol = nC) jP[lower.tri(jP)] <- tgtRes$t rownames(jP) <- colnames(jP) <- levels(hDat$chromoPretty) diag(jP) <- 1 jP[upper.tri(jP)] <- t(jP)[upper.tri(jP)] str(jP) jP[1:5, 1:5] ## now scaling up to all four "two groups" tests jP <- lapply(subset(tgtRes, select = c(t, wilcox, ks, chisq)), function(pvalues) { foo <- matrix(NA, nrow = nC, ncol = nC) foo[lower.tri(foo)] <- pvalues rownames(foo) <- colnames(foo) <- levels(hDat$chromoPretty) diag(foo) <- 1 foo[upper.tri(foo)] <- t(foo)[upper.tri(foo)] return(foo) }) str(jP) # list of 4 nC by nC matrices str(jP$wilcox) jP$wilcox[1:5, 1:5] ## Goal: present these p-values to an actual human being, who needs to ## absorb the message quickly and with as little pain as possible. ## graphs depicting the pairwise comparisons of phenotype dist'n ## shade the squares according to p-value levelplot(jP$t, scales = list(x = list(rot = 45))) # ugly but it's a start dev.print(pdf, jPaste(whereAmI, "figs/levPlot01.pdf"), width = 8, height = 6) ## getting a better color ramp ## RColorBrewer provides a good start in this area display.brewer.all(n=NULL, type="all", select=NULL, exact.n=TRUE) title("RColorBrewer palettes") display.brewer.pal(n = 9, "GnBu") text(x = 1:9, y = rep(1, 9), labels = 1:9) title("RColorBrewer palette 'GnBu'") dev.print(pdf, jPaste(whereAmI, "figs/RColorBrewerGnBu.pdf"), width = 8, height = 6) ## pick palette to focus on brewPal <- brewer.pal(n=9,"GnBu") colorFun <- colorRampPalette(rev(brewPal)) myCols <- colorFun(100) ## improved plot, color-wise levelplot(jP$t, col.regions = myCols, scales = list(x = list(rot = 45)), xlab = "", ylab = "", main = "Colors based on 'GnBu' palette from RColorBrewer") dev.print(pdf, jPaste(whereAmI, "figs/levPlot02.pdf"), width = 8, height = 6) ## including densityplots alongside is helpful lPlot <- levelplot(jP$t, col.regions = myCols, xlab = "", ylab = "", scales = list(x = list(rot = 45))) dPlot <- densityplot(~ pheno | chromoPretty, hDat, layout = c(1, nC), plot.points = FALSE, strip = FALSE, strip.left = strip.custom(horizontal = TRUE, bg = NA), par.strip.text = list(lines = 3), scales = list(draw = FALSE), xlab = "", ylab = "") print(dPlot, pos = c(0, 0.1, 0.2, 0.96), more = TRUE) print(lPlot, pos = c(0.15, 0, 1, 1), more = FALSE) dev.print(pdf, jPaste(whereAmI, "figs/levPlot03.pdf"), width = 8, height = 6) ## The plot will be easier to read if the chromosomes are ordered more ## logically. Alphabetical order is silly here! ## I generated plots using the different orderings interactively. ## I.e. use your head when you submit the code below ## order by median phenotype jOrd1 <- order(tapply(hDat$pheno, hDat$chromoPretty, median)) jTitle <- "Ordered by median phenotype" jOrd <- jOrd1 ## order by hierarchical clustering jOrd2 <- order.dendrogram(as.dendrogram(hclust(dist(jP$t)))) jTitle <- "Ordered by clustering result" jOrd <- jOrd2 ## order by Jenny, inspired by above jOrd3 <- c(4, 7, 8, 6, 9, 2, 14, 12, 10, 16, 3, 15, 11, 13, 1, 5) jTitle <- "Ordered by Jenny" jOrd <- jOrd3 lPlot <- levelplot(jP$t[jOrd, jOrd], col.regions = myCols, xlab = "", ylab = "", main = jTitle, scales = list(x = list(rot = 45))) ## You can update a lattice plot object! dPlot <- update(dPlot, index.cond = list(jOrd)) print(dPlot, pos = c(0, 0.1, 0.2, 0.96), more = TRUE) print(lPlot, pos = c(0.15, 0, 1, 1), more = FALSE) dev.print(pdf, # jPaste(whereAmI, "figs/levPlot04medianOrder.pdf"), # jPaste(whereAmI, "figs/levPlot04hclustOrder.pdf"), jPaste(whereAmI, "figs/levPlot04jennyOrder.pdf"), width = 8, height = 6) ## transforming the p-values to optimize mapping into the color ramp ## I want to de-emphasize differences between teeny p-values ## and between very large p-values ## and increase the visual impact of differences in (0.01, 0.10) pvalueTicks <- c(0.0001, 0.001, 0.01, 0.05, 0.1, 0.5, 1) qnScale <- 0.6 colorkeyTicks <- qnorm(pvalueTicks * qnScale) jTitle <- "t-test, JB order, probit transformed p-values" lPlot <- levelplot(qnorm(jP$t[jOrd, jOrd] * qnScale), col.regions = myCols, scales = list(x = list(rot = 45)), xlab = "", ylab = "", main = jTitle, colorkey = list(col = myCols, labels = list(labels = pvalueTicks, at = colorkeyTicks))) print(dPlot, pos = c(0, 0.1, 0.2, 0.96), more = TRUE) print(lPlot, pos = c(0.15, 0, 1, 1), more = FALSE) dev.print(pdf, jPaste(whereAmI, "figs/levPlot05.pdf"), width = 8, height = 6) ## using the p-values from the KS test jTitle <- "KS test, JB order, probit transformed p-values" lPlotKS <- levelplot(qnorm(jP$ks[jOrd, jOrd] * qnScale), col.regions = myCols, scales = list(x = list(rot = 45)), xlab = "", ylab = "", main = jTitle, colorkey = list(col = myCols, labels = list(labels = pvalueTicks, at = colorkeyTicks))) print(dPlot, pos = c(0, 0.1, 0.2, 0.96), more = TRUE) print(lPlotKS, pos = c(0.15, 0, 1, 1), more = FALSE) dev.print(pdf, jPaste(whereAmI, "figs/levPlot06.pdf"), width = 8, height = 6) ## most statistically significant differences head(tgtRes[order(tgtRes$t), ], 15)