library(latticeExtra) # ecdfplot() ## 'home' directory for Gapminder example whereWasI <- "/Users/jenny/teaching/STAT545A/examples/gapminder/" ## 'home' directory for this lecture whereAmI <- "/Users/jenny/teaching/STAT545A/classMeet/cm05/" ## import the cleaned Gapminder data gDat <- read.delim(paste0(whereWasI,"data/gapminderDataFiveYear.txt")) ## reach out and touch the data str(gDat) ## 'data.frame': 1704 obs. of 6 variables: ## $ country : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ... ## $ year : int 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ... ## $ pop : num 8425333 9240934 10267083 11537966 13079460 ... ## $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 33 ... ## $ lifeExp : num 28.8 30.3 32 34 36.1 ... ## $ gdpPercap: num 779 821 853 836 740 ... summary(gDat) head(gDat) peek(gDat) ## the quantitative variables summary(subset(gDat, select = c(pop, lifeExp, gdpPercap))) ## high-level summaries summary(gDat$lifeExp) fivenum(gDat$lifeExp) ## measures of location mean(gDat$lifeExp) median(gDat$lifeExp) ## measures of spread var(gDat$lifeExp) sd(gDat$lifeExp) mad(gDat$lifeExp) IQR(gDat$lifeExp) ## looking at extremes min(gDat$lifeExp) max(gDat$lifeExp) quantile(gDat$lifeExp, probs = c(0.05, 0.95)) range(gDat$lifeExp) which.min(gDat$lifeExp) gDat[which.min(gDat$lifeExp), ] which.max(gDat$lifeExp) gDat[which.max(gDat$lifeExp), ] ## using the above in data aggregation techniques ## summarizing a quantitative variable for each level of a categorical ## variable ## simplest case: learning 1 number, the median with(gDat, tapply(lifeExp, continent, median)) ## harder case: a multi-number summary (foo <- with(gDat, tapply(lifeExp, continent, summary))) (leByContinent <- do.call(cbind, foo)) # tidy up! ## learning a 1 number summary BUT cross-tabulating by two factors now with(gDat, tapply(lifeExp, list(year, continent), median)) ## hardest case: multi-number summary AND cross-tabulating by two ## factors (foo <- with(gDat, tapply(lifeExp, list(year, continent), summary))) ## return value is matrix-like, rows are years, columns are continent ## each entry, however, is a numeric vector holding the six number ## summary for a specific year and country how to tidy up? first, what ## is foo actually? str(foo) ## foo is a list with 12 years * 5 continents = 60 elements ## it also has dimension 12 (rows = years) by 5 (columns = continents) ## and associated dimnames ## first, stack up the 5-vectors foo2 <- do.call(rbind, foo) ## convert into a data.frame and use row and column names to restore ## the year and continent info leByYearAndCont <- data.frame(year = as.numeric(rownames(foo)), continent = factor(rep(colnames(foo), each = nrow(foo)), levels = levels(gDat$continent)), foo2) str(leByYearAndCont) peek(leByYearAndCont) ## to be totally proper, fix variable names for the 1st and 3rd ## quartiles names(leByYearAndCont) names(leByYearAndCont)[match(c("X1st.Qu.", "X3rd.Qu."), names(leByYearAndCont))] <- c("Q1", "Q3") peek(leByYearAndCont) ## visual exploration of a quantitative variable stripplot(lifeExp ~ continent, gDat, subset = year == 2007) dev.print(pdf, paste0(whereWasI, "figs/cm05/gapminderLifeExpStripplot-01.pdf"), height = 6, width = 6) stripplot(lifeExp ~ continent, gDat, subset = year == 2007, jitter.data = TRUE) dev.print(pdf, paste0(whereWasI, "figs/cm05/gapminderLifeExpStripplot-02.pdf"), height = 6, width = 6) stripplot(lifeExp ~ continent, gDat, subset = year == 2007, jitter.data = TRUE, type = c("p", "a"), fun = median) dev.print(pdf, paste0(whereWasI, "figs/cm05/gapminderLifeExpStripplot-03.pdf"), height = 6, width = 6) ## sarkar 10.6 stripplot(lifeExp ~ reorder(continent, lifeExp), gDat, subset = year == 2007, jitter.data = TRUE, type = c("p", "a"), fun = median) dev.print(pdf, paste0(whereWasI, "figs/cm05/gapminderLifeExpStripplot-04.pdf"), height = 6, width = 6) ## sarkar 9.2.5 ## https://stat.ethz.ch/pipermail/r-help/2009-October/214516.html stripplot(lifeExp ~ reorder(continent, lifeExp), gDat, subset = year %in% c(1952, 1977, 2007), groups = year, auto.key = TRUE, jitter.data = TRUE, type = c("p", "a"), fun = median) ## auto.key stills shows all years ... yuck dev.print(pdf, paste0(whereWasI, "figs/cm05/gapminderLifeExpStripplot-05.pdf"), height = 6, width = 6) stripplot(lifeExp ~ reorder(continent, lifeExp), subset(gDat, subset = year %in% c(1952, 1977, 2007)), groups = year, auto.key = TRUE, jitter.data = TRUE, type = c("p", "a"), fun = median) ## subsetting prior to stripplot call solves problem dev.print(pdf, paste0(whereWasI, "figs/cm05/gapminderLifeExpStripplot-06.pdf"), height = 6, width = 6) stripplot(lifeExp ~ reorder(continent, lifeExp), subset(gDat, subset = year %in% c(1952, 1977, 2007)), ## reversing rows in key makes it easier to read groups = year, auto.key = list(reverse.rows = TRUE), jitter.data = TRUE, type = c("p", "a"), fun = median) dev.print(pdf, paste0(whereWasI, "figs/cm05/gapminderLifeExpStripplot-07.pdf"), height = 6, width = 6) stripplot(lifeExp ~ factor(year), droplevels(subset(gDat, continent != "Oceania")), groups = reorder(continent, lifeExp), auto.key = list(reverse.rows = TRUE), jitter.data = TRUE, type = c("p", "a"), fun = median) dev.print(pdf, paste0(whereWasI, "figs/cm05/gapminderLifeExpStripplot-08.pdf"), height = 6, width = 6) stripplot(lifeExp ~ reorder(continent, lifeExp), gDat, jitter.data = TRUE, type = c("p", "a"), fun = median) dev.print(pdf, paste0(whereWasI, "figs/cm05/gapminderLifeExpStripplot-08.pdf"), height = 6, width = 6) bwplot(lifeExp ~ reorder(continent, lifeExp), gDat) dev.print(pdf, paste0(whereWasI, "figs/cm05/gapminderLifeExpBoxplot.pdf"), height = 6, width = 6) bwplot(lifeExp ~ reorder(continent, lifeExp), gDat, panel = panel.violin) dev.print(pdf, paste0(whereWasI, "figs/cm05/gapminderLifeExpViolinplot.pdf"), height = 6, width = 6) bwplot(lifeExp ~ reorder(continent, lifeExp), gDat, panel = function(..., box.ratio) { panel.violin(..., col = "transparent", border = "grey60", varwidth = FALSE, box.ratio = box.ratio) panel.bwplot(..., fill = NULL, box.ratio = .1) }) dev.print(pdf, paste0(whereWasI, "figs/cm05/gapminderLifeExpBoxAndViolinplot.pdf"), height = 6, width = 6) histogram(~ lifeExp | continent, gDat) dev.print(pdf, paste0(whereWasI, "figs/cm05/gapminderLifeExpHistogramByContinent.pdf"), height = 6, width = 6) histogram(~ lifeExp | year, gDat) dev.print(pdf, paste0(whereWasI, "figs/cm05/gapminderLifeExpHistogramByYear.pdf"), height = 6, width = 6) densityplot(~ lifeExp | continent, gDat) dev.print(pdf, paste0(whereWasI, "figs/cm05/gapminderLifeExpDensityplotByContinent.pdf"), height = 6, width = 6) densityplot(~ lifeExp | continent, gDat, plot.points = FALSE, ref = TRUE) dev.print(pdf, paste0(whereWasI, "figs/cm05/gapminderLifeExpDensityplotByContinentPrettier.pdf"), height = 6, width = 6) densityplot(~ lifeExp, gDat, groups = reorder(continent, lifeExp), auto.key = TRUE, plot.points = FALSE, ref = TRUE) dev.print(pdf, paste0(whereWasI, "figs/cm05/gapminderLifeExpDensityplotGroupOnContinent.pdf"), height = 6, width = 6) densityplot(~ lifeExp, gDat, groups = year > 1967, auto.key = TRUE) ecdfplot(~ lifeExp, gDat, groups = year > 1967, auto.key = TRUE) ## kernel density estimation str(faithful) peek(faithful) (densityRes <- with(faithful, density(eruptions))) densityplot(~ eruptions, data = faithful) densityplot(~ eruptions, data = faithful, kernel = "rect") densityplot(~ eruptions, data = faithful, kernel = "rect", bw = 0.1) densityplot(~ eruptions, data = faithful) densityplot(~ eruptions, data = faithful, bw = 0.03) densityplot(~ eruptions, data = faithful, bw = 1) densityplot(~ eruptions, data = faithful, kernel = "rect", bw = 0.2, plot.points = "rug", n = 200) densityplot(~ eruptions, data = faithful) densityplot(~ eruptions, data = faithful, n = 10, main = "n = 10") dev.print(pdf, paste0(whereWasI, "figs/cm05/faithful-densityploy-n10.pdf"), height = 6, width = 6) densityplot(~ eruptions, data = faithful, n = 35, main = "n = 35") dev.print(pdf, paste0(whereWasI, "figs/cm05/faithful-densityploy-n35.pdf"), height = 6, width = 6) densityplot(~ eruptions, data = faithful, n = 150, main = "n = 150") dev.print(pdf, paste0(whereWasI, "figs/cm05/faithful-densityploy-n150.pdf"), height = 6, width = 6)