library(robustbase)                     # ltsReg(), lmrob()
library(RColorBrewer)

## grab the Gapminder data to use as an example
whereWasI <- "/Users/jenny/teaching/2011/STAT545A/examples/gapminder/"

whereAmI <- "/Users/jenny/teaching/2011/STAT545A/examples/robustRegression/"

## Gapminder data import
gDat <- read.delim(jPaste(whereWasI, "data/gapminderDataFiveYear.txt"))
str(gDat)
## 'data.frame':	1704 obs. of  6 variables:
##  $ country  : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ year     : int  1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
##  $ pop      : num  8425333 9240934 10267083 11537966 13079460 ...
##  $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3  ...
##  $ lifeExp  : num  28.8 30.3 32 34 36.1 ...
##  $ gdpPercap: num  779 821 853 836 740 ...

## lifeExp vs. time
## hoping to find a country that is begging for robust regression!

pdf(jPaste(whereAmI, "figs/lifeExpVsYear-allCountries.pdf"),
    width = 10, height = 7)

xyplot(lifeExp ~ year | country, gDat,
       layout = c(3, 2), type = c("p", "r", "g"))

dev.off()

## cambodia, rwanda, swaziland are all possibilities
## but am going with rwanda
hDat <- droplevels(subset(gDat, country == "Rwanda",
                          select = c(year, lifeExp, country, continent)))
str(hDat)
##  $ year     : int  1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
##  $ lifeExp  : num  40 41.5 43 44.1 44.6 ...
##  $ country  : Factor w/ 1 level "Rwanda": 1 1 1 1 1 1 1 1 1 1 ...
##  $ continent: Factor w/ 1 level "Africa": 1 1 1 1 1 1 1 1 1 1 ...

(n <- nrow(hDat))                      # n = 12, how quaint!
jYlim <- c(22, 53)                     # hold constant from here on

xyplot(lifeExp ~ year, hDat, type = c("p", "r", "g"),
       ylim = jYlim)

dev.print(pdf, jPaste(whereAmI, "figs/rwandaRaw.pdf"),
          width = 7, height = 4.5)

## there are two outliers
## flag for possible removal and visual emphasis
hDat$outlier <- FALSE
hDat$outlier[hDat$year %in% c(1992, 1997)] <- TRUE

## plain vanilla linear model
(yearMin <- min(hDat$year))             # 1952
lsFit <- lm(lifeExp ~ I(year - yearMin), hDat)
coef(lsFit)
##      (Intercept) I(year - yearMin) 
##      42.74194872       -0.04583147
summary(lsFit)

## now exclude the outliers
lsFitAlt <- lm(lifeExp ~ I(year - yearMin), hDat,
               subset = !outlier)
coef(lsFitAlt)
##      (Intercept) I(year - yearMin) 
##      41.99422084        0.07408486
summary(lsFitAlt)

## preparing to plot ... define a helper function
## will use inside custom panel functions to plot the various fits
jPanel <- function(jFit, jMeth, jCol = NULL, ...) {
  if(is.null(jCol)) {
    theColor <- jColor[jMeth]
  } else {
    theColor <- jCol
  }
  panel.xyplot(jYear, predict(jFit,
                              newdata = data.frame(year = jYear)),
               type = "l", col = theColor, ...)
}

## will use as the xVals when I predict from the various fits
(jYear <- with(hDat, sort(unique(year))))

## associate colors with the approaches
theMethods <- c("leastSq", "leastTrimSq", "MM")
display.brewer.all(type="qual")
jColor <- brewer.pal(n = length(theMethods), name = "Dark2")
names(jColor) <- theMethods

## when I want to suppress color
myGray <- "gray60"

## plot data and these first fits
plainPlot <-
  xyplot(lifeExp ~ year, hDat,
         groups = outlier,
#         auto.key = list(x = 0.1, y = 0.1, corner = c(0, 0)),
         type = c("g","p"), ylim = jYlim,
         xlab = "Year", ylab = "Life expectancy")
print(plainPlot)

dev.print(pdf, jPaste(whereAmI, "figs/rwandaShowOutliers.pdf"),
          width = 7, height = 4.5)


overlayPlot <- 
  xyplot(lifeExp ~ year, hDat,
         groups = outlier,  ylim = jYlim,
         xlab = "Year", ylab = "Life expectancy",
         panel = function(x, y, ...) {
           panel.grid(h = -1, v = -1)
           jPanel(lsFit, "leastSq", myGray)
           panel.xyplot(x, y, ...)
         })
print(overlayPlot)

dev.print(pdf, jPaste(whereAmI, "figs/rwandaLeastSquares.pdf"),
          width = 7, height = 4.5)


overlayPlotAlt <-
  xyplot(lifeExp ~ year, hDat,
         groups = outlier, ylim = jYlim,
         xlab = "Year", ylab = "Life expectancy",
         panel = function(x, y, ...) {
           panel.grid(h = -1, v = -1)
           jPanel(lsFitAlt, "leastSq", myGray)
           panel.xyplot(x, y, ...)
       })

print(overlayPlotAlt)

dev.print(pdf, jPaste(whereAmI, "figs/rwandaLeastSquaresNoOut.pdf"),
          width = 7, height = 4.5)

## minimizing the trimmed mean of the squared residuals
ltsFit <- ltsReg(lifeExp ~ I(year - yearMin), hDat)
ltsFit
##         Intercept  I(year - yearMin)  
##           40.5682             0.1947
summary(ltsFit)
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## Intercept         40.56821    0.33083  122.62 6.84e-10 ***
## I(year - yearMin)  0.19467    0.01835   10.61 0.000129 ***


ltsFitAlt <- ltsReg(lifeExp ~ I(year - yearMin), hDat,
                    subset = !outlier)
ltsFitAlt
##        Intercept  I(year - yearMin)  
##          40.5682             0.1947
summary(ltsFitAlt)
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## Intercept         40.56821    0.33083  122.62 6.84e-10 ***
## I(year - yearMin)  0.19467    0.01835   10.61 0.000129 ***


## using an MM estimator
mmFit <- lmrob(lifeExp ~ I(year - yearMin), hDat)
mmFit
##      (Intercept)  I(year - yearMin)  
##          42.0758             0.0668
summary(mmFit)
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       42.07577    0.82850  50.785 2.12e-13 ***
## I(year - yearMin)  0.06679    0.04306   1.551    0.152    

mmFitAlt <- lmrob(lifeExp ~ I(year - yearMin), hDat,
                  subset = !outlier)
mmFitAlt
##      (Intercept)  I(year - yearMin)  
##         42.01527            0.07648
summary(mmFitAlt)
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       42.01527    0.87992  47.749 4.09e-11 ***
## I(year - yearMin)  0.07648    0.03495   2.188   0.0601 .  

## gathering all the fits together
jEst <- data.frame(est = rep(0, 6),     # 6 fits
                   se = as.numeric(NA),
                   meth = factor(rep(theMethods, each = 2),
                     levels = theMethods),
                   excl = rep(c(FALSE, TRUE), 3))


## least squares
i <- 1
jEst[i, c("est", "se")] <-
  summary(lsFit)$coef["I(year - yearMin)", c("Estimate", "Std. Error")]

## least squares, exclude outliers
i <- i + 1
jEst[i, c("est", "se")] <-
  summary(lsFitAlt)$coef["I(year - yearMin)", c("Estimate", "Std. Error")]

## least trimmed squared deviation
i <- i + 1
jEst[i, c("est", "se")] <-
  summary(ltsFit)$coef["I(year - yearMin)", c("Estimate", "Std. Error")]

## least trimmed squared deviation, exclude outliers
i <- i + 1
jEst[i, c("est", "se")] <-
  summary(ltsFitAlt)$coef["I(year - yearMin)", c("Estimate", "Std. Error")]

## MM estimation
i <- i + 1
jEst[i, c("est", "se")] <-
  summary(mmFit)$coef["I(year - yearMin)", c("Estimate", "Std. Error")]

## MM estimation, exclude outliers
i <- i + 1
jEst[i, c("est", "se")] <-
  summary(mmFitAlt)$coef["I(year - yearMin)", c("Estimate", "Std. Error")]

jEst
##           est         se        meth  excl
## 1 -0.04583147 0.10968601     leastSq FALSE
## 2  0.07408486 0.02725230     leastSq  TRUE
## 3  0.19467143 0.01835124 leastTrimSq FALSE
## 4  0.19467143 0.01835124 leastTrimSq  TRUE
## 5  0.06679455 0.04305619          MM FALSE
## 6  0.07647548 0.03494911          MM  TRUE

str(jEst)
## 'data.frame':	6 obs. of  4 variables:
##  $ est   : num  -0.0458 0.0741 0.1947 0.1947 0.0668 ...
##  $ se    : num  0.1097 0.0273 0.0184 0.0184 0.0431 ...
##  $ meth  : Factor w/ 3 levels "leastSq","leastTrimSq",..: 1 1 2 2 3 3
##  $ excl: logi  FALSE TRUE FALSE TRUE FALSE TRUE

## plotting these fits
methKey <-
  list(x = 0.05, y = 0.05, corner = c(0, 0),
       text = list(levels(jEst$meth)),
       lines = list(col = jColor))

xyplot(lifeExp ~ year, hDat,
       xlab = "Year", ylab = "Life expectancy",
       ylim = jYlim,
       panel = function(x, y, ...) {
         panel.grid(h = -1, v = -1)
         jPanel(lsFit, "leastSq")
         jPanel(mmFit, "MM")
         ## sadly ltsReg offers no predict method
         panel.xyplot(hDat$year, fitted(ltsFit),
                      type = "l", col = jColor["leastTrimSq"], ...)
         panel.xyplot(x, y, ...)
       },
       key = methKey)

dev.print(pdf, jPaste(whereAmI, "figs/rwandaThreeMethods.pdf"),
          width = 8, height = 6)

xyplot(lifeExp ~ year, hDat,
       xlab = "Year", ylab = "Life expectancy",
       ylim = jYlim,
       panel = function(x, y, ...) {
         panel.grid(h = -1, v = -1)
         jPanel(lsFitAlt, "leastSq")
         jPanel(mmFitAlt, "MM")
         ## sadly ltsReg offers no predict method
         panel.xyplot(hDat$year[!hDat$outlier],
                      fitted(ltsFitAlt),
                      type = "l", col = jColor["leastTrimSq"], ...)
         panel.xyplot(x, y, ...)
       },
       key = methKey)

dev.print(pdf, jPaste(whereAmI, "figs/rwandaThreeMethodsNoOut.pdf"),
          width = 8, height = 6)

## printing estimates and se's to paste as table in Keynote
(foo <- subset(jEst, select = c(meth, excl, est, se)))
foo$est <- round(foo$est, 3)
foo$se <- round(foo$se, 4)
write.table(foo, jPaste(whereAmI, "results/ests.txt"),
            row.names = FALSE, quote = FALSE, sep = "\t")

## transitioning to bootstrap now
B <- 1000
set.seed(17)

bootIndices <- sample(1:n, n * B, replace = TRUE)

getBootDat <- function(jFit, jIndices) {
  return(sweep(matrix(resid(jFit)[bootIndices], nrow = n), 1,
               fitted(jFit), "+"))
}

bootDatLS <- getBootDat(lsFit, bootIndices)
bootDatLTS <- getBootDat(ltsFit, bootIndices)
bootDatMM <- getBootDat(mmFit, bootIndices)

bootCoefLS <- 
  apply(bootDatLS, 2, function(lifeExp) {
    coef(lm(lifeExp ~ I(jYear - yearMin)))["I(jYear - yearMin)"]
  })

str(bootCoefLS)
## num [1:1000] -0.096 -0.2053 -0.0859 0.3885 -0.151 ...

bootCoefLTS <- 
  apply(bootDatLTS, 2, function(lifeExp) {
    coef(ltsReg(lifeExp ~ I(jYear - yearMin)))["I(jYear - yearMin)"]
  })

bootCoefMM <- 
  apply(bootDatMM, 2, function(lifeExp) {
    coef(lmrob(lifeExp ~ I(jYear - yearMin)))["I(jYear - yearMin)"]
  })
## Warning messages:
## 1: In lmrob.S(x, y, control = control) :
##   S refinements did not converge (to tol=1e-07) in 200 iterations
## 2: In lmrob.S(x, y, control = control) :
##   S refinements did not converge (to tol=1e-07) in 200 iterations

## Do I see obvious problems in the MM bootstrap coef estimates?
summary(bootCoefMM)
##    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.28920  0.04213  0.06530  0.06592  0.08601  0.51860
nExt <- 3
(inspectMe <- which(rank(bootCoefMM) <= nExt |
                    rank(bootCoefMM) > (B - nExt)))
## [1]   4  12 116 683 803 863
lmrob(bootDatMM[ , 863] ~ I(jYear - yearMin))
## no warnings for these extreme values, so I conclude it's safe to
## proceed for today's purpose

## stack these up
bootCoef <-
  data.frame(est = c(bootCoefLS, bootCoefLTS, bootCoefMM),
             meth = factor(rep(theMethods, each = B),
               levels = theMethods))


## to help show how I did this
bootIndices[1:(3 * n)]

matrix(resid(lsFit)[bootIndices[1:(3 * n)]], nrow = n)

cbind(fitted(lsFit))

sweep(matrix(resid(lsFit)[bootIndices[1:(3 * n)]], nrow = n), 1,
      fitted(lsFit), "+")

(jBootAvgs <- tapply(bootCoef$est,
                    bootCoef$meth,
                    mean))
##     leastSq leastTrimSq          MM 
## -0.04487993  0.19367829  0.06591758 

(jBootSds <- tapply(bootCoef$est,
                    bootCoef$meth,
                    sd))
##     leastSq leastTrimSq          MM 
## 0.10100098  0.06588174  0.04937583 

getSlope <- function(jFit) coef(jFit)["I(year - yearMin)"]

(obsCoef <- c(getSlope(lsFit),
              getSlope(ltsFit), getSlope(mmFit)))
## I(year - yearMin) I(year - yearMin) I(year - yearMin) 
##       -0.04583147        0.19467143        0.06679455 

obsCoef - jBootAvgs                     # bias?
##       leastSq   leastTrimSq            MM 
## -0.0009515358  0.0009931385  0.0008769686 

(obsCoef - jBootAvgs)/jBootSds          # bias rel to sd?
##      leastSq  leastTrimSq           MM 
## -0.009421055  0.015074564  0.017761092 

obsCoef/jBootSds                        # pseudo test stat
##     leastSq leastTrimSq          MM 
##  -0.4537725   2.9548619   1.3527783 

#quantile(bootCoefs[1,], probs = c(0.025, 0.975))
#summary(lmFitQ)

jEst$seBoot <- NA
jEst$seBoot[!jEst$excl] <- jBootSds

jEst$avgBoot <- NA
jEst$avgBoot[!jEst$excl] <- jBootAvgs


## saving jEst again for use as table elsewhere
(foo <- subset(jEst, excl == FALSE, select = c(meth, est,
                                      avgBoot, se, seBoot)))
foo$est <- round(foo$est, 3)
foo$avgBoot <- round(foo$avgBoot, 3)
foo$se <- round(foo$se, 4)
foo$seBoot <- round(foo$seBoot, 4)
write.table(foo, jPaste(whereAmI, "results/estsBoot.txt"),
            row.names = FALSE, quote = FALSE, sep = "\t")


## bootstrap figures start here

## first few bootstrap datasets
xyplot(bootDatLS[,1] + bootDatLS[,2] + bootDatLS[,3] +
       bootDatLS[,4] + bootDatLS[,5] + bootDatLS[,6] ~ jYear,
       outer = TRUE, type = c("g","p","r"),
       xlab = "Year", ylab = "Life expectancy")

dev.print(pdf, paste(whereAmI, "figs/rwandaLSsixBoots.pdf", sep = ""),
          width = 8, height = 6)


densityplot(~ est | meth, bootCoef,
            layout = c(3, 1),
            xlab = "Bootstrap coefficients",
            type = c("p", "g"), n = 100)

dev.print(pdf, paste(whereAmI, "figs/bootCoefThreeMethodsPlotPoints.pdf", sep = ""),
          width = 10, height = 5)

densityplot(~ est | meth, bootCoef,
            layout = c(2, 1),
            subset = meth != "leastTrimSq",
            xlab = "Bootstrap coefficients",
            type = "g", n = 100,
            panel = function(x, ...) {
              panel.densityplot(x, ...)
              if (packet.number() == 1) {
                jv <- jEst[1, "est"]
              } else {
                jv <- jEst[5, "est"]
              }
              panel.abline(v = jv, col = myGray)
            })

dev.print(pdf, paste(whereAmI, "figs/bootCoefTwoMethodsNoPoints.pdf", sep = ""),
          width = 10, height = 5)


densityplot(~ est | meth, bootCoef,
            layout = c(2, 1),
            subset = meth != "leastTrimSq",
            xlab = "Bootstrap coefficients",
            type = "g", n = 200,
            panel = function(x, ...) {
              panel.densityplot(x, ...)
              if (packet.number() == 1) {
                jv <- jEst[1, "est"]
                jsd <- jEst[1, "se"]
              } else {
                jv <- jEst[5, "est"]
                jsd <- jEst[5, "se"]
              }
              panel.abline(v = jv, col = myGray)
              panel.mathdensity(args = list(mean = jv,
                                  sd = jsd), col.line = myGray)
            })

dev.print(pdf, paste(whereAmI, "figs/bootCoefTwoMethodsWithDensity.pdf", sep = ""),
          width = 10, height = 5)

densityplot(~ est | meth, bootCoef,
            layout = c(3, 1),
            xlab = "Bootstrap coefficients",
            type = "g", n = 200,
            panel = function(x, ...) {
              panel.densityplot(x, ...)
              if (packet.number() == 1) {
                jv <- jEst[1, "est"]
                jsd <- jEst[1, "se"]
              } else if (packet.number() == 2) {
                jv <- jEst[3, "est"]
              } else {
                jv <- jEst[5, "est"]
                jsd <- jEst[5, "se"]
              }
              panel.abline(v = jv, col = myGray)
              if(packet.number() != 2) {
              panel.mathdensity(args = list(mean = jv,
                                  sd = jsd), col.line = myGray)
            }
            })

dev.print(pdf, paste(whereAmI, "figs/bootCoefThreeMethodsWithDensity.pdf", sep = ""),
          width = 10, height = 5)