Exercise (ISLR p. 264)
We will now try to predict per capita crime rate in the Boston data set.
(a) Try out some of the regression methods explored in this chapter, such as best subset selection, the lasso, ridge regression, and PCR. Present and discuss results for the approaches that you consider.
(b) Propose a model (or set of models) that seem to perform well on this data set, and justify your answer. Make sure that you are evaluating model performance using validation set error, cross-validation, or some other reasonable alternative, as opposed to using training error.
(c) Does your chosen model involve all of the features in the data set? Why or why not?
library(MASS)
library(leaps)
set.seed(1)
n = nrow(Boston)
dim(Boston)[2] - 1 # Number of predictors (the response, crim, excluded)
## [1] 13
train = sample(1:n, n * 0.7) # 70% of observations for training
# Test observations are indexed with -train below
bss.model <- regsubsets(crim ~ ., data = Boston[train,], method = "exhaustive", nvmax = dim(Boston)[2]-1)
summary.fit = summary(bss.model)
plot(summary.fit$bic)   # Lower is better
plot(summary.fit$cp)    # Lower is better
plot(summary.fit$rss)   # Lower is better (but RSS always decreases as variables are added)
plot(summary.fit$adjr2) # Higher is better
bic.min = which.min(summary.fit$bic)
bic.min
## [1] 3
plot(summary.fit$bic) # Lower is better
points(bic.min, summary.fit$bic[bic.min], col = 2, cex = 2, pch = 20) # Highlight the BIC minimum
plot(bss.model)
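The test error of the best-subset model is not estimated above; as an added sketch (not part of the original run), the BIC-selected model can be scored on the held-out observations. leaps provides no predict() method for regsubsets objects, so predictions are built from the coefficient vector by hand.
# Added sketch: validation-set MSE for the BIC-selected subset (object names are mine).
test.mat = model.matrix(crim ~ ., data = Boston[-train,])
bss.coef = coef(bss.model, id = bic.min)
bss.pred = test.mat[, names(bss.coef)] %*% bss.coef
bss.mse = mean((bss.pred - Boston$crim[-train])^2)
bss.mse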
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.0-2
x = model.matrix(crim ~ ., data = Boston)[,-1] # Remove intercept
y = Boston$crim
grid = 10^seq(10,-2, length=100)
lasso.model = glmnet(x[train,], y[train], alpha = 1, lambda = grid) # Lasso fit across the full lambda grid
plot(lasso.model)
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
cv.lasso = cv.glmnet(x[train,], y[train], alpha=1)
plot(cv.lasso)
best.lasso.lambda = cv.lasso$lambda.min
lasso.pred = predict(lasso.model, s=best.lasso.lambda, newx = x[-train,])
lasso.mse = mean((lasso.pred - y[-train])^2)
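cv.glmnet also reports lambda.1se, a more heavily penalized (hence sparser) choice; an added sketch of its test error, for comparison with lambda.min:
# Added sketch: test MSE at the one-standard-error lambda.
lasso.pred.1se = predict(lasso.model, s = cv.lasso$lambda.1se, newx = x[-train,])
mean((lasso.pred.1se - y[-train])^2)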
lasso.out = glmnet(x, y, alpha = 1, lambda = grid) # Refit the lasso on the full data set
lasso.coef = predict(lasso.out, type="coefficients", s=best.lasso.lambda)
lasso.coef
## 14 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 13.7339988760
## zn 0.0379731938
## indus -0.0727154962
## chas -0.6142076302
## nox -7.8988988799
## rm 0.2880813841
## age .
## dis -0.8429319393
## rad 0.5255308400
## tax -0.0004096812
## ptratio -0.2111769377
## black -0.0075435895
## lstat 0.1262817517
## medv -0.1682372500
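As an added check, the number of predictors the lasso keeps at lambda.min can be counted directly (age is the only coefficient set exactly to zero above):
# Added sketch: count non-zero lasso coefficients, excluding the intercept.
sum(as.vector(lasso.coef)[-1] != 0)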
ridge.model <- glmnet(x[train,], y[train], alpha = 0)
plot(ridge.model)
cv.ridge <- cv.glmnet(x[train,], y[train], alpha = 0)
plot(cv.ridge)
best.ridge.lambda = cv.ridge$lambda.min
which(cv.ridge$lambda == cv.ridge$lambda.min) # Index of lambda.min in the lambda sequence
## [1] 100
cv.ridge$lambda[100]
## [1] 0.519044
cv.ridge$lambda.min
## [1] 0.519044
ridge.predict = predict(ridge.model, s=best.ridge.lambda, newx = x[-train,])
ridge.mse = mean((ridge.predict - y[-train])^2)
ridge.mse; lasso.mse
## [1] 60.15076
## [1] 59.51466
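For context (an added baseline, not in the original analysis), the test MSE obtained by simply predicting the training-set mean crime rate for every test observation:
# Added sketch: intercept-only baseline against which to judge the fitted models.
baseline.mse = mean((mean(y[train]) - y[-train])^2)
baseline.mse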
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
pcr.model = pcr(crim ~ ., data = Boston, subset = train, scale=TRUE, validation = "CV")
validationplot(pcr.model, val.type = "MSEP")
summary(pcr.model)
## Data: X dimension: 354 13
## Y dimension: 354 1
## Fit method: svdpc
## Number of components considered: 13
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 8.085 6.691 6.689 6.22 6.173 6.191 6.245
## adjCV 8.085 6.685 6.683 6.21 6.164 6.182 6.233
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 6.225 6.069 6.053 6.052 6.072 6.066 6.013
## adjCV 6.213 6.056 6.032 6.038 6.059 6.051 5.999
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 48.34 61.18 70.44 77.16 83.41 88.28 91.51 93.81
## crim 33.34 33.76 43.13 44.14 44.15 44.20 44.63 47.57
## 9 comps 10 comps 11 comps 12 comps 13 comps
## X 95.52 97.14 98.48 99.50 100.00
## crim 48.08 48.12 48.13 48.46 49.37
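The choice of 9 components below is made by eye from the validation plot; as an added sketch, the component count minimizing the cross-validated RMSEP can also be read off programmatically (the first entry of the RMSEP array is the intercept-only model, hence the - 1):
# Added sketch: number of PCR components with the lowest cross-validated RMSEP.
pcr.cv.rmsep = RMSEP(pcr.model)$val["CV", 1, ]
which.min(pcr.cv.rmsep) - 1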
pcr.predict = predict(pcr.model, x[-train,], ncomp = 9) # 9 components: CV error is near its minimum and barely improves beyond this
pcr.mse = mean((pcr.predict - y[-train])^2)
ridge.mse; lasso.mse; pcr.mse
## [1] 60.15076
## [1] 59.51466
## [1] 62.4953
pls.model = plsr(crim ~ ., data = Boston, subset = train, scale=TRUE, validation = "CV")
summary(pls.model)
## Data: X dimension: 354 13
## Y dimension: 354 1
## Fit method: kernelpls
## Number of components considered: 13
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 8.085 6.485 6.149 6.142 6.114 6.078 6.073
## adjCV 8.085 6.480 6.137 6.124 6.095 6.062 6.056
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 6.102 6.086 6.085 6.085 6.084 6.084 6.084
## adjCV 6.082 6.068 6.066 6.067 6.066 6.066 6.066
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 47.88 57.45 63.67 72.38 78.51 80.90 83.76 87.25
## crim 37.55 46.29 48.13 48.79 48.99 49.26 49.34 49.36
## 9 comps 10 comps 11 comps 12 comps 13 comps
## X 90.08 95.02 97.03 98.47 100.00
## crim 49.37 49.37 49.37 49.37 49.37
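Before fixing 9 components for PLS as well, an added sketch of how sensitive the test MSE is to that choice (the CV error above is essentially flat beyond about 5 components):
# Added sketch: PLS test MSE for a few component counts.
sapply(c(2, 5, 9, 13), function(k)
  mean((predict(pls.model, x[-train,], ncomp = k) - y[-train])^2))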
pls.predict = predict(pls.model, x[-train,], ncomp = 9) # CV error is essentially flat beyond about 5 components
pls.mse = mean((pls.predict - y[-train])^2)
ridge.mse; lasso.mse; pcr.mse; pls.mse
## [1] 60.15076
## [1] 59.51466
## [1] 62.4953
## [1] 58.95384
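Collecting the four validation-set errors in one object makes the comparison explicit (an added convenience step; the names are mine):
# Added sketch: side-by-side comparison of the validation-set MSEs.
mse.all = c(ridge = ridge.mse, lasso = lasso.mse, pcr = pcr.mse, pls = pls.mse)
sort(mse.all)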
Comparing the validation-set MSEs (PLS 58.95, lasso 59.51, ridge 60.15, PCR 62.50), PLS with 9 components gives the lowest test error, with the lasso close behind. Because the gap is small, the lasso is an attractive final choice: its test error is essentially the same, and it yields a sparse, interpretable model. Note that all of these comparisons use held-out test error rather than training error. The chosen model does not involve all of the features: at lambda.min the lasso drops age entirely and shrinks the remaining coefficients, and this is consistent with best subset selection, where BIC favors a model with only 3 predictors.