Exercise (ISLR p. 264)
We will now try to predict per capita crime rate in the Boston data set.
(a) Try out some of the regression methods explored in this chapter, such as best subset selection, the lasso, ridge regression, and PCR. Present and discuss results for the approaches that you consider.
(b) Propose a model (or set of models) that seem to perform well on this data set, and justify your answer. Make sure that you are evaluating model performance using validation set error, cross-validation, or some other reasonable alternative, as opposed to using training error.
(c) Does your chosen model involve all of the features in the data set? Why or why not?
library(MASS)
library(leaps)
set.seed(1)
n = nrow(Boston)
dim(Boston)[2] - 1 # Number of predictors (the response, crim, excluded)
## [1] 13
train = sample(1:n, n * 0.7) # 70% of observations for training
# Test observations are indexed with -train below
bss.model <- regsubsets(crim ~ ., data = Boston[train,], method = "exhaustive", nvmax = dim(Boston)[2]-1)
summary.fit = summary(bss.model)
plot(summary.fit$bic)   # Lower is better
plot(summary.fit$cp)    # Lower is better
plot(summary.fit$rss)   # Lower is better (but RSS always decreases as variables are added)
plot(summary.fit$adjr2) # Higher is better
bic.min = which.min(summary.fit$bic)
bic.min
## [1] 3
plot(summary.fit$bic) # Lower is better
points(bic.min, summary.fit$bic[bic.min], col = 2, cex = 2, pch = 20) # Highlight the BIC minimum
plot(bss.model)
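The test error of the best-subset model is not estimated above; as an added sketch (not part of the original run), the BIC-selected model can be scored on the held-out observations. leaps provides no predict() method for regsubsets objects, so predictions are built from the coefficient vector by hand.
# Added sketch: validation-set MSE for the BIC-selected subset (object names are mine).
test.mat = model.matrix(crim ~ ., data = Boston[-train,])
bss.coef = coef(bss.model, id = bic.min)
bss.pred = test.mat[, names(bss.coef)] %*% bss.coef
bss.mse = mean((bss.pred - Boston$crim[-train])^2)
bss.mse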
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.0-2
x = model.matrix(crim ~ ., data = Boston)[,-1] # Remove intercept
y = Boston$crim
grid = 10^seq(10,-2, length=100)
lasso.model = glmnet(x[train,], y[train], alpha = 1, lambda = grid) # Lasso fit across the full lambda grid
plot(lasso.model)
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
cv.lasso = cv.glmnet(x[train,], y[train], alpha=1)
plot(cv.lasso)
best.lasso.lambda = cv.lasso$lambda.min
lasso.pred = predict(lasso.model, s=best.lasso.lambda, newx = x[-train,])
lasso.mse = mean((lasso.pred - y[-train])^2)
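cv.glmnet also reports lambda.1se, a more heavily penalized (hence sparser) choice; an added sketch of its test error, for comparison with lambda.min:
# Added sketch: test MSE at the one-standard-error lambda.
lasso.pred.1se = predict(lasso.model, s = cv.lasso$lambda.1se, newx = x[-train,])
mean((lasso.pred.1se - y[-train])^2)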
lasso.out = glmnet(x, y, alpha = 1, lambda = grid) # Refit the lasso on the full data set
lasso.coef = predict(lasso.out, type="coefficients", s=best.lasso.lambda)
lasso.coef
## 14 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 13.7339988760
## zn 0.0379731938
## indus -0.0727154962
## chas -0.6142076302
## nox -7.8988988799
## rm 0.2880813841
## age .
## dis -0.8429319393
## rad 0.5255308400
## tax -0.0004096812
## ptratio -0.2111769377
## black -0.0075435895
## lstat 0.1262817517
## medv -0.1682372500
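As an added check, the number of predictors the lasso keeps at lambda.min can be counted directly (age is the only coefficient set exactly to zero above):
# Added sketch: count non-zero lasso coefficients, excluding the intercept.
sum(as.vector(lasso.coef)[-1] != 0)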
ridge.model <- glmnet(x[train,], y[train], alpha = 0)
plot(ridge.model)
cv.ridge <- cv.glmnet(x[train,], y[train], alpha = 0)
plot(cv.ridge)
best.ridge.lambda = cv.ridge$lambda.min
which(cv.ridge$lambda == cv.ridge$lambda.min) # Index of lambda.min in the lambda sequence
## [1] 100
cv.ridge$lambda[100]
## [1] 0.519044
cv.ridge$lambda.min
## [1] 0.519044
ridge.predict = predict(ridge.model, s=best.ridge.lambda, newx = x[-train,])
ridge.mse = mean((ridge.predict - y[-train])^2)
ridge.mse; lasso.mse
## [1] 60.15076
## [1] 59.51466
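For context (an added baseline, not in the original analysis), the test MSE obtained by simply predicting the training-set mean crime rate for every test observation:
# Added sketch: intercept-only baseline against which to judge the fitted models.
baseline.mse = mean((mean(y[train]) - y[-train])^2)
baseline.mse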
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
pcr.model = pcr(crim ~ ., data = Boston, subset = train, scale=TRUE, validation = "CV")
validationplot(pcr.model, val.type = "MSEP")
summary(pcr.model)
## Data: X dimension: 354 13
## Y dimension: 354 1
## Fit method: svdpc
## Number of components considered: 13
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 8.085 6.691 6.689 6.22 6.173 6.191 6.245
## adjCV 8.085 6.685 6.683 6.21 6.164 6.182 6.233
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 6.225 6.069 6.053 6.052 6.072 6.066 6.013
## adjCV 6.213 6.056 6.032 6.038 6.059 6.051 5.999
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 48.34 61.18 70.44 77.16 83.41 88.28 91.51 93.81
## crim 33.34 33.76 43.13 44.14 44.15 44.20 44.63 47.57
## 9 comps 10 comps 11 comps 12 comps 13 comps
## X 95.52 97.14 98.48 99.50 100.00
## crim 48.08 48.12 48.13 48.46 49.37
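The choice of 9 components below is made by eye from the validation plot; as an added sketch, the component count minimizing the cross-validated RMSEP can also be read off programmatically (the first entry of the RMSEP array is the intercept-only model, hence the - 1):
# Added sketch: number of PCR components with the lowest cross-validated RMSEP.
pcr.cv.rmsep = RMSEP(pcr.model)$val["CV", 1, ]
which.min(pcr.cv.rmsep) - 1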
pcr.predict = predict(pcr.model, x[-train,], ncomp = 9) # 9 components: CV error is near its minimum and barely improves beyond this
pcr.mse = mean((pcr.predict - y[-train])^2)
ridge.mse; lasso.mse; pcr.mse
## [1] 60.15076
## [1] 59.51466
## [1] 62.4953
pls.model = plsr(crim ~ ., data = Boston, subset = train, scale=TRUE, validation = "CV")
summary(pls.model)
## Data: X dimension: 354 13
## Y dimension: 354 1
## Fit method: kernelpls
## Number of components considered: 13
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 8.085 6.485 6.149 6.142 6.114 6.078 6.073
## adjCV 8.085 6.480 6.137 6.124 6.095 6.062 6.056
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 6.102 6.086 6.085 6.085 6.084 6.084 6.084
## adjCV 6.082 6.068 6.066 6.067 6.066 6.066 6.066
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 47.88 57.45 63.67 72.38 78.51 80.90 83.76 87.25
## crim 37.55 46.29 48.13 48.79 48.99 49.26 49.34 49.36
## 9 comps 10 comps 11 comps 12 comps 13 comps
## X 90.08 95.02 97.03 98.47 100.00
## crim 49.37 49.37 49.37 49.37 49.37
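Before fixing 9 components for PLS as well, an added sketch of how sensitive the test MSE is to that choice (the CV error above is essentially flat beyond about 5 components):
# Added sketch: PLS test MSE for a few component counts.
sapply(c(2, 5, 9, 13), function(k)
  mean((predict(pls.model, x[-train,], ncomp = k) - y[-train])^2))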
pls.predict = predict(pls.model, x[-train,], ncomp = 9) # CV error is essentially flat beyond about 5 components
pls.mse = mean((pls.predict - y[-train])^2)
ridge.mse; lasso.mse; pcr.mse; pls.mse
## [1] 60.15076
## [1] 59.51466
## [1] 62.4953
## [1] 58.95384
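Collecting the four validation-set errors in one object makes the comparison explicit (an added convenience step; the names are mine):
# Added sketch: side-by-side comparison of the validation-set MSEs.
mse.all = c(ridge = ridge.mse, lasso = lasso.mse, pcr = pcr.mse, pls = pls.mse)
sort(mse.all)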
Comparing the validation-set MSEs (PLS 58.95, lasso 59.51, ridge 60.15, PCR 62.50), PLS with 9 components gives the lowest test error, with the lasso close behind. Because the gap is small, the lasso is an attractive final choice: its test error is essentially the same, and it yields a sparse, interpretable model. Note that all of these comparisons use held-out test error rather than training error. The chosen model does not involve all of the features: at lambda.min the lasso drops age entirely and shrinks the remaining coefficients, and this is consistent with best subset selection, where BIC favors a model with only 3 predictors.