
Question

We now use boosting to predict Salary in the Hitters data set.

  1. Remove the observations for whom the salary information is unknown, and then log-transform the salaries.

  2. Create a training set consisting of the first 200 observations, and a test set consisting of the remaining observations.

  3. Perform boosting on the training set with 1,000 trees for a range of values of the shrinkage parameter λ. Produce a plot with different shrinkage values on the x-axis and the corresponding training set MSE on the y-axis.

  4. Produce a plot with different shrinkage values on the x-axis and the corresponding test set MSE on the y-axis.

  5. Compare the test MSE of boosting to the test MSE that results from applying two of the regression approaches seen in Chapters 3 and 6.

  6. Which variables appear to be the most important predictors in the boosted model?

  7. Now apply bagging to the training set. What is the test set MSE for this approach?


library(gbm)
library(ISLR)

10a

Removing the observations whose Salary is missing (na.omit drops every row containing an NA; in Hitters only Salary has missing values)

hitters = na.omit(Hitters)

# Log-transforming the salaries (log1p computes log(1 + Salary))
hitters$Salary = log1p(hitters$Salary)
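As a quick optional check (not asked for by the question), the number of dropped rows and the range of the transformed salaries can be inspected:

# Comparing dimensions before and after na.omit, and summarizing the transformed salaries
dim(Hitters); dim(hitters)
summary(hitters$Salary)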

10b

set.seed(1)

# Sampling 200 row indices for the training set (the question asks for the first 200 observations; a random split is used here instead)
train = sample(1:nrow(hitters), 200)

10c

# Creating a vector of shrinkage (lambda) values to try
shrinkage.parameters = c(0.001, 0.01, 0.05, 0.1, 0.2)

# Training MSE placeholder vector
train.mse = rep(0, length(shrinkage.parameters))

# Test MSE placeholder vector
test.mse = rep(0, length(shrinkage.parameters))

# Fitting a boosting model trying all values in shrinkage.parameters
for (i in 1:length(shrinkage.parameters)) {
  
  # Fitting a boosting model with 1,000 trees and shrinkage = shrinkage.parameters[i]
  boost.hitters = gbm(
    Salary~.,
    data=hitters[train,],
    distribution="gaussian",
    n.trees=1000,
    shrinkage=shrinkage.parameters[i]
  )
  
  # Predicting on the train data
  yhat.train = predict(boost.hitters, newdata=hitters[train,], n.trees=1000)
  
  # Calculating the train MSE and storing in the train.mse vector
  train.mse[i] = mean((yhat.train-hitters[train,]$Salary)^2)
  
  # Predicting on the test data
  yhat.test = predict(boost.hitters, newdata=hitters[-train,], n.trees=1000)
  
  # Calculating the test MSE and storing it in the test.mse vector
  test.mse[i] = mean((yhat.test-hitters[-train,]$Salary)^2)
}

Plotting the training MSE for each shrinkage parameter used

plot(shrinkage.parameters, train.mse, type="o", col="blue", pch="o", lty=1, xlab="Shrinkage", ylab="Training MSE")

10d

COMMENTS: The test MSE for each shrinkage value was already computed inside the loop in 10c; it is compared with the other approaches in 10(e).

Plotting the test MSE for each shrinkage parameter used

plot(shrinkage.parameters, test.mse, type="o", col="blue", pch="o", lty=1, xlab="Shrinkage", ylab="Test MSE")
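If helpful, the shrinkage value that attains the lowest test MSE can be read directly off the vectors built in the loop above (a small sketch using those objects):

# Locating the shrinkage value with the smallest test MSE
best.idx = which.min(test.mse)
shrinkage.parameters[best.idx]
test.mse[best.idx]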

10e

Fitting a Linear Model

## Fitting a linear model on hitters
lm.fit = lm(Salary~., data=hitters, subset=train)

## Predicting on the test set
lm.preds = predict(lm.fit, newdata=hitters[-train,])

## Calculating the MSE of linear model
lm.mse = mean((lm.preds-hitters[-train,]$Salary)^2)

Fitting a Lasso Model

library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.0-2
## Preparing the data
x = model.matrix(Salary ~ ., hitters)[,-1]
y = hitters$Salary

## Creating a grid of lambda values
grid = 10^seq(10, -2, length=100)

## Fitting a Lasso model
lasso.fit = glmnet(
  x[train,],
  y[train],
  alpha=1,
  lambda=grid
)

Plotting the lasso coefficient paths

plot(lasso.fit)
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values

Running cross-validation to choose the best lambda

cv.lasso = cv.glmnet(x[train,], y[train], alpha=1)
plot(cv.lasso)

Getting the best lambda value

bestlam = cv.lasso$lambda.min

## Applying lasso with best lambda on test data
lasso.pred = predict(lasso.fit, s=bestlam, newx=x[-train,])

## Calculating the MSE
lasso.mse = mean((lasso.pred - y[-train])^2)
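As an optional side check, the coefficients at the CV-chosen lambda show which predictors the lasso actually keeps:

## Extracting the coefficients at the best lambda (nonzero entries are the retained predictors)
lasso.coef = predict(lasso.fit, type="coefficients", s=bestlam)
lasso.coef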

## Comparing test MSE for boosting, linear regression, and lasso regression
test.mse[3];lm.mse;lasso.mse
## [1] 0.3559185
## [1] 0.4811669
## [1] 0.4678237

COMMENTS: Boosting gave the lowest test MSE, followed by the lasso and then the linear model.
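For a single side-by-side view, the three test MSEs can also be collected into one named vector (here using the smallest boosting MSE over the shrinkage grid rather than a fixed index):

# Gathering the test MSEs into a sorted, named vector
mse.comparison = c(boosting=min(test.mse), linear=lm.mse, lasso=lasso.mse)
sort(mse.comparison)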

10f

summary(boost.hitters)

##                 var    rel.inf
## CAtBat       CAtBat 18.9750805
## CRBI           CRBI 15.0208454
## CRuns         CRuns 11.7137509
## PutOuts     PutOuts  7.0536893
## CWalks       CWalks  6.8647654
## Walks         Walks  6.1242649
## RBI             RBI  4.0188027
## AtBat         AtBat  3.7895846
## Hits           Hits  3.7757352
## Years         Years  3.5115230
## HmRun         HmRun  3.2870297
## CHmRun       CHmRun  3.0913008
## Runs           Runs  3.0652202
## CHits         CHits  2.9362552
## Assists     Assists  2.6026815
## Errors       Errors  2.1362480
## League       League  0.9036578
## Division   Division  0.7624828
## NewLeague NewLeague  0.3670824

COMMENTS: Judging by relative influence, the most important predictors of Salary are CAtBat, CRBI, and CRuns, followed by PutOuts, CWalks, and Walks.
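To see how the highest-ranked variables relate to Salary, gbm's partial dependence plots can be drawn for them; a short sketch for the two most influential predictors from the table above:

# Partial dependence of log-salary on the two most influential predictors
plot(boost.hitters, i="CAtBat")
plot(boost.hitters, i="CRBI")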

10g

Fitting a bagging model

library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
# Bagging is a random forest that considers all predictors at each split (mtry = number of predictors)
bagging.fit = randomForest(
  Salary~.,
  data=hitters,
  subset=train,
  mtry=ncol(hitters) - 1,
  importance=TRUE
)

# Predicting test data using bagging model
yhat.bag = predict(bagging.fit, newdata=hitters[-train,])

# Calculating the MSE of bagging model
bag.mse = mean((yhat.bag-hitters[-train,]$Salary)^2)

Comparing the boosting test MSE with the bagging test MSE

test.mse[3]; bag.mse
## [1] 0.3559185
## [1] 0.2425952

COMMENTS: Bagging achieves a lower test MSE than boosting on this split.
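Since the bagged model was fit with importance=TRUE, its variable importance measures can also be examined and compared with the boosting rankings above:

# Variable importance from the bagging fit (random forest with mtry = p)
importance(bagging.fit)
varImpPlot(bagging.fit)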