p171
This question should be answered using the Weekly data set, which is part of the ISLR package. This data is similar in nature to the Smarket data from this chapter’s lab, except that it contains 1,089 weekly returns for 21 years, from the beginning of 1990 to the end of 2010.
Produce some numerical and graphical summaries of the Weekly data. Do there appear to be any patterns?
Use the full data set to perform a logistic regression with Direction as the response and the five lag variables plus Volume as predictors. Use the summary function to print the results. Do any of the predictors appear to be statistically significant? If so, which ones?
Compute the confusion matrix and overall fraction of correct predictions. Explain what the confusion matrix is telling you about the types of mistakes made by logistic regression.
Now fit the logistic regression model using a training data period from 1990 to 2008, with Lag2 as the only predictor. Compute the confusion matrix and the overall fraction of correct predictions for the held out data (that is, the data from 2009 and 2010).
Repeat (d) using LDA
Repeat (d) using QDA
Repeat (d) using KNN with K = 1.
Which of these methods appears to provide the best results on this data?
Experiment with different combinations of predictors, including possible transformations and interactions, for each of the methods. Report the variables, method, and associated confusion matrix that appears to provide the best results on the held out data. Note that you should also experiment with values for K in the KNN classifier.
library(ISLR)      # provides the Weekly data set
library(tidyverse)
# (a) Numerical summary: 1,089 weekly returns from 1990 through 2010.
# The five lag variables and Today share nearly identical distributions.
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
# Graphical summary: scatterplot matrix of every pair of variables.
# NOTE(review): the only visible pattern is typically Year vs Volume
# (trading volume rising over time) — confirm on the plot itself.
pairs(Weekly)
names(Weekly)
## [1] "Year" "Lag1" "Lag2" "Lag3" "Lag4" "Lag5"
## [7] "Volume" "Today" "Direction"
# (b) Logistic regression on the FULL data set:
# Direction ~ five lag variables + Volume.
glm.fit1 = glm(Direction ~ Lag1+Lag2+Lag3+Lag4+Lag5+Volume, data = Weekly, family = binomial) # binomial for logistic regression
summary(glm.fit1)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
# Only Lag2 is statistically significant at the 5% level (p = 0.0296).
Only Lag2 appears statistically significant (Pr(>|z|) = 0.0296 < 0.05); every other predictor has Pr(>|z|) > 0.05.
# Fitted P(Direction = "Up") on the training data; type = "response"
# returns probabilities rather than the log-odds.
glm.probs = predict(glm.fit1, type = "response")
# Confirm the dummy coding: "Up" is the positive class (coded 1).
contrasts(Weekly$Direction)
## Up
## Down 0
## Up 1
summary(Weekly$Direction)
## Down Up
## 484 605
Create prediction table
# Total number of weeks in the data set.
nrow(Weekly)
## [1] 1089
# Start with every week predicted "Down", then flip to "Up" wherever
# the fitted probability exceeds the 0.5 threshold.
glm.pred = rep("Down", nrow(Weekly))
glm.pred[glm.probs > .5] = "Up"
#glm.pred
summary(glm.pred)
## Length Class Mode
## 1089 character character
Diagonals indicate correct predictions. Off-diagonals indicate incorrect predictions.
# (c) Confusion matrix: rows = predicted class, columns = true Direction.
table(glm.pred, Weekly$Direction)
##
## glm.pred Down Up
## Down 54 48
## Up 430 557
# glm.p Down Up
# Down 54 48
# Up 430 557
430 false "Up" predictions (predicted Up, but the market went Down).
48 false "Down" predictions (predicted Down, but the market went Up).
# Baseline: always predicting "Up" would be right 605 of 1,089 weeks.
# (The original used 1809, a typo for n = 1089 = nrow(Weekly).)
605/1089
## [1] 0.5555556
# Of the weeks predicted "Up", the fraction that were actually "Down".
430/(430 + 557)
## [1] 0.4356636
# False "Up" predictions as a fraction of all 1,089 weeks.
430/1089
## [1] 0.3948576
# False "Down" predictions as a fraction of all 1,089 weeks.
48/1089
## [1] 0.04407713
# Overall fraction correct: diagonal of the confusion matrix over n.
# (Original computed (430 + 557)/1806 — wrong cells and a typo'd n;
# the correct value matches mean(glm.pred == Weekly$Direction) below.)
(54 + 557)/1089
## [1] 0.5610652
mean(glm.pred == Weekly$Direction) ## Correct 56%
## [1] 0.5610652
mean(glm.pred != Weekly$Direction) ## Incorrect
## [1] 0.4389348
summary(Weekly$Year)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1990 1995 2000 2000 2005 2010
# (d) Train/test split: fit on 1990-2008, hold out 2009-2010.
train = (Weekly$Year<2009)
test_data_weekly.2009 = Weekly[!train,]
Direction.2009 = Weekly$Direction[!train] # The Y's, the Response
# Logistic regression with Lag2 as the ONLY predictor, trained on 1990-2008.
glm.fit1 = glm(Direction ~Lag2, data = Weekly, family = binomial, subset = train) # Train Data
# Predicted probabilities and 0.5-threshold classes for the held-out weeks.
glm.probs2 = predict(glm.fit1, test_data_weekly.2009, type = "response")
glm.pred = rep("Down", nrow(test_data_weekly.2009))
glm.pred[glm.probs2 > .5] = "Up"
# Confusion matrix on the 104 held-out weeks (2009-2010).
table(glm.pred, Direction.2009)
## Direction.2009
## glm.pred Down Up
## Down 9 5
## Up 34 56
mean(glm.pred == Direction.2009) # % Correct Predictions
## [1] 0.625
mean(glm.pred != Direction.2009) # Compute and test error rate
## [1] 0.375
library(MASS)
# (e) Repeat (d) with linear discriminant analysis, same predictor (Lag2)
# and same 1990-2008 training period.
lda.fit1 = lda(Direction ~ Lag2,
data = Weekly,
subset = train) # train boolean vector
lda.fit1
## Call:
## lda(Direction ~ Lag2, data = Weekly, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
##
## Coefficients of linear discriminants:
## LD1
## Lag2 0.4414162
summary(lda.fit1)
## Length Class Mode
## prior 2 -none- numeric
## counts 2 -none- numeric
## means 2 -none- numeric
## scaling 1 -none- numeric
## lev 2 -none- character
## svd 1 -none- numeric
## N 1 -none- numeric
## call 4 -none- call
## terms 3 terms call
## xlevels 0 -none- list
#lda.pred = predict(lda.fit1, test_data_weekly.2009, type = "response") # not needed
# predict() on an lda fit returns a list with $class, $posterior, $x.
lda.pred = predict(lda.fit1, test_data_weekly.2009) # This one!! Default type="response"?
summary(lda.pred) # class, posterior, x - See Lab
## Length Class Mode
## class 104 factor numeric
## posterior 208 -none- numeric
## x 104 -none- numeric
lda.class = lda.pred$class
table(lda.class, Direction.2009) # Confusion Matrix Predicted vs Truth
## Direction.2009
## lda.class Down Up
## Down 9 5
## Up 34 56
Direction.2009
## [1] Down Down Down Down Up Down Down Down Down Up Up Up Up Up Up
## [16] Down Up Up Down Up Up Up Up Down Down Down Down Up Up Up
## [31] Up Down Up Up Down Up Up Down Down Up Up Down Down Up Up
## [46] Down Up Up Up Down Up Down Up Down Down Down Down Up Up Down
## [61] Up Up Up Up Up Up Down Up Down Down Up Down Up Down Up
## [76] Up Down Down Up Down Up Down Up Down Down Down Up Up Up Up
## [91] Down Up Up Up Up Up Down Up Down Up Up Up Up Up
## Levels: Down Up
mean(lda.class == Direction.2009) # Accurate 62% of the time
## [1] 0.625
sum(lda.pred$posterior[,1] >=.5) # Column 1 is "Down": 14 of the 104 test weeks have P(Down) >= .5
## [1] 14
sum(lda.pred$posterior[,1] <.5) # the remaining 90 are classified "Up"
## [1] 90
head(lda.pred$posterior)
## Down Up
## 986 0.4736555 0.5263445
## 987 0.3558617 0.6441383
## 988 0.5132860 0.4867140
## 989 0.5142948 0.4857052
## 990 0.4799727 0.5200273
## 991 0.4597586 0.5402414
sum(lda.pred$posterior[,1] >.9) # Want only over 90% posterior probability
## [1] 0
# (f) Repeat (d) with quadratic discriminant analysis, same setup.
qda.fit = qda(Direction ~ Lag2,
data = Weekly,
subset = train) # train boolean vector
qda.fit
## Call:
## qda(Direction ~ Lag2, data = Weekly, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
summary(qda.fit)
## Length Class Mode
## prior 2 -none- numeric
## counts 2 -none- numeric
## means 2 -none- numeric
## scaling 2 -none- numeric
## ldet 2 -none- numeric
## lev 2 -none- character
## N 1 -none- numeric
## call 4 -none- call
## terms 3 terms call
## xlevels 0 -none- list
qda.pred = predict(qda.fit, test_data_weekly.2009)
qda.class = qda.pred$class
# QDA predicts "Up" for every held-out week, so it is right exactly
# as often as the market went up (61 of 104 weeks).
table(qda.class, Direction.2009) # Confusion Matrix
## Direction.2009
## qda.class Down Up
## Down 0 0
## Up 43 61
mean(qda.class == Direction.2009) # Accurate 58% of the time
## [1] 0.5865385
library(class)
# (g) Repeat (d) with KNN, K = 1. knn() wants matrix inputs:
# training X, test X, and the training class labels.
train.X = as.matrix(Weekly$Lag2[train])
test.X = as.matrix(Weekly$Lag2[!train])
train.Direction = Weekly$Direction[train]
summary(Weekly$Direction[train])
## Down Up
## 441 544
summary(train.Direction)
## Down Up
## 441 544
length(train.Direction)
## [1] 985
length(train.X)
## [1] 985
length(test.X)
## [1] 104
# Seed fixed because knn() breaks distance ties at random.
set.seed(1)
dim(train.X)
## [1] 985 1
dim(test.X)
## [1] 104 1
knn.pred = knn(train.X, test.X, train.Direction, k=1)
table(knn.pred, Direction.2009)
## Direction.2009
## knn.pred Down Up
## Down 21 30
## Up 22 31
Direction.2009
## [1] Down Down Down Down Up Down Down Down Down Up Up Up Up Up Up
## [16] Down Up Up Down Up Up Up Up Down Down Down Down Up Up Up
## [31] Up Down Up Up Down Up Up Down Down Up Up Down Down Up Up
## [46] Down Up Up Up Down Up Down Up Down Down Down Down Up Up Down
## [61] Up Up Up Up Up Up Down Up Down Down Up Down Up Down Up
## [76] Up Down Down Up Down Up Down Up Down Down Down Up Up Up Up
## [91] Down Up Up Up Up Up Down Up Down Up Up Up Up Up
## Levels: Down Up
mean(knn.pred == Direction.2009) # 50% correct
## [1] 0.5
(h) Logistic regression and LDA perform best, each correct 62.5% of the time on the held-out data (their confusion matrices are identical here); QDA reaches 58.7% only by always predicting Up, and KNN with K = 1 is right just 50% of the time.