p155
library(ISLR)
names(Smarket)
## [1] "Year" "Lag1" "Lag2" "Lag3" "Lag4" "Lag5"
## [7] "Volume" "Today" "Direction"
dim(Smarket)
## [1] 1250 9
summary(Smarket)
## Year Lag1 Lag2 Lag3
## Min. :2001 Min. :-4.922000 Min. :-4.922000 Min. :-4.922000
## 1st Qu.:2002 1st Qu.:-0.639500 1st Qu.:-0.639500 1st Qu.:-0.640000
## Median :2003 Median : 0.039000 Median : 0.039000 Median : 0.038500
## Mean :2003 Mean : 0.003834 Mean : 0.003919 Mean : 0.001716
## 3rd Qu.:2004 3rd Qu.: 0.596750 3rd Qu.: 0.596750 3rd Qu.: 0.596750
## Max. :2005 Max. : 5.733000 Max. : 5.733000 Max. : 5.733000
## Lag4 Lag5 Volume Today
## Min. :-4.922000 Min. :-4.92200 Min. :0.3561 Min. :-4.922000
## 1st Qu.:-0.640000 1st Qu.:-0.64000 1st Qu.:1.2574 1st Qu.:-0.639500
## Median : 0.038500 Median : 0.03850 Median :1.4229 Median : 0.038500
## Mean : 0.001636 Mean : 0.00561 Mean :1.4783 Mean : 0.003138
## 3rd Qu.: 0.596750 3rd Qu.: 0.59700 3rd Qu.:1.6417 3rd Qu.: 0.596750
## Max. : 5.733000 Max. : 5.73300 Max. :3.1525 Max. : 5.733000
## Direction
## Down:602
## Up :648
##
##
##
##
cor(Smarket[,-9]) # Skip col 9 because it's qualitative
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 0.029699649 0.030596422 0.033194581 0.035688718
## Lag1 0.02969965 1.000000000 -0.026294328 -0.010803402 -0.002985911
## Lag2 0.03059642 -0.026294328 1.000000000 -0.025896670 -0.010853533
## Lag3 0.03319458 -0.010803402 -0.025896670 1.000000000 -0.024051036
## Lag4 0.03568872 -0.002985911 -0.010853533 -0.024051036 1.000000000
## Lag5 0.02978799 -0.005674606 -0.003557949 -0.018808338 -0.027083641
## Volume 0.53900647 0.040909908 -0.043383215 -0.041823686 -0.048414246
## Today 0.03009523 -0.026155045 -0.010250033 -0.002447647 -0.006899527
## Lag5 Volume Today
## Year 0.029787995 0.53900647 0.030095229
## Lag1 -0.005674606 0.04090991 -0.026155045
## Lag2 -0.003557949 -0.04338321 -0.010250033
## Lag3 -0.018808338 -0.04182369 -0.002447647
## Lag4 -0.027083641 -0.04841425 -0.006899527
## Lag5 1.000000000 -0.02200231 -0.034860083
## Volume -0.022002315 1.00000000 0.014591823
## Today -0.034860083 0.01459182 1.000000000
attach(Smarket)
#detach(Smarket)
plot(Smarket$Volume)
p156
glm.fit1 = glm(Direction ~ Lag1+Lag2+Lag3+Lag4+Lag5+Volume,
data = Smarket,
family = binomial) # binomial for logistic regression
summary(glm.fit1)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Smarket)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.446 -1.203 1.065 1.145 1.326
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.126000 0.240736 -0.523 0.601
## Lag1 -0.073074 0.050167 -1.457 0.145
## Lag2 -0.042301 0.050086 -0.845 0.398
## Lag3 0.011085 0.049939 0.222 0.824
## Lag4 0.009359 0.049974 0.187 0.851
## Lag5 0.010313 0.049511 0.208 0.835
## Volume 0.135441 0.158360 0.855 0.392
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1731.2 on 1249 degrees of freedom
## Residual deviance: 1727.6 on 1243 degrees of freedom
## AIC: 1741.6
##
## Number of Fisher Scoring iterations: 3
coef(glm.fit1)
## (Intercept) Lag1 Lag2 Lag3 Lag4 Lag5
## -0.126000257 -0.073073746 -0.042301344 0.011085108 0.009358938 0.010313068
## Volume
## 0.135440659
summary(glm.fit1)$coef
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.126000257 0.24073574 -0.5233966 0.6006983
## Lag1 -0.073073746 0.05016739 -1.4565986 0.1452272
## Lag2 -0.042301344 0.05008605 -0.8445733 0.3983491
## Lag3 0.011085108 0.04993854 0.2219750 0.8243333
## Lag4 0.009358938 0.04997413 0.1872757 0.8514445
## Lag5 0.010313068 0.04951146 0.2082966 0.8349974
## Volume 0.135440659 0.15835970 0.8552723 0.3924004
summary(glm.fit1)$coef[,4] # Column 4
## (Intercept) Lag1 Lag2 Lag3 Lag4 Lag5
## 0.6006983 0.1452272 0.3983491 0.8243333 0.8514445 0.8349974
## Volume
## 0.3924004
p157
P(Y=1|X)
If no data set is supplied to the predict() function, then the probabilities are computed for the training data that was used to fit the logistic regression mode
glm.probs = predict(glm.fit1, type = "response")
glm.probs[1:6]
## 1 2 3 4 5 6
## 0.5070841 0.4814679 0.4811388 0.5152224 0.5107812 0.5069565
#View(glm.probs)
contrasts(Smarket$Direction)
## Up
## Down 0
## Up 1
#glm.probs
glm.pred = rep("Down", 1250)
glm.pred[glm.probs > .5] = "Up"
summary(glm.pred)
## Length Class Mode
## 1250 character character
# Produce a confusion matrix
# Diagonals indicate correct predictions. Off-diagonals indicate incorrect predictions.
table(glm.pred, Smarket$Direction)
##
## glm.pred Down Up
## Down 145 141
## Up 457 507
(507 + 145) / 1250 # Correct 52.16%
## [1] 0.5216
mean(glm.pred == Smarket$Direction)
## [1] 0.5216
head(glm.pred, 5)
## [1] "Up" "Down" "Down" "Up" "Up"
head(Smarket$Direction, 5)
## [1] Up Up Down Up Up
## Levels: Down Up
p159
train = (Smarket$Year<2005)
Smarket.2005 = Smarket[!train,]
dim(Smarket.2005)
## [1] 252 9
Direction.2005 = Smarket$Direction[!train] # Vector
Smarket.2005[1:6,]
## Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
## 999 2005 -0.134 0.008 -0.007 0.715 -0.431 0.7869 -0.812 Down
## 1000 2005 -0.812 -0.134 0.008 -0.007 0.715 1.5108 -1.167 Down
## 1001 2005 -1.167 -0.812 -0.134 0.008 -0.007 1.7210 -0.363 Down
## 1002 2005 -0.363 -1.167 -0.812 -0.134 0.008 1.7389 0.351 Up
## 1003 2005 0.351 -0.363 -1.167 -0.812 -0.134 1.5691 -0.143 Down
## 1004 2005 -0.143 0.351 -0.363 -1.167 -0.812 1.4779 0.342 Up
head(Smarket.2005, 4)
## Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
## 999 2005 -0.134 0.008 -0.007 0.715 -0.431 0.7869 -0.812 Down
## 1000 2005 -0.812 -0.134 0.008 -0.007 0.715 1.5108 -1.167 Down
## 1001 2005 -1.167 -0.812 -0.134 0.008 -0.007 1.7210 -0.363 Down
## 1002 2005 -0.363 -1.167 -0.812 -0.134 0.008 1.7389 0.351 Up
glm.fit2 = glm(Direction ~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,
data = Smarket,
family = binomial, subset = train)
glm.probs2 = predict(glm.fit2, Smarket.2005, type = "response")
glm.pred = rep("Down", 252) # nrow(Smarket.2005)
glm.pred[glm.probs2 > .5] = "Up"
table(glm.pred, Direction.2005)
## Direction.2005
## glm.pred Down Up
## Down 77 97
## Up 34 44
mean(glm.pred == Direction.2005)
## [1] 0.4801587
mean(glm.pred != Direction.2005) # Compute and test error rate
## [1] 0.5198413
remove +Lag3+Lag4+Lag5+Volume
glm.fit1 = glm(Direction ~Lag1+Lag2, data = Smarket, family = binomial, subset = train) # Train Data
glm.probs2 = predict(glm.fit1, Smarket.2005, type = "response")
head(Smarket.2005, 5)
## Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
## 999 2005 -0.134 0.008 -0.007 0.715 -0.431 0.7869 -0.812 Down
## 1000 2005 -0.812 -0.134 0.008 -0.007 0.715 1.5108 -1.167 Down
## 1001 2005 -1.167 -0.812 -0.134 0.008 -0.007 1.7210 -0.363 Down
## 1002 2005 -0.363 -1.167 -0.812 -0.134 0.008 1.7389 0.351 Up
## 1003 2005 0.351 -0.363 -1.167 -0.812 -0.134 1.5691 -0.143 Down
glm.pred = rep("Down", 252)
glm.pred[glm.probs2 > .5] = "Up"
table(glm.pred, Direction.2005)
## Direction.2005
## glm.pred Down Up
## Down 35 35
## Up 76 106
mean(glm.pred == Direction.2005)
## [1] 0.5595238
106 / (106 + 76)
## [1] 0.5824176
predict(glm.fit1, newdata = data.frame(Lag1=c(1.2,1.5), Lag2=c(1.1,-0.8)), type = "response")
## 1 2
## 0.4791462 0.4960939
p161
library(MASS)
Same format as glm: glm(Direction ~Lag1+Lag2, data = Smarket, family = binomial, subset = train) - family
lda.fit = lda(Direction ~Lag1+Lag2, data = Smarket, subset = train)
lda.fit
## Call:
## lda(Direction ~ Lag1 + Lag2, data = Smarket, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.491984 0.508016
##
## Group means:
## Lag1 Lag2
## Down 0.04279022 0.03389409
## Up -0.03954635 -0.03132544
##
## Coefficients of linear discriminants:
## LD1
## Lag1 -0.6420190
## Lag2 -0.5135293
lda.pred = predict(lda.fit, Smarket.2005)
names(lda.pred)
## [1] "class" "posterior" "x"
lda.class = lda.pred$class
table(lda.class, Direction.2005)
## Direction.2005
## lda.class Down Up
## Down 35 35
## Up 76 106
mean(lda.class == Direction.2005)
## [1] 0.5595238
sum(lda.pred$posterior[,1] >=.5)
## [1] 70
sum(lda.pred$posterior[,1] <.5)
## [1] 182
sum(lda.pred$posterior[,1] >.9) # Want only over 90% posterior probability
## [1] 0
# 0 !!!
p163
Same format as lda()
qda.fit = qda(Direction ~Lag1+Lag2, data = Smarket, subset = train)
qda.fit
## Call:
## qda(Direction ~ Lag1 + Lag2, data = Smarket, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.491984 0.508016
##
## Group means:
## Lag1 Lag2
## Down 0.04279022 0.03389409
## Up -0.03954635 -0.03132544
qda.pred = predict(qda.fit, Smarket.2005)
names(qda.pred)
## [1] "class" "posterior"
qda.class = qda.pred$class
table(qda.class,Direction.2005)
## Direction.2005
## qda.class Down Up
## Down 30 20
## Up 81 121
mean(qda.class == Direction.2005) # Accurate 60% of the time
## [1] 0.5992063
p163
data(package=“ISLR”)
knn() requires 4 inputs
library(class) # knn()
train.X = cbind(Smarket$Lag1, Smarket$Lag2)[train,]
#View(train.X)
test.X = cbind(Smarket$Lag1, Smarket$Lag2)[!train,] # ! wrong??
train.Direction = Smarket$Direction[train]
#train.Direction
#Direction[train]
dim(train.X)
## [1] 998 2
summary(Smarket$Direction)
## Down Up
## 602 648
set.seed(1)
knn.pred = knn(train.X, test.X, train.Direction, k=1)
table(knn.pred,Direction.2005)
## Direction.2005
## knn.pred Down Up
## Down 43 58
## Up 68 83
(83+43)/252
## [1] 0.5
knn.pred = knn(train.X, test.X, train.Direction, k=3)
table(knn.pred, Direction.2005)
## Direction.2005
## knn.pred Down Up
## Down 48 54
## Up 63 87
mean(knn.pred == Direction.2005)
## [1] 0.5357143
detach(Smarket) # Cleanup so we don’t have bad references to wrong table