ISLR Home

p165

library(ISLR) # provides the Caravan data set used throughout
dim(Caravan) # number of rows and columns
## [1] 5822   86
# View(Caravan)
# NOTE(review): attach() is an anti-pattern and nothing in this section needs
# it (all references are Caravan$... or data = Caravan). It is kept only in
# case code outside this section relies on the attached column names.
attach(Caravan)
summary(Caravan$Purchase) # class counts of the response
##   No  Yes 
## 5474  348
# Share of customers who purchased (about 6%). Computed from the data instead
# of from hand-copied counts so it cannot drift out of sync with the table.
mean(Caravan$Purchase == "Yes")
## [1] 0.05977327

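Variables such as salary and age are measured on different scales, so the predictors should be standardized before computing KNN distances; otherwise the large-scale variables dominate.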

# Distribution of the first predictor on its original (unscaled) scale.
summary(Caravan[[1]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   10.00   30.00   24.25   35.00   41.00

After the scaling step below, every column of standardized.X has a standard deviation of one and a mean of zero.

Scale Data

# Standardize every column except column 86 (presumably the Purchase response,
# which is not numeric). scale() returns a numeric matrix.
standardized.X <- scale(Caravan[, -86])
summary(standardized.X[, 1])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -1.8101 -1.1095  0.4473  0.0000  0.8365  1.3036
# Before scaling, the first two predictors have very different variances ...
var(Caravan[[1]])
## [1] 165.0378
var(Caravan[[2]])
## [1] 0.1647078
# ... after scaling, both have unit variance
var(standardized.X[, 1])
## [1] 1
var(standardized.X[, 2])
## [1] 1
# and the column mean is zero up to floating-point error.
mean(standardized.X[, 1])
## [1] -7.025576e-17

Split Data into Train/Test

# Row indices of the held-out test set: the first 1000 observations.
test <- seq_len(1000)

# Standardized predictors, split into training (all other rows) and test rows.
train.X <- standardized.X[-test, ]
test.X <- standardized.X[test, ]

# Response labels aligned with the two predictor matrices.
train.Y <- Caravan$Purchase[-test]
test.Y <- Caravan$Purchase[test]

KNN Model

library(class) # provides knn()
# knn() breaks ties at random, so fix the seed to make predictions repeatable.
set.seed(1)
knn.pred <- knn(train = train.X, test = test.X, cl = train.Y, k = 1)

Evaluate

# Test error rate of 1-nearest-neighbour: 0.118 (11.8%).
mean(test.Y != knn.pred)
## [1] 0.118
# Baseline: error rate of always predicting "No" (share of "Yes" in the test
# set): 0.059 (about 6%).
mean(test.Y != "No")
## [1] 0.059
table(knn.pred, test.Y) # confusion matrix: rows = predicted, columns = actual
##         test.Y
## knn.pred  No Yes
##      No  873  50
##      Yes  68   9
# Predicted-"Yes" row: 68 wrong, 9 right.
# Success rate among predicted purchasers (precision), 11.7%. Computed from
# the predictions rather than from numbers copied out of the table above.
mean(test.Y[knn.pred == "Yes"] == "Yes")
## [1] 0.1168831

K=3

# Refit with 3 nearest neighbours and tabulate predictions against the truth.
knn.pred <- knn(train.X, test.X, train.Y, k = 3)
table(knn.pred, test.Y) # confusion matrix: rows = predicted, columns = actual
##         test.Y
## knn.pred  No Yes
##      No  920  54
##      Yes  21   5
# Predicted-"Yes" row: 21 wrong, 5 right.
# Success rate among predicted purchasers (precision), 19.2%. Computed from
# the predictions rather than from numbers copied out of the table above.
mean(test.Y[knn.pred == "Yes"] == "Yes")
## [1] 0.1923077

K=5

# Refit with 5 nearest neighbours and tabulate predictions against the truth.
knn.pred <- knn(train.X, test.X, train.Y, k = 5)
table(knn.pred, test.Y) # confusion matrix: rows = predicted, columns = actual
##         test.Y
## knn.pred  No Yes
##      No  930  55
##      Yes  11   4
# Predicted-"Yes" row: 11 wrong, 4 right.
# Success rate among predicted purchasers (precision), 26.7%. Computed from
# the predictions rather than from numbers copied out of the table above.
mean(test.Y[knn.pred == "Yes"] == "Yes")
## [1] 0.2666667

Logistic Regression Model

1. Fit

# Logistic regression of Purchase on all other columns, fitted only on the
# rows outside the test set (subset = -test).
glm.fits <- glm(
  Purchase ~ .,
  data = Caravan,
  family = binomial,
  subset = -test
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

2. Predict

# Predicted probabilities (type = "response") for the held-out test rows.
glm.probs <- predict(glm.fits, newdata = Caravan[test, ], type = "response")

3. Evaluate

# Classify with a 0.5 cutoff: start with every test case as "No", then flip to
# "Yes" where the predicted probability exceeds the cutoff. The vector is
# sized from glm.probs instead of a hard-coded 1000 so it always matches the
# number of test predictions.
glm.pred <- rep("No", length(glm.probs))
glm.pred[glm.probs > 0.5] <- "Yes"
table(glm.pred, test.Y) # confusion matrix: rows = predicted, columns = actual
##         test.Y
## glm.pred  No Yes
##      No  934  59
##      Yes   7   0

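Predicted "Yes" row: 7 0 — with a 0.5 cutoff only 7 test customers are predicted to purchase, and none of them actually did.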

# Classify again with a lower 0.25 cutoff. The vector is sized from glm.probs
# instead of a hard-coded 1000 so it always matches the number of test
# predictions.
glm.pred <- rep("No", length(glm.probs))
glm.pred[glm.probs > 0.25] <- "Yes"
table(glm.pred, test.Y) # confusion matrix: rows = predicted, columns = actual
##         test.Y
## glm.pred  No Yes
##      No  919  48
##      Yes  22  11
# Predicted-"Yes" row: 22 wrong, 11 right.
# Success rate among predicted purchasers (precision), 33.3%. Computed from
# the predictions rather than from numbers copied out of the table above.
mean(test.Y[glm.pred == "Yes"] == "Yes")
## [1] 0.3333333