Generate a simulated two-class data set with 100 observations and two features in which there is a visible but non-linear separation between the two classes.
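For reference, one way (not used in the code below) to simulate 100 observations with a visibly non-linear class boundary is to separate the classes with a quadratic curve; this is only a sketch of an alternative generation scheme:
# Alternative data generation (illustrative only): 100 observations whose
# classes are separated by the curve x1^2 = x2^2, so no straight line
# separates them well.
set.seed(1)
x1 = runif(100) - 0.5
x2 = runif(100) - 0.5
y  = ifelse(x1^2 - x2^2 > 0, 1, -1)
plot(x1, x2, col = ifelse(y == 1, "red", "blue"), pch = 19)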
The write-up below instead uses 50 observations: a 50 x 2 feature matrix x and a class vector y with 25 observations per class, where class 1 is shifted up by 1 in both features.
set.seed(1)
x = matrix(rnorm(50*2), ncol=2)   # 50 observations, 2 features
y = c(rep(-1,25), rep(1,25))      # the class vector
x[y==1,] = x[y==1,] + 1           # shift class 1 by +1 in both features
library(e1071)
library(ggplot2)
dat = data.frame(x=x, y=as.factor(y))
train = sample(50, 25)   # indices of a random training half
head(x)
##             [,1]        [,2]
## [1,] -0.62645381  0.39810588
## [2,]  0.18364332 -0.61202639
## [3,] -0.83562861  0.34111969
## [4,]  1.59528080 -1.12936310
## [5,]  0.32950777  1.43302370
## [6,] -0.82046838  1.98039990
df = data.frame(x1 = x[,1], x2 = x[,2], y = y)
ggplot(df, aes(x = x1, y = x2, color = factor(y))) +
  geom_point(size = 2, shape = 23)
head(dat)
##           x.1         x.2  y
## 1 -0.62645381  0.39810588 -1
## 2  0.18364332 -0.61202639 -1
## 3 -0.83562861  0.34111969 -1
## 4  1.59528080 -1.12936310 -1
## 5  0.32950777  1.43302370 -1
## 6 -0.82046838  1.98039990 -1
svmfit.linear=svm(y~., data=dat[train,], kernel="linear", cost=1, scale = FALSE)
plot(svmfit.linear, dat[train,])
summary(svmfit.linear)
##
## Call:
## svm(formula = y ~ ., data = dat[train, ], kernel = "linear", cost = 1,
## scale = FALSE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
##
## Number of Support Vectors: 16
##
## ( 8 8 )
##
##
## Number of Classes: 2
##
## Levels:
## -1 1
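As a side check (not in the original write-up), e1071 stores the indices of the support vectors in the fitted object's index component, so the 16 support vectors reported above can be inspected directly:
# Training rows that act as support vectors for the linear fit
dat[train, ][svmfit.linear$index, ]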
svm.predict = predict(svmfit.linear, newdata = dat[train,])
table(true=dat[train,"y"], pred=svm.predict)
## pred
## true -1 1
## -1 12 2
## 1 3 8
(12+8)/25   # training accuracy for the linear kernel, from the table above
## [1] 0.8
length(svm.predict)
## [1] 25
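The accuracies can also be computed directly rather than read off the confusion matrices by hand; a minimal sketch using the objects already fitted (the same pattern applies to the radial and test-set tables that follow):
# Fraction of training observations the linear SVM classifies correctly
mean(svm.predict == dat[train, "y"])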
svmfit.radial=svm(y~., data=dat[train,], kernel="radial", gamma=1,
cost=1)
plot(svmfit.radial, dat[train,])
summary(svmfit.radial)
##
## Call:
## svm(formula = y ~ ., data = dat[train, ], kernel = "radial", gamma = 1,
## cost = 1)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 20
##
## ( 10 10 )
##
##
## Number of Classes: 2
##
## Levels:
## -1 1
svm.predict.radial = predict(svmfit.radial, newdata = dat[train,])
table(true=dat[train,"y"], pred=svm.predict.radial)
## pred
## true -1 1
## -1 11 3
## 1 1 10
(11+10)/25   # training accuracy for the radial kernel
## [1] 0.84
svm.predict.test = predict(svmfit.linear, newdata = dat[-train,])
table(true=dat[-train,"y"], pred=svm.predict.test)
## pred
## true -1 1
## -1 8 3
## 1 3 11
(8+11)/25   # test accuracy for the linear kernel
## [1] 0.76
plot(svmfit.linear, dat[-train,])
svm.predict.radial.test = predict(svmfit.radial, newdata = dat[-train,])
table(true=dat[-train,"y"], pred=svm.predict.radial.test)
## pred
## true -1 1
## -1 8 3
## 1 4 10
(8+10)/25   # test accuracy for the radial kernel
## [1] 0.72
plot(svmfit.radial, dat[-train,])
The radial kernel fits the training data more closely (training accuracy 0.84 vs. 0.80 for the linear kernel) but does worse on the held-out test set (0.72 vs. 0.76), so it overfits the training data. In other words, the linear kernel generalizes better on this data set.
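A natural follow-up, not part of this write-up, would be to choose cost and gamma by cross-validation with e1071's tune() instead of fixing both at 1; a minimal sketch assuming the same dat and train objects:
set.seed(1)
tune.out = tune(svm, y ~ ., data = dat[train, ], kernel = "radial",
                ranges = list(cost  = c(0.1, 1, 10, 100),
                              gamma = c(0.5, 1, 2)))
summary(tune.out)              # cross-validation error for each parameter combination
bestmod = tune.out$best.model  # model refit with the best parameters
table(true = dat[-train, "y"],
      pred = predict(bestmod, newdata = dat[-train, ]))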