library(mlr)
library(tidyverse)
This chapter covers building a naive Bayes model to predict political party from congressional voting records, and building a support vector machine (SVM) model to classify spam email.

Bayes' rule, on which the naive Bayes algorithm is built, can be written as:

\[\text{posterior} = \frac{\text{likelihood} \times \text{prior}}{\text{evidence}}\]

# 6.2. Building your first naive Bayes model
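As a quick numeric illustration of the rule above (the probabilities here are hypothetical, not estimated from the voting data):

prior      <- 0.6   # hypothetical P(democrat)
likelihood <- 0.9   # hypothetical P(votes "y" on an issue | democrat)
evidence   <- 0.7   # hypothetical P(votes "y" on that issue)
likelihood * prior / evidence  # posterior P(democrat | "y" vote), about 0.77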
data(HouseVotes84, package = "mlbench")
votesTib <- as_tibble(HouseVotes84)
votesTib
## # A tibble: 435 x 17
## Class V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12
## <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct>
## 1 repu… n y n y y y n n n y <NA> y
## 2 repu… n y n y y y n n n n n y
## 3 demo… <NA> y y <NA> y y n n n n y n
## 4 demo… n y y n <NA> y n n n n y n
## 5 demo… y y y n y y n n n n y <NA>
## 6 demo… n y y n y y n n n n n n
## 7 demo… n y n y y y n n n n n n
## 8 repu… n y n y y y n n n n n n
## 9 repu… n y n y y y n n n n n y
## 10 demo… y y y n n n y y y n n n
## # … with 425 more rows, and 4 more variables: V13 <fct>, V14 <fct>, V15 <fct>,
## # V16 <fct>
map_dbl(votesTib, ~sum(is.na(.)))  # count the missing values in each column
## Class V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12
## 0 12 48 11 11 15 11 14 15 22 7 21 31
## V13 V14 V15 V16
## 25 17 28 104
Use the map_dbl() function, as we did in listing 6.2, to count the number of "y" values in each column of votesTib. Hint: use which(. == "y") to return the rows in each column that equal "y".
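One possible solution, following the hint (length() counts the row indices returned by which()):

map_dbl(votesTib, ~length(which(. == "y")))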
# Gather the 16 vote columns into long format for plotting
votesUntidy <- gather(votesTib, "Variable", "Value", -Class)

# Plot the proportion of y, n, and missing votes on each issue, by party
ggplot(votesUntidy, aes(Class, fill = Value)) +
  facet_wrap(~ Variable, scales = "free_y") +
  geom_bar(position = "fill") +
  theme_bw()
votesTask <- makeClassifTask(data = votesTib, target = "Class")
## Warning in makeTask(type = type, data = data, weights = weights, blocking =
## blocking, : Provided data is not a pure data.frame but from class tbl_df, hence
## it will be converted.
bayes <- makeLearner("classif.naiveBayes")
bayesModel <- train(bayes, votesTask)
The text suggests 50 repeats of 10-fold cross-validation; we use 5 repeats here to keep the run time down, which still gives 10 × 5 = 50 resampling iterations.
kFold <- makeResampleDesc(method = "RepCV", folds = 10,
                          reps = 5, # reduced from 50 to 5 to save time
                          stratify = TRUE)
bayesCV <- resample(learner = bayes, task = votesTask,
resampling = kFold,
measures = list(mmce, acc, fpr, fnr))
## Resampling: repeated cross-validation
## Measures: mmce acc fpr fnr
## [Resample] iter 1: 0.0909091 0.9090909 0.1176471 0.0740741
## [Resample] iter 2: 0.0909091 0.9090909 0.0588235 0.1111111
## [Resample] iter 3: 0.0697674 0.9302326 0.0588235 0.0769231
## [Resample] iter 4: 0.1860465 0.8139535 0.2500000 0.1481481
## [Resample] iter 5: 0.0930233 0.9069767 0.0588235 0.1153846
## [Resample] iter 6: 0.0909091 0.9090909 0.0588235 0.1111111
## [Resample] iter 7: 0.0909091 0.9090909 0.0588235 0.1111111
## [Resample] iter 8: 0.0909091 0.9090909 0.0000000 0.1481481
## [Resample] iter 9: 0.1136364 0.8863636 0.1176471 0.1111111
## [Resample] iter 10: 0.0476190 0.9523810 0.0000000 0.0769231
## [Resample] iter 11: 0.1818182 0.8181818 0.1176471 0.2222222
## [Resample] iter 12: 0.0930233 0.9069767 0.1250000 0.0740741
## [Resample] iter 13: 0.1363636 0.8636364 0.0588235 0.1851852
## [Resample] iter 14: 0.0952381 0.9047619 0.0625000 0.1153846
## [Resample] iter 15: 0.1363636 0.8636364 0.1176471 0.1481481
## [Resample] iter 16: 0.0465116 0.9534884 0.0588235 0.0384615
## [Resample] iter 17: 0.0227273 0.9772727 0.0000000 0.0370370
## [Resample] iter 18: 0.1136364 0.8863636 0.2352941 0.0370370
## [Resample] iter 19: 0.0930233 0.9069767 0.0000000 0.1538462
## [Resample] iter 20: 0.0681818 0.9318182 0.0588235 0.0740741
## [Resample] iter 21: 0.1395349 0.8604651 0.1176471 0.1538462
## [Resample] iter 22: 0.1162791 0.8837209 0.1176471 0.1153846
## [Resample] iter 23: 0.0697674 0.9302326 0.0000000 0.1111111
## [Resample] iter 24: 0.1363636 0.8636364 0.2352941 0.0740741
## [Resample] iter 25: 0.0930233 0.9069767 0.0588235 0.1153846
## [Resample] iter 26: 0.0930233 0.9069767 0.1250000 0.0740741
## [Resample] iter 27: 0.0681818 0.9318182 0.0000000 0.1111111
## [Resample] iter 28: 0.0454545 0.9545455 0.0000000 0.0740741
## [Resample] iter 29: 0.1136364 0.8863636 0.1176471 0.1111111
## [Resample] iter 30: 0.1136364 0.8863636 0.0000000 0.1851852
## [Resample] iter 31: 0.0697674 0.9302326 0.0625000 0.0740741
## [Resample] iter 32: 0.1590909 0.8409091 0.1764706 0.1481481
## [Resample] iter 33: 0.1818182 0.8181818 0.0588235 0.2592593
## [Resample] iter 34: 0.0697674 0.9302326 0.0588235 0.0769231
## [Resample] iter 35: 0.0465116 0.9534884 0.0000000 0.0740741
## [Resample] iter 36: 0.0930233 0.9069767 0.1176471 0.0769231
## [Resample] iter 37: 0.0454545 0.9545455 0.0588235 0.0370370
## [Resample] iter 38: 0.1363636 0.8636364 0.1176471 0.1481481
## [Resample] iter 39: 0.1363636 0.8636364 0.2352941 0.0740741
## [Resample] iter 40: 0.0697674 0.9302326 0.0000000 0.1153846
## [Resample] iter 41: 0.1363636 0.8636364 0.0588235 0.1851852
## [Resample] iter 42: 0.0681818 0.9318182 0.1176471 0.0370370
## [Resample] iter 43: 0.0454545 0.9545455 0.0588235 0.0370370
## [Resample] iter 44: 0.0714286 0.9285714 0.0000000 0.1153846
## [Resample] iter 45: 0.0909091 0.9090909 0.0000000 0.1481481
## [Resample] iter 46: 0.1363636 0.8636364 0.0588235 0.1851852
## [Resample] iter 47: 0.1627907 0.8372093 0.2500000 0.1111111
## [Resample] iter 48: 0.1162791 0.8837209 0.1764706 0.0769231
## [Resample] iter 49: 0.0909091 0.9090909 0.0588235 0.1111111
## [Resample] iter 50: 0.0697674 0.9302326 0.0588235 0.0769231
##
## Aggregated Result: mmce.test.mean=0.0987360,acc.test.mean=0.9012640,fpr.test.mean=0.0822059,fnr.test.mean=0.1092593
##
bayesCV$aggr
## mmce.test.mean acc.test.mean fpr.test.mean fnr.test.mean
## 0.09873603 0.90126397 0.08220588 0.10925926
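If you want to see where those misclassifications fall, one option (not shown in the text) is to tabulate the cross-validated predictions stored in the resample result as a confusion matrix:

calculateConfusionMatrix(bayesCV$pred, relative = TRUE)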
# Voting record of a new politician we want to classify
politician <- tibble(V1 = "n", V2 = "n", V3 = "y", V4 = "n", V5 = "n",
                     V6 = "y", V7 = "y", V8 = "y", V9 = "y", V10 = "y",
                     V11 = "n", V12 = "y", V13 = "n", V14 = "n",
                     V15 = "y", V16 = "n")
politicianPred <- predict(bayesModel, newdata = politician)
## Warning in predict.WrappedModel(bayesModel, newdata = politician): Provided data
## for prediction is not a pure data.frame but from class tbl_df, hence it will be
## converted.
getPredictionResponse(politicianPred)
## [1] democrat
## Levels: democrat republican
The model predicts that the new politician is a Democrat.
Wrap your naive Bayes model inside the getLearnerModel() function. Can you identify the prior probabilities and the likelihoods for each vote?
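A possible starting point: getLearnerModel() extracts the underlying e1071 naiveBayes object, whose apriori component holds the class counts used to estimate the priors and whose tables component holds the conditional probabilities (likelihoods) for each vote.

nbFit <- getLearnerModel(bayesModel)
nbFit$apriori  # class counts used for the prior probabilities
nbFit$tables   # likelihood of each vote value, given each party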
SVMs for linearly separable data
The SVM algorithm finds an optimal linear hyperplane that separates the classes.
The soft-margin SVM algorithm still tries to find the hyperplane that best separates the classes, but it is penalized for having cases inside its margin.
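For reference (a standard formulation, not part of the listing), the soft-margin optimization can be written as follows, where the cost parameter \(C\) controls how heavily margin violations \(\xi_i\) are penalized:

\[\min_{\mathbf{w},\, b,\, \boldsymbol{\xi}} \ \frac{1}{2}\lVert \mathbf{w} \rVert^2 + C \sum_i \xi_i \quad \text{subject to} \quad y_i(\mathbf{w}^\top \mathbf{x}_i + b) \ge 1 - \xi_i, \quad \xi_i \ge 0\]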
There are two ways of doing this:
data(spam, package = "kernlab")
spamTib <- as_tibble(spam)
spamTib
## # A tibble: 4,601 x 58
## make address all num3d our over remove internet order mail receive
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 0.64 0.64 0 0.32 0 0 0 0 0 0
## 2 0.21 0.28 0.5 0 0.14 0.28 0.21 0.07 0 0.94 0.21
## 3 0.06 0 0.71 0 1.23 0.19 0.19 0.12 0.64 0.25 0.38
## 4 0 0 0 0 0.63 0 0.31 0.63 0.31 0.63 0.31
## 5 0 0 0 0 0.63 0 0.31 0.63 0.31 0.63 0.31
## 6 0 0 0 0 1.85 0 0 1.85 0 0 0
## 7 0 0 0 0 1.92 0 0 0 0 0.64 0.96
## 8 0 0 0 0 1.88 0 0 1.88 0 0 0
## 9 0.15 0 0.46 0 0.61 0 0.3 0 0.92 0.76 0.76
## 10 0.06 0.12 0.77 0 0.19 0.32 0.38 0 0.06 0 0
## # … with 4,591 more rows, and 47 more variables: will <dbl>, people <dbl>,
## # report <dbl>, addresses <dbl>, free <dbl>, business <dbl>, email <dbl>,
## # you <dbl>, credit <dbl>, your <dbl>, font <dbl>, num000 <dbl>, money <dbl>,
## # hp <dbl>, hpl <dbl>, george <dbl>, num650 <dbl>, lab <dbl>, labs <dbl>,
## # telnet <dbl>, num857 <dbl>, data <dbl>, num415 <dbl>, num85 <dbl>,
## # technology <dbl>, num1999 <dbl>, parts <dbl>, pm <dbl>, direct <dbl>,
## # cs <dbl>, meeting <dbl>, original <dbl>, project <dbl>, re <dbl>,
## # edu <dbl>, table <dbl>, conference <dbl>, charSemicolon <dbl>,
## # charRoundbracket <dbl>, charSquarebracket <dbl>, charExclamation <dbl>,
## # charDollar <dbl>, charHash <dbl>, capitalAve <dbl>, capitalLong <dbl>,
## # capitalTotal <dbl>, type <fct>
Listing 6.8. Creating the task and learner
spamTask <- makeClassifTask(data = spamTib, target = "type")
## Warning in makeTask(type = type, data = data, weights = weights, blocking =
## blocking, : Provided data is not a pure data.frame but from class tbl_df, hence
## it will be converted.
svm <- makeLearner("classif.svm")
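To see which hyperparameters this learner exposes for tuning (they are passed through to e1071::svm(), for example kernel, cost, degree, and gamma), one option is:

getParamSet(svm)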