library(mlr)
library(tidyverse)
This chapter covers building and cross-validating a naive Bayes classifier, and an introduction to the support vector machine (SVM) algorithm.
\[posterior = \frac{likelihood \times prior}{evidence}\]

# 6.2. Building our first naive Bayes model
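Before building the model, here is a quick numeric sketch of Bayes' rule. The probabilities below are made up for illustration; they do not come from the voting data.

likelihood <- 0.80 # hypothetical p(votes "y" | democrat)
prior <- 0.60      # hypothetical p(democrat)
evidence <- 0.55   # hypothetical p(votes "y")
likelihood * prior / evidence
## [1] 0.8727273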
data(HouseVotes84, package = "mlbench")
votesTib <- as_tibble(HouseVotes84)
votesTib
## # A tibble: 435 x 17
##    Class V1    V2    V3    V4    V5    V6    V7    V8    V9    V10   V11   V12  
##    <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct>
##  1 repu… n     y     n     y     y     y     n     n     n     y     <NA>  y    
##  2 repu… n     y     n     y     y     y     n     n     n     n     n     y    
##  3 demo… <NA>  y     y     <NA>  y     y     n     n     n     n     y     n    
##  4 demo… n     y     y     n     <NA>  y     n     n     n     n     y     n    
##  5 demo… y     y     y     n     y     y     n     n     n     n     y     <NA> 
##  6 demo… n     y     y     n     y     y     n     n     n     n     n     n    
##  7 demo… n     y     n     y     y     y     n     n     n     n     n     n    
##  8 repu… n     y     n     y     y     y     n     n     n     n     n     n    
##  9 repu… n     y     n     y     y     y     n     n     n     n     n     y    
## 10 demo… y     y     y     n     n     n     y     y     y     n     n     n    
## # … with 425 more rows, and 4 more variables: V13 <fct>, V14 <fct>, V15 <fct>,
## #   V16 <fct>
map_dbl(votesTib, ~sum(is.na(.)))
## Class    V1    V2    V3    V4    V5    V6    V7    V8    V9   V10   V11   V12 
##     0    12    48    11    11    15    11    14    15    22     7    21    31 
##   V13   V14   V15   V16 
##    25    17    28   104
Use the map_dbl() function as we did in listing 6.2 to count the number of "y" values in each column of votesTib. Hint: use which(. == "y") to return the rows in each column that equal "y".
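One possible solution, following the hint (a sketch, not the book's own listing):

map_dbl(votesTib, ~ length(which(. == "y")))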
# Reshape so each row is one politician's vote on one issue
votesUntidy <- gather(votesTib, "Variable", "Value", -Class)

# Plot the proportion of "y", "n", and missing votes within each party
ggplot(votesUntidy, aes(Class, fill = Value)) +
  facet_wrap(~ Variable, scales = "free_y") +
  geom_bar(position = "fill") +
  theme_bw()
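Each panel shows one of the 16 votes, with the bars giving the proportion of "y", "n", and missing values within each party. Votes whose proportions differ sharply between the parties will be the most informative predictors for the model.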
votesTask <- makeClassifTask(data = votesTib, target = "Class")
## Warning in makeTask(type = type, data = data, weights = weights, blocking =
## blocking, : Provided data is not a pure data.frame but from class tbl_df, hence
## it will be converted.
bayes <- makeLearner("classif.naiveBayes")
bayesModel <- train(bayes, votesTask)
The book uses 50 repetitions of 10-fold cross-validation; to save time we use 5 repetitions here, which still gives 10 × 5 = 50 resampling iterations.

kFold <- makeResampleDesc(method = "RepCV", folds = 10,
                          reps = 5, # reduced from the book's 50
                          stratify = TRUE)

bayesCV <- resample(learner = bayes, task = votesTask,
                    resampling = kFold,
                    measures = list(mmce, acc, fpr, fnr))
## Resampling: repeated cross-validation
## Measures:             mmce      acc       fpr       fnr
## [Resample] iter 1:    0.0909091 0.9090909 0.1176471 0.0740741
## [Resample] iter 2:    0.0909091 0.9090909 0.0588235 0.1111111
## [Resample] iter 3:    0.0697674 0.9302326 0.0588235 0.0769231
## [Resample] iter 4:    0.1860465 0.8139535 0.2500000 0.1481481
## [Resample] iter 5:    0.0930233 0.9069767 0.0588235 0.1153846
## [Resample] iter 6:    0.0909091 0.9090909 0.0588235 0.1111111
## [Resample] iter 7:    0.0909091 0.9090909 0.0588235 0.1111111
## [Resample] iter 8:    0.0909091 0.9090909 0.0000000 0.1481481
## [Resample] iter 9:    0.1136364 0.8863636 0.1176471 0.1111111
## [Resample] iter 10:   0.0476190 0.9523810 0.0000000 0.0769231
## [Resample] iter 11:   0.1818182 0.8181818 0.1176471 0.2222222
## [Resample] iter 12:   0.0930233 0.9069767 0.1250000 0.0740741
## [Resample] iter 13:   0.1363636 0.8636364 0.0588235 0.1851852
## [Resample] iter 14:   0.0952381 0.9047619 0.0625000 0.1153846
## [Resample] iter 15:   0.1363636 0.8636364 0.1176471 0.1481481
## [Resample] iter 16:   0.0465116 0.9534884 0.0588235 0.0384615
## [Resample] iter 17:   0.0227273 0.9772727 0.0000000 0.0370370
## [Resample] iter 18:   0.1136364 0.8863636 0.2352941 0.0370370
## [Resample] iter 19:   0.0930233 0.9069767 0.0000000 0.1538462
## [Resample] iter 20:   0.0681818 0.9318182 0.0588235 0.0740741
## [Resample] iter 21:   0.1395349 0.8604651 0.1176471 0.1538462
## [Resample] iter 22:   0.1162791 0.8837209 0.1176471 0.1153846
## [Resample] iter 23:   0.0697674 0.9302326 0.0000000 0.1111111
## [Resample] iter 24:   0.1363636 0.8636364 0.2352941 0.0740741
## [Resample] iter 25:   0.0930233 0.9069767 0.0588235 0.1153846
## [Resample] iter 26:   0.0930233 0.9069767 0.1250000 0.0740741
## [Resample] iter 27:   0.0681818 0.9318182 0.0000000 0.1111111
## [Resample] iter 28:   0.0454545 0.9545455 0.0000000 0.0740741
## [Resample] iter 29:   0.1136364 0.8863636 0.1176471 0.1111111
## [Resample] iter 30:   0.1136364 0.8863636 0.0000000 0.1851852
## [Resample] iter 31:   0.0697674 0.9302326 0.0625000 0.0740741
## [Resample] iter 32:   0.1590909 0.8409091 0.1764706 0.1481481
## [Resample] iter 33:   0.1818182 0.8181818 0.0588235 0.2592593
## [Resample] iter 34:   0.0697674 0.9302326 0.0588235 0.0769231
## [Resample] iter 35:   0.0465116 0.9534884 0.0000000 0.0740741
## [Resample] iter 36:   0.0930233 0.9069767 0.1176471 0.0769231
## [Resample] iter 37:   0.0454545 0.9545455 0.0588235 0.0370370
## [Resample] iter 38:   0.1363636 0.8636364 0.1176471 0.1481481
## [Resample] iter 39:   0.1363636 0.8636364 0.2352941 0.0740741
## [Resample] iter 40:   0.0697674 0.9302326 0.0000000 0.1153846
## [Resample] iter 41:   0.1363636 0.8636364 0.0588235 0.1851852
## [Resample] iter 42:   0.0681818 0.9318182 0.1176471 0.0370370
## [Resample] iter 43:   0.0454545 0.9545455 0.0588235 0.0370370
## [Resample] iter 44:   0.0714286 0.9285714 0.0000000 0.1153846
## [Resample] iter 45:   0.0909091 0.9090909 0.0000000 0.1481481
## [Resample] iter 46:   0.1363636 0.8636364 0.0588235 0.1851852
## [Resample] iter 47:   0.1627907 0.8372093 0.2500000 0.1111111
## [Resample] iter 48:   0.1162791 0.8837209 0.1764706 0.0769231
## [Resample] iter 49:   0.0909091 0.9090909 0.0588235 0.1111111
## [Resample] iter 50:   0.0697674 0.9302326 0.0588235 0.0769231
## 
## Aggregated Result: mmce.test.mean=0.0987360,acc.test.mean=0.9012640,fpr.test.mean=0.0822059,fnr.test.mean=0.1092593
## 
bayesCV$aggr
## mmce.test.mean  acc.test.mean  fpr.test.mean  fnr.test.mean 
##     0.09873603     0.90126397     0.08220588     0.10925926
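Here mmce is the mean misclassification error (1 − accuracy), and fpr and fnr are the false positive and false negative rates. Averaged over all 50 iterations, the model correctly classifies about 90% of politicians in the held-out folds.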
# The 16 votes of a hypothetical new politician
politician <- tibble(V1 = "n", V2 = "n", V3 = "y", V4 = "n", V5 = "n",
                     V6 = "y", V7 = "y", V8 = "y", V9 = "y", V10 = "y",
                     V11 = "n", V12 = "y", V13 = "n", V14 = "n",
                     V15 = "y", V16 = "n")
politicianPred <- predict(bayesModel, newdata = politician)
## Warning in predict.WrappedModel(bayesModel, newdata = politician): Provided data
## for prediction is not a pure data.frame but from class tbl_df, hence it will be
## converted.
getPredictionResponse(politicianPred)
## [1] democrat
## Levels: democrat republican
The model predicts that the new politician is a Democrat.
Wrap your naive Bayes model inside the getLearnerModel() function. Can you identify the prior probabilities and the likelihoods for each vote?
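As a sketch of where to look: getLearnerModel() extracts the underlying e1071 naiveBayes model, whose printed output includes the a-priori (prior) class probabilities and the conditional probability table (the likelihoods) for each vote.

getLearnerModel(bayesModel)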
# SVMs for linearly separable data

The SVM algorithm finds an optimal linear hyperplane that separates the classes.
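In standard notation (background, not from the chapter's listings), such a hyperplane is the set of points \(\vec{x}\) satisfying

\[\vec{w} \cdot \vec{x} + b = 0\]

where \(\vec{w}\) is a weight vector perpendicular to the hyperplane and \(b\) is an offset; a new case is classified by which side of the hyperplane it falls on, that is, by the sign of \(\vec{w} \cdot \vec{x} + b\).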
The soft-margin SVM algorithm still tries to find the hyperplane that best separates the classes, but it is penalized for having cases inside its margin.
There are two ways of doing this:
data(spam, package = "kernlab")
spamTib <- as_tibble(spam)
spamTib
## # A tibble: 4,601 x 58
##     make address   all num3d   our  over remove internet order  mail receive
##    <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>    <dbl> <dbl> <dbl>   <dbl>
##  1  0       0.64  0.64     0  0.32  0      0        0     0     0       0   
##  2  0.21    0.28  0.5      0  0.14  0.28   0.21     0.07  0     0.94    0.21
##  3  0.06    0     0.71     0  1.23  0.19   0.19     0.12  0.64  0.25    0.38
##  4  0       0     0        0  0.63  0      0.31     0.63  0.31  0.63    0.31
##  5  0       0     0        0  0.63  0      0.31     0.63  0.31  0.63    0.31
##  6  0       0     0        0  1.85  0      0        1.85  0     0       0   
##  7  0       0     0        0  1.92  0      0        0     0     0.64    0.96
##  8  0       0     0        0  1.88  0      0        1.88  0     0       0   
##  9  0.15    0     0.46     0  0.61  0      0.3      0     0.92  0.76    0.76
## 10  0.06    0.12  0.77     0  0.19  0.32   0.38     0     0.06  0       0   
## # … with 4,591 more rows, and 47 more variables: will <dbl>, people <dbl>,
## #   report <dbl>, addresses <dbl>, free <dbl>, business <dbl>, email <dbl>,
## #   you <dbl>, credit <dbl>, your <dbl>, font <dbl>, num000 <dbl>, money <dbl>,
## #   hp <dbl>, hpl <dbl>, george <dbl>, num650 <dbl>, lab <dbl>, labs <dbl>,
## #   telnet <dbl>, num857 <dbl>, data <dbl>, num415 <dbl>, num85 <dbl>,
## #   technology <dbl>, num1999 <dbl>, parts <dbl>, pm <dbl>, direct <dbl>,
## #   cs <dbl>, meeting <dbl>, original <dbl>, project <dbl>, re <dbl>,
## #   edu <dbl>, table <dbl>, conference <dbl>, charSemicolon <dbl>,
## #   charRoundbracket <dbl>, charSquarebracket <dbl>, charExclamation <dbl>,
## #   charDollar <dbl>, charHash <dbl>, capitalAve <dbl>, capitalLong <dbl>,
## #   capitalTotal <dbl>, type <fct>
Listing 6.8. Creating the task and learner
spamTask <- makeClassifTask(data = spamTib, target = "type")
## Warning in makeTask(type = type, data = data, weights = weights, blocking =
## blocking, : Provided data is not a pure data.frame but from class tbl_df, hence
## it will be converted.
svm <- makeLearner("classif.svm")
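To see which hyperparameters the SVM learner accepts (such as kernel, cost, and degree), we can inspect its parameter set. getParamSet() is a standard mlr function; this call is a suggested next step rather than part of listing 6.8.

getParamSet(svm)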