23 Predicting the type of glass
- Datasets:
  - Glass
- Algorithms:
  - SVM
  - Partitioning Tree
https://cran.r-project.org/web/packages/e1071/vignettes/svmdoc.pdf
In this example, we use the glass data from the UCI Repository of Machine Learning Databases for classification. The task is to predict the type of a glass on the basis of its chemical analysis. We start by splitting the data into a training and a test set:
library(caret)
#> Loading required package: lattice
#> Loading required package: ggplot2
library(e1071)
library(rpart)
data(Glass, package="mlbench")
str(Glass)
#> 'data.frame': 214 obs. of 10 variables:
#> $ RI : num 1.52 1.52 1.52 1.52 1.52 ...
#> $ Na : num 13.6 13.9 13.5 13.2 13.3 ...
#> $ Mg : num 4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
#> $ Al : num 1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
#> $ Si : num 71.8 72.7 73 72.6 73.1 ...
#> $ K : num 0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
#> $ Ca : num 8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
#> $ Ba : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ Fe : num 0 0 0 0 0 0.26 0 0 0 0.11 ...
#> $ Type: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...
## split data into a train and test set
## (no seed is set here, so the exact split -- and all results below -- vary between runs)
index <- 1:nrow(Glass)
testindex <- sample(index, trunc(length(index)/3))
testset <- Glass[testindex,]
trainset <- Glass[-testindex,]
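The split above is purely random. As an alternative (a sketch only, not used for the results below), caret's `createDataPartition()` draws the sample within each class, so the train/test proportions of `Type` are preserved; a `set.seed()` call makes the split reproducible:

```r
library(caret)
data(Glass, package = "mlbench")
set.seed(42)  # arbitrary seed, only so this sketch is reproducible
## stratified split: sample 2/3 within each level of Type
inTrain <- createDataPartition(Glass$Type, p = 2/3, list = FALSE)
str.train <- Glass[inTrain, ]
str.test  <- Glass[-inTrain, ]
## class proportions are approximately preserved in the training part
round(prop.table(table(str.train$Type)), 2)
```

Stratification matters here because the rarer glass types (e.g. class 6 with only a handful of observations) can otherwise end up almost entirely in one of the two sets.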
For both the SVM and the partitioning tree (via rpart()), we fit the model and try to predict the test set values:
## svm
svm.model <- svm(Type ~ ., data = trainset, cost = 100, gamma = 1)
svm.pred <- predict(svm.model, testset[,-10])
(The dependent variable, Type, has column number 10. cost is a general penalizing parameter for C-classification, and gamma is the kernel parameter specific to the radial basis function.)
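The values cost = 100 and gamma = 1 are taken as given here; in practice they would be chosen by a grid search, for example with `tune.svm()` from e1071. A sketch follows (the grid below is an arbitrary choice, and the selected pair depends on the random cross-validation folds):

```r
library(e1071)
data(Glass, package = "mlbench")
## cross-validated grid search over gamma and cost
tuned <- tune.svm(Type ~ ., data = Glass,
                  gamma = 10^(-2:0), cost = 10^(0:2))
summary(tuned)         # CV error for every (gamma, cost) pair
tuned$best.parameters  # the pair with the lowest CV error
```

The best parameters found this way could then be passed to `svm()` in place of the fixed values above.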
## rpart
rpart.model <- rpart(Type ~ ., data = trainset)
rpart.pred <- predict(rpart.model, testset[,-10], type = "class")
A cross-tabulation of the true versus the predicted values yields:
## compute svm confusion matrix
table(pred = svm.pred, true = testset[,10])
#> true
#> pred 1 2 3 5 6 7
#> 1 20 3 3 0 0 0
#> 2 6 13 5 4 2 4
#> 3 2 1 0 0 0 0
#> 5 0 0 0 1 0 0
#> 6 0 0 0 0 0 0
#> 7 0 0 0 0 0 7
## compute rpart confusion matrix
table(pred = rpart.pred, true = testset[,10])
#> true
#> pred 1 2 3 5 6 7
#> 1 22 0 3 0 0 0
#> 2 5 12 4 0 0 0
#> 3 0 2 1 0 0 0
#> 5 0 2 0 5 2 1
#> 6 0 0 0 0 0 0
#> 7 1 1 0 0 0 10
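The accuracy figures reported by confusionMatrix() below are simply the diagonal of such a table divided by its total. As a check, the SVM table printed above (typed in from that output) gives:

```r
## SVM confusion matrix from above; rows = predicted, columns = true (1,2,3,5,6,7)
svm.tab <- matrix(c(20, 6, 2, 0, 0, 0,   # true class 1
                     3, 13, 1, 0, 0, 0,  # true class 2
                     3,  5, 0, 0, 0, 0,  # true class 3
                     0,  4, 0, 1, 0, 0,  # true class 5
                     0,  2, 0, 0, 0, 0,  # true class 6
                     0,  4, 0, 0, 0, 7), # true class 7
                  nrow = 6)
sum(diag(svm.tab)) / sum(svm.tab)  # 41/71, i.e. about 0.577
```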
23.0.1 Comparison on the test set
caret's confusionMatrix() reports the same cross-tabulations together with overall and per-class statistics:
confusionMatrix(svm.pred, testset$Type)
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction 1 2 3 5 6 7
#> 1 20 3 3 0 0 0
#> 2 6 13 5 4 2 4
#> 3 2 1 0 0 0 0
#> 5 0 0 0 1 0 0
#> 6 0 0 0 0 0 0
#> 7 0 0 0 0 0 7
#>
#> Overall Statistics
#>
#> Accuracy : 0.577
#> 95% CI : (0.454, 0.694)
#> No Information Rate : 0.394
#> P-Value [Acc > NIR] : 0.00137
#>
#> Kappa : 0.413
#>
#> Mcnemar's Test P-Value : NA
#>
#> Statistics by Class:
#>
#> Class: 1 Class: 2 Class: 3 Class: 5 Class: 6 Class: 7
#> Sensitivity 0.714 0.765 0.0000 0.2000 0.0000 0.6364
#> Specificity 0.860 0.611 0.9524 1.0000 1.0000 1.0000
#> Pos Pred Value 0.769 0.382 0.0000 1.0000 NaN 1.0000
#> Neg Pred Value 0.822 0.892 0.8824 0.9429 0.9718 0.9375
#> Prevalence 0.394 0.239 0.1127 0.0704 0.0282 0.1549
#> Detection Rate 0.282 0.183 0.0000 0.0141 0.0000 0.0986
#> Detection Prevalence 0.366 0.479 0.0423 0.0141 0.0000 0.0986
#> Balanced Accuracy 0.787 0.688 0.4762 0.6000 0.5000 0.8182
confusionMatrix(rpart.pred, testset$Type)
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction 1 2 3 5 6 7
#> 1 22 0 3 0 0 0
#> 2 5 12 4 0 0 0
#> 3 0 2 1 0 0 0
#> 5 0 2 0 5 2 1
#> 6 0 0 0 0 0 0
#> 7 1 1 0 0 0 10
#>
#> Overall Statistics
#>
#> Accuracy : 0.704
#> 95% CI : (0.584, 0.807)
#> No Information Rate : 0.394
#> P-Value [Acc > NIR] : 1.23e-07
#>
#> Kappa : 0.605
#>
#> Mcnemar's Test P-Value : NA
#>
#> Statistics by Class:
#>
#> Class: 1 Class: 2 Class: 3 Class: 5 Class: 6 Class: 7
#> Sensitivity 0.786 0.706 0.1250 1.0000 0.0000 0.909
#> Specificity 0.930 0.833 0.9683 0.9242 1.0000 0.967
#> Pos Pred Value 0.880 0.571 0.3333 0.5000 NaN 0.833
#> Neg Pred Value 0.870 0.900 0.8971 1.0000 0.9718 0.983
#> Prevalence 0.394 0.239 0.1127 0.0704 0.0282 0.155
#> Detection Rate 0.310 0.169 0.0141 0.0704 0.0000 0.141
#> Detection Prevalence 0.352 0.296 0.0423 0.1408 0.0000 0.169
#> Balanced Accuracy 0.858 0.770 0.5466 0.9621 0.5000 0.938
23.0.2 Comparison with resamples
Finally, we compare the performance of the two methods by resampling: caret's train() refits each model on 25 bootstrap resamples (its default) and records the accuracy rate and the kappa index for each resample. (For a single confusion table, the same indices can be computed with classAgreement(), also contained in package e1071.) Support Vector Machines show better results:
set.seed(1234567)
# SVM
fit.svm <- train(Type ~., data = trainset,
method = "svmRadial")
# Partitioning Tree
fit.rpart <- train(Type ~., data = trainset,
method="rpart")
# collect resamples
results <- resamples(list(svm = fit.svm,
rpart = fit.rpart))
summary(results)
#>
#> Call:
#> summary.resamples(object = results)
#>
#> Models: svm, rpart
#> Number of resamples: 25
#>
#> Accuracy
#> Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
#> svm 0.510 0.565 0.600 0.599 0.625 0.704 0
#> rpart 0.462 0.519 0.554 0.558 0.600 0.660 0
#>
#> Kappa
#> Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
#> svm 0.267 0.376 0.406 0.410 0.446 0.559 0
#> rpart 0.135 0.299 0.358 0.363 0.443 0.545 0
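As mentioned above, classAgreement() from e1071 computes the same accuracy and kappa indices directly from a single confusion table. A minimal sketch, using the SVM test-set table from earlier (typed in from the printed output):

```r
library(e1071)
## SVM test-set confusion matrix from above (rows = predicted, cols = true)
tab <- matrix(c(20, 6, 2, 0, 0, 0,
                 3, 13, 1, 0, 0, 0,
                 3,  5, 0, 0, 0, 0,
                 0,  4, 0, 1, 0, 0,
                 0,  2, 0, 0, 0, 0,
                 0,  4, 0, 0, 0, 7), nrow = 6)
ag <- classAgreement(tab)
ag$diag   # proportion on the diagonal (accuracy), about 0.577
ag$kappa  # Cohen's kappa, about 0.413 -- matching the confusionMatrix() output
```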