12 Feature Selection to enhance cancer detection
- Datasets:
- breast-cancer-wisconsin.data
- Algorithms:
- PCA
- Random Forest
- Recursive Feature Elimination
- Genetic Algorithm
Source: https://shiring.github.io/machine_learning/2017/01/15/rfe_ga_post
12.1 Read and process the data
bc_data <- read.table(file.path(data_raw_dir, "breast-cancer-wisconsin.data"),
header = FALSE, sep = ",")
# assign the column names
colnames(bc_data) <- c("sample_code_number", "clump_thickness",
"uniformity_of_cell_size", "uniformity_of_cell_shape",
"marginal_adhesion", "single_epithelial_cell_size",
"bare_nuclei", "bland_chromatin", "normal_nucleoli",
"mitosis", "classes")
# recode classes: 2 = benign, 4 = malignant
bc_data$classes <- ifelse(bc_data$classes == "2", "benign",
ifelse(bc_data$classes == "4", "malignant", NA))
# replace the "?" placeholder with NA
bc_data[bc_data == "?"] <- NA
# how many NAs are in the data
length(which(is.na(bc_data)))
#> [1] 16
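All 16 missing values come from the "?" placeholder we just replaced, so they should sit in a single column (bare_nuclei); a quick per-column count to confirm:
# count NAs per column; all of them should fall in bare_nuclei
colSums(is.na(bc_data))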
names(bc_data)
#> [1] "sample_code_number" "clump_thickness"
#> [3] "uniformity_of_cell_size" "uniformity_of_cell_shape"
#> [5] "marginal_adhesion" "single_epithelial_cell_size"
#> [7] "bare_nuclei" "bland_chromatin"
#> [9] "normal_nucleoli" "mitosis"
#> [11] "classes"
12.1.1 Missing data
# impute missing data
library(mice)
#>
#> Attaching package: 'mice'
#> The following objects are masked from 'package:base':
#>
#> cbind, rbind
# convert the feature columns (2:10) to numeric,
# skipping sample_code_number and classes
bc_data[, 2:10] <- apply(bc_data[, 2:10], 2, function(x) as.numeric(as.character(x)))
# impute missing values, suppressing the run log
dataset_impute <- mice(bc_data[, 2:10], print = FALSE)
# bind "classes" back to the imputed features; drop "sample_code_number"
bc_data <- cbind(bc_data[, 11, drop = FALSE],
mice::complete(dataset_impute, action = 1))
bc_data$classes <- as.factor(bc_data$classes)
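mice also provides diagnostic plots for checking imputations against the observed data; as an optional sanity check (not part of the original workflow), the imputed and observed distributions of bare_nuclei can be compared:
# observed vs imputed distributions for bare_nuclei (optional diagnostic)
densityplot(dataset_impute, ~ bare_nuclei)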
# how many benign and malignant cases are there?
summary(bc_data$classes)
#> benign malignant
#> 458 241
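Expressed as proportions, benign cases make up roughly two thirds of the data:
# class balance: about 66% benign vs 34% malignant
round(prop.table(table(bc_data$classes)), 2)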
str(bc_data)
#> 'data.frame': 699 obs. of 10 variables:
#> $ classes : Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 1 1 1 ...
#> $ clump_thickness : num 5 5 3 6 4 8 1 2 2 4 ...
#> $ uniformity_of_cell_size : num 1 4 1 8 1 10 1 1 1 2 ...
#> $ uniformity_of_cell_shape : num 1 4 1 8 1 10 1 2 1 1 ...
#> $ marginal_adhesion : num 1 5 1 1 3 8 1 1 1 1 ...
#> $ single_epithelial_cell_size: num 2 7 2 3 2 7 2 2 2 2 ...
#> $ bare_nuclei : num 1 10 2 4 1 10 10 1 1 1 ...
#> $ bland_chromatin : num 3 3 3 3 3 9 3 3 1 2 ...
#> $ normal_nucleoli : num 1 2 1 7 1 7 1 1 1 1 ...
#> $ mitosis : num 1 1 1 1 1 1 1 1 5 1 ...
12.2 Principal Component Analysis (PCA)
To get an idea about the dimensionality and variance of the dataset, I first look at PCA plots for samples and features. Only the first two principal components (PCs) are plotted, since together they explain the majority of the variation in the data.
After defining my custom ggplot2 theme, I create a function that performs the PCA (using the pcaGoPromoter package), calculates ellipses around the data points (with the ellipse package), and produces the plot with ggplot2.
12.2.0.1 Theme
# plotting theme
library(ggplot2)
my_theme <- function(base_size = 12, base_family = "sans"){
theme_minimal(base_size = base_size, base_family = base_family) +
theme(
axis.text = element_text(size = 12),
axis.text.x = element_text(angle = 0, vjust = 0.5, hjust = 0.5),
axis.title = element_text(size = 14),
panel.grid.major = element_line(color = "grey"),
panel.grid.minor = element_blank(),
panel.background = element_rect(fill = "aliceblue"),
strip.background = element_rect(fill = "navy", color = "navy", size = 1),
strip.text = element_text(face = "bold", size = 12, color = "white"),
legend.position = "right",
legend.justification = "top",
legend.background = element_blank(),
panel.border = element_rect(color = "grey", fill = NA, size = 0.5)
)
}
theme_set(my_theme())
12.2.0.2 PCA function
# function for PCA plotting
library(pcaGoPromoter) # install from Bioconductor
#> Loading required package: ellipse
#>
#> Attaching package: 'ellipse'
#> The following object is masked from 'package:graphics':
#>
#> pairs
#> Loading required package: Biostrings
#> Loading required package: BiocGenerics
#> Loading required package: parallel
#>
#> Attaching package: 'BiocGenerics'
#> The following objects are masked from 'package:parallel':
#>
#> clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
#> clusterExport, clusterMap, parApply, parCapply, parLapply,
#> parLapplyLB, parRapply, parSapply, parSapplyLB
#> The following objects are masked from 'package:mice':
#>
#> cbind, rbind
#> The following objects are masked from 'package:stats':
#>
#> IQR, mad, sd, var, xtabs
#> The following objects are masked from 'package:base':
#>
#> anyDuplicated, append, as.data.frame, basename, cbind, colnames,
#> dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
#> grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
#> order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
#> rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
#> union, unique, unsplit, which, which.max, which.min
#> Loading required package: S4Vectors
#> Loading required package: stats4
#>
#> Attaching package: 'S4Vectors'
#> The following object is masked from 'package:base':
#>
#> expand.grid
#> Loading required package: IRanges
#> Loading required package: XVector
#>
#> Attaching package: 'Biostrings'
#> The following object is masked from 'package:base':
#>
#> strsplit
library(ellipse)
pca_func <- function(data, groups, title, print_ellipse = TRUE) {
# perform pca and extract scores for all principal components: PC1:PC9
pcaOutput <- pca(data, printDropped = FALSE, scale = TRUE, center = TRUE)
pcaOutput2 <- as.data.frame(pcaOutput$scores)
# define the grouping variable for plotting (classes or feature names)
pcaOutput2$groups <- groups
# when plotting samples, draw 95% confidence ellipses per group
# (not for features, which have no replicates)
if (print_ellipse) {
# group and summarize by classes: benign, malignant
# centroids w/3 columns: groups, PC1, PC2
centroids <- aggregate(cbind(PC1, PC2) ~ groups, pcaOutput2, mean)
# bind for the two groups (classes)
# conf.rgn w/3 columns: groups, PC1, PC2
conf.rgn <- do.call(rbind, lapply(unique(pcaOutput2$groups), function(t)
data.frame(groups = as.character(t),
# ellipse data for PC1 and PC2
ellipse(cov(pcaOutput2[pcaOutput2$groups == t, 1:2]),
centre = as.matrix(centroids[centroids$groups == t, 2:3]),
level = 0.95),
stringsAsFactors = FALSE)))
plot <- ggplot(data = pcaOutput2, aes(x = PC1, y = PC2,
group = groups,
color = groups)) +
geom_polygon(data = conf.rgn, aes(fill = groups), alpha = 0.2) + # ellipses
geom_point(size = 2, alpha = 0.6) +
scale_color_brewer(palette = "Set1") +
labs(title = title,
color = "",
fill = "",
x = paste0("PC1: ", round(pcaOutput$pov[1], digits = 2) * 100,
"% variance"),
y = paste0("PC2: ", round(pcaOutput$pov[2], digits = 2) * 100,
"% variance"))
} else {
# with 10 or fewer groups (e.g. the features here), use colors from RColorBrewer
if (length(unique(pcaOutput2$groups)) <= 10) {
plot <- ggplot(data = pcaOutput2, aes(x = PC1, y = PC2,
group = groups,
color = groups)) +
geom_point(size = 2, alpha = 0.6) +
scale_color_brewer(palette = "Set1") +
labs(title = title,
color = "",
fill = "",
x = paste0("PC1: ", round(pcaOutput$pov[1], digits = 2) * 100,
"% variance"),
y = paste0("PC2: ", round(pcaOutput$pov[2], digits = 2) * 100,
"% variance"))
} else {
# otherwise fall back to ggplot2's default discrete colors
plot <- ggplot(data = pcaOutput2, aes(x = PC1, y = PC2,
group = groups, color = groups)) +
geom_point(size = 2, alpha = 0.6) +
labs(title = title,
color = "",
fill = "",
x = paste0("PC1: ", round(pcaOutput$pov[1], digits = 2) * 100,
"% variance"),
y = paste0("PC2: ", round(pcaOutput$pov[2], digits = 2) * 100,
"% variance"))
}
}
return(plot)
}
library(gridExtra)
#>
#> Attaching package: 'gridExtra'
#> The following object is masked from 'package:BiocGenerics':
#>
#> combine
library(grid)
# samples plot: pca() scores the columns, so transpose to put samples in columns
p1 <- pca_func(data = t(bc_data[, 2:10]),
groups = as.character(bc_data$classes),
title = "Breast cancer dataset 1: Samples")
# features plot: leave the features in columns so they get scored instead
p2 <- pca_func(data = bc_data[, 2:10],
groups = as.character(colnames(bc_data[, 2:10])),
title = "Breast cancer dataset 1: Features", print_ellipse = FALSE)
grid.arrange(p1, p2, ncol = 2)
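If pcaGoPromoter is not available, the sample-level PCA can be approximated with base R's prcomp; a minimal sketch that reproduces the scores and the proportion of variance explained (prcomp takes samples in rows, so no transpose is needed here):
# minimal base-R alternative for the samples PCA
pca_base <- prcomp(bc_data[, 2:10], center = TRUE, scale. = TRUE)
# proportion of variance explained by each PC
pov <- pca_base$sdev^2 / sum(pca_base$sdev^2)
round(pov[1:2], 2)
# scores of the first two PCs, labelled by class
scores <- data.frame(pca_base$x[, 1:2], groups = bc_data$classes)
head(scores)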
12.2.1 Density plots by class
# density plots of each feature, split by class
library(tidyr)
#>
#> Attaching package: 'tidyr'
#> The following object is masked from 'package:S4Vectors':
#>
#> expand
# gather data. from column clump_thickness to mitosis
bc_data_gather <- bc_data %>%
gather(measure, value, clump_thickness:mitosis)
ggplot(data = bc_data_gather, aes(x = value, fill = classes, color = classes)) +
geom_density(alpha = 0.3, size = 1) +
geom_rug() +
scale_fill_brewer(palette = "Set1") +
scale_color_brewer(palette = "Set1") +
facet_wrap( ~ measure, scales = "free_y", ncol = 3)
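As a numeric complement to the density plots, the per-class medians of each feature give a quick sense of which features separate the classes (a small sketch using the gathered data from above):
# median of each feature by class; larger gaps suggest more discriminative features
aggregate(value ~ classes + measure, data = bc_data_gather, FUN = median)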
12.3 Feature importance
To get an idea about the features' respective importance, I run Random Forest models with repeated cross-validation (10-fold, repeated 10 times) using the caret package. If I wanted to use feature importance to select features for modeling, I would need to estimate it on the training data only, instead of on the complete dataset. Here, however, I only use it to get acquainted with my data. I again define a function that estimates the feature importance and produces a plot.
library(caret)
#> Loading required package: lattice
# library(doParallel) # parallel processing
# registerDoParallel()
# prepare training scheme
control <- trainControl(method = "repeatedcv", number = 10, repeats = 10)
feature_imp <- function(model, title) {
# estimate variable importance
importance <- varImp(model, scale = TRUE)
# prepare dataframes for plotting
importance_df_1 <- importance$importance
importance_df_1$group <- rownames(importance_df_1)
importance_df_2 <- importance_df_1
importance_df_2$Overall <- 0
importance_df <- rbind(importance_df_1, importance_df_2)
plot <- ggplot() +
geom_point(data = importance_df_1, aes(x = Overall,
y = group,
color = group), size = 2) +
geom_path(data = importance_df, aes(x = Overall,
y = group,
color = group,
group = group), size = 1) +
theme(legend.position = "none") +
labs(
x = "Importance",
y = "",
title = title,
subtitle = "Scaled feature importance",
caption = "\nDetermined with Random Forest and
repeated cross-validation (10-fold, 10 repeats)"
)
return(plot)
}
# train the model
set.seed(27)
imp_1 <- train(classes ~ ., data = bc_data, method = "rf",
preProcess = c("scale", "center"),
trControl = control)
p1 <- feature_imp(imp_1, title = "Breast cancer dataset 1")
p1
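The underlying scores can also be read directly off caret's varImp object, sorted from most to least important (a short sketch):
# scaled importance scores, sorted in decreasing order
imp_df <- varImp(imp_1, scale = TRUE)$importance
imp_df[order(imp_df$Overall, decreasing = TRUE), , drop = FALSE]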
12.4 Feature Selection
- By correlation
- By Recursive Feature Elimination
- By Genetic Algorithm
set.seed(27)
bc_data_index <- createDataPartition(bc_data$classes, p = 0.7, list = FALSE)
bc_data_train <- bc_data[bc_data_index, ]
bc_data_test <- bc_data[-bc_data_index, ]
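Because createDataPartition() stratifies on the outcome, both partitions should preserve the roughly 2:1 benign-to-malignant balance; a quick verification:
# class proportions should be nearly identical in train and test
round(prop.table(table(bc_data_train$classes)), 2)
round(prop.table(table(bc_data_test$classes)), 2)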
12.4.1 Correlation
library(corrplot)
#> corrplot 0.84 loaded
# calculate correlation matrix
corMatMy <- cor(bc_data_train[, -1])
corrplot(corMatMy, order = "hclust")
# apply a correlation filter with cutoff 0.70
highlyCor <- colnames(bc_data_train[, -1])[findCorrelation(corMatMy,
cutoff = 0.7,
verbose = TRUE)]
#> Compare row 2 and column 3 with corr 0.9
#> Means: 0.709 vs 0.595 so flagging column 2
#> Compare row 3 and column 7 with corr 0.737
#> Means: 0.674 vs 0.572 so flagging column 3
#> All correlations <= 0.7
# which variables are flagged for removal?
highlyCor
#> [1] "uniformity_of_cell_size" "uniformity_of_cell_shape"
# then we remove these variables
bc_data_cor <- bc_data_train[, which(!colnames(bc_data_train) %in% highlyCor)]
names(bc_data_cor)
#> [1] "classes" "clump_thickness"
#> [3] "marginal_adhesion" "single_epithelial_cell_size"
#> [5] "bare_nuclei" "bland_chromatin"
#> [7] "normal_nucleoli" "mitosis"
# confirm features were removed
outersect <- function(x, y) {
sort(c(setdiff(x, y),
setdiff(y, x)))
}
outersect(names(bc_data_cor), names(bc_data_train))
#> [1] "uniformity_of_cell_shape" "uniformity_of_cell_size"
Two features were removed by the correlation filter.
12.4.2 Recursive Feature Elimination (RFE)
# ensure the results are repeatable
set.seed(7)
# define the control using a random forest selection function with cross validation
control <- rfeControl(functions = rfFuncs, method = "cv", number = 10)
# run the RFE algorithm
results_1 <- rfe(x = bc_data_train[, -1],
y = bc_data_train$classes,
sizes = c(1:9),
rfeControl = control)
# chosen features
predictors(results_1)
#> [1] "bare_nuclei" "clump_thickness"
#> [3] "normal_nucleoli" "uniformity_of_cell_size"
#> [5] "uniformity_of_cell_shape" "single_epithelial_cell_size"
#> [7] "bland_chromatin" "marginal_adhesion"
# subset the chosen features
sel_cols <- which(colnames(bc_data_train) %in% predictors(results_1))
bc_data_rfe <- bc_data_train[, c(1, sel_cols)]
names(bc_data_rfe)
#> [1] "classes" "clump_thickness"
#> [3] "uniformity_of_cell_size" "uniformity_of_cell_shape"
#> [5] "marginal_adhesion" "single_epithelial_cell_size"
#> [7] "bare_nuclei" "bland_chromatin"
#> [9] "normal_nucleoli"
# confirm features removed by RFE
outersect(names(bc_data_rfe), names(bc_data_train))
#> [1] "mitosis"
Only one feature, mitosis, was removed by RFE.
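The rfe object also records how resampled accuracy changed with subset size, which shows why eight features were kept; an optional look:
# resampling profile across the tested subset sizes
results_1
plot(results_1, type = c("g", "o"))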
12.4.3 Genetic Algorithm (GA)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following object is masked from 'package:gridExtra':
#>
#> combine
#> The following objects are masked from 'package:Biostrings':
#>
#> collapse, intersect, setdiff, setequal, union
#> The following object is masked from 'package:XVector':
#>
#> slice
#> The following objects are masked from 'package:IRanges':
#>
#> collapse, desc, intersect, setdiff, slice, union
#> The following objects are masked from 'package:S4Vectors':
#>
#> first, intersect, rename, setdiff, setequal, union
#> The following objects are masked from 'package:BiocGenerics':
#>
#> combine, intersect, setdiff, union
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
ga_ctrl <- gafsControl(functions = rfGA, # Assess fitness with RF
method = "cv", # 10 fold cross validation
genParallel = TRUE, # Use parallel programming
allowParallel = TRUE)
lev <- c("malignant", "benign") # Set the levels
set.seed(27)
model_1 <- gafs(x = bc_data_train[, -1], y = bc_data_train$classes,
iters = 10, # generations of algorithm
popSize = 5, # population size for each generation
levels = lev,
gafsControl = ga_ctrl)
#>
#> Attaching package: 'recipes'
#> The following object is masked from 'package:stats':
#>
#> step
plot(model_1) # plot mean internal and external fitness (accuracy) by generation
# the final set of features selected by the GA
model_1$ga$final
#> [1] "clump_thickness" "uniformity_of_cell_size"
#> [3] "uniformity_of_cell_shape" "marginal_adhesion"
#> [5] "bare_nuclei" "bland_chromatin"
#> [7] "normal_nucleoli"
# select features
sel_cols_ga <- which(colnames(bc_data_train) %in% model_1$ga$final)
bc_data_ga <- bc_data_train[, c(1, sel_cols_ga)]
names(bc_data_ga)
#> [1] "classes" "clump_thickness"
#> [3] "uniformity_of_cell_size" "uniformity_of_cell_shape"
#> [5] "marginal_adhesion" "bare_nuclei"
#> [7] "bland_chromatin" "normal_nucleoli"
# features removed by the GA
outersect(names(bc_data_ga), names(bc_data_train))
#> [1] "mitosis" "single_epithelial_cell_size"
Two features (mitosis and single_epithelial_cell_size) were removed by the GA.
12.5 Model comparison
12.5.1 Using all features
set.seed(27)
model_bc_data_all <- train(classes ~ .,
data = bc_data_train,
method = "rf",
preProcess = c("scale", "center"),
trControl = trainControl(method = "repeatedcv",
number = 5, repeats = 10,
verboseIter = FALSE))
# confusion matrix
cm_all_1 <- confusionMatrix(predict(model_bc_data_all, bc_data_test[, -1]), bc_data_test$classes)
cm_all_1
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction benign malignant
#> benign 131 2
#> malignant 6 70
#>
#> Accuracy : 0.962
#> 95% CI : (0.926, 0.983)
#> No Information Rate : 0.656
#> P-Value [Acc > NIR] : <2e-16
#>
#> Kappa : 0.916
#>
#> Mcnemar's Test P-Value : 0.289
#>
#> Sensitivity : 0.956
#> Specificity : 0.972
#> Pos Pred Value : 0.985
#> Neg Pred Value : 0.921
#> Prevalence : 0.656
#> Detection Rate : 0.627
#> Detection Prevalence : 0.636
#> Balanced Accuracy : 0.964
#>
#> 'Positive' Class : benign
#>
12.5.2 Compare selection methods
# compare features selected by the three methods
library(gplots)
#>
#> Attaching package: 'gplots'
#> The following object is masked from 'package:IRanges':
#>
#> space
#> The following object is masked from 'package:S4Vectors':
#>
#> space
#> The following object is masked from 'package:stats':
#>
#> lowess
venn_list <- list(cor = colnames(bc_data_cor)[-1],
rfe = colnames(bc_data_rfe)[-1],
ga = colnames(bc_data_ga)[-1])
venn <- venn(venn_list)
venn
#> num cor rfe ga
#> 000 0 0 0 0
#> 001 0 0 0 1
#> 010 0 0 1 0
#> 011 2 0 1 1
#> 100 1 1 0 0
#> 101 0 1 0 1
#> 110 1 1 1 0
#> 111 5 1 1 1
#> attr(,"intersections")
#> attr(,"intersections")$`cor:rfe:ga`
#> [1] "clump_thickness" "marginal_adhesion" "bare_nuclei"
#> [4] "bland_chromatin" "normal_nucleoli"
#>
#> attr(,"intersections")$cor
#> [1] "mitosis"
#>
#> attr(,"intersections")$`rfe:ga`
#> [1] "uniformity_of_cell_size" "uniformity_of_cell_shape"
#>
#> attr(,"intersections")$`cor:rfe`
#> [1] "single_epithelial_cell_size"
#>
#> attr(,"class")
#> [1] "venn"
Five of the nine features were chosen by all three methods; the biggest overlap is between RFE and GA, which share seven features. RFE retained eight features for modeling, while the GA and the correlation filter each retained seven.
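For reference, the three-way overlap can also be computed without gplots, using the venn_list defined above:
# features selected by all three methods
Reduce(intersect, venn_list)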
12.5.3 Correlation
# correlation
set.seed(127)
model_bc_data_cor <- train(classes ~ .,
data = bc_data_cor,
method = "rf",
preProcess = c("scale", "center"),
trControl = trainControl(method = "repeatedcv", number = 5, repeats = 10, verboseIter = FALSE))
cm_cor_1 <- confusionMatrix(predict(model_bc_data_cor, bc_data_test[, -1]), bc_data_test$classes)
cm_cor_1
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction benign malignant
#> benign 130 4
#> malignant 7 68
#>
#> Accuracy : 0.947
#> 95% CI : (0.908, 0.973)
#> No Information Rate : 0.656
#> P-Value [Acc > NIR] : <2e-16
#>
#> Kappa : 0.885
#>
#> Mcnemar's Test P-Value : 0.546
#>
#> Sensitivity : 0.949
#> Specificity : 0.944
#> Pos Pred Value : 0.970
#> Neg Pred Value : 0.907
#> Prevalence : 0.656
#> Detection Rate : 0.622
#> Detection Prevalence : 0.641
#> Balanced Accuracy : 0.947
#>
#> 'Positive' Class : benign
#>
12.5.4 Recursive Feature Elimination
set.seed(127)
model_bc_data_rfe <- train(classes ~ .,
data = bc_data_rfe,
method = "rf",
preProcess = c("scale", "center"),
trControl = trainControl(method = "repeatedcv",
number = 5, repeats = 10,
verboseIter = FALSE))
cm_rfe_1 <- confusionMatrix(predict(model_bc_data_rfe, bc_data_test[, -1]), bc_data_test$classes)
cm_rfe_1
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction benign malignant
#> benign 130 3
#> malignant 7 69
#>
#> Accuracy : 0.952
#> 95% CI : (0.914, 0.977)
#> No Information Rate : 0.656
#> P-Value [Acc > NIR] : <2e-16
#>
#> Kappa : 0.895
#>
#> Mcnemar's Test P-Value : 0.343
#>
#> Sensitivity : 0.949
#> Specificity : 0.958
#> Pos Pred Value : 0.977
#> Neg Pred Value : 0.908
#> Prevalence : 0.656
#> Detection Rate : 0.622
#> Detection Prevalence : 0.636
#> Balanced Accuracy : 0.954
#>
#> 'Positive' Class : benign
#>
12.5.5 GA
set.seed(127)
model_bc_data_ga <- train(classes ~ .,
data = bc_data_ga,
method = "rf",
preProcess = c("scale", "center"),
trControl = trainControl(method = "repeatedcv",
number = 5, repeats = 10,
verboseIter = FALSE))
cm_ga_1 <- confusionMatrix(predict(model_bc_data_ga, bc_data_test[, -1]), bc_data_test$classes)
cm_ga_1
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction benign malignant
#> benign 131 2
#> malignant 6 70
#>
#> Accuracy : 0.962
#> 95% CI : (0.926, 0.983)
#> No Information Rate : 0.656
#> P-Value [Acc > NIR] : <2e-16
#>
#> Kappa : 0.916
#>
#> Mcnemar's Test P-Value : 0.289
#>
#> Sensitivity : 0.956
#> Specificity : 0.972
#> Pos Pred Value : 0.985
#> Neg Pred Value : 0.921
#> Prevalence : 0.656
#> Detection Rate : 0.627
#> Detection Prevalence : 0.636
#> Balanced Accuracy : 0.964
#>
#> 'Positive' Class : benign
#>
12.6 Create comparison tables
# extract the "overall" statistics from each confusion matrix
overall <- data.frame(dataset = 1,
model = rep(c("all", "cor", "rfe", "ga"), 1),
rbind(cm_all_1$overall,
cm_cor_1$overall,
cm_rfe_1$overall,
cm_ga_1$overall)
)
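Before reshaping, the raw table already gives a compact numeric comparison of the four models:
# accuracy and kappa per feature selection method
overall[, c("model", "Accuracy", "Kappa")]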
# convert to tidy data
library(tidyr)
overall_gather <- overall[, 1:4] %>% # take the first columns:
gather(measure, value, Accuracy:Kappa) # dataset, model, Accuracy and Kappa
# extract the "byClass" statistics from each confusion matrix
byClass <- data.frame(dataset = 1,
model = rep(c("all", "cor", "rfe", "ga"), 1),
rbind(cm_all_1$byClass,
cm_cor_1$byClass,
cm_rfe_1$byClass,
cm_ga_1$byClass)
)
# convert to tidy data
byClass_gather <- byClass[, c(1:4, 7)] %>% # keep dataset, model, Sensitivity,
gather(measure, value, Sensitivity:Precision) # Specificity and Precision
# join the two tables
overall_byClass_gather <- rbind(overall_gather, byClass_gather)
# order the models as a factor for plotting
overall_byClass_gather <- within(
overall_byClass_gather, model <- factor(model,
levels = c("all", "cor", "rfe", "ga")))
ggplot(overall_byClass_gather, aes(x = model, y = value, color = measure,
shape = measure, group = measure)) +
geom_point(size = 4, alpha = 0.8) +
geom_path(alpha = 0.7) +
scale_colour_brewer(palette = "Set1") +
facet_grid(dataset ~ ., scales = "free_y") +
labs(
x = "Feature Selection method",
y = "Value",
color = "",
shape = "",
title = "Comparison of feature selection methods",
subtitle = "in the Breast Cancer Wisconsin dataset",
caption = "\nBreast Cancer Wisconsin Data Set
Street et al., 1993;
all: no feature selection
cor: features with correlation > 0.7 removed
rfe: Recursive Feature Elimination
ga: Genetic Algorithm"
)
- Least accurate: feature selection by correlation (accuracy 0.947)
- Most accurate: the genetic algorithm (accuracy 0.962, tied with using all features)
- Keeping all features is more accurate than removing features by correlation.