Preparations

Load data analysis project

Load the project with ProjectTemplate. The setwd() calls assume the report is rendered from the reports/ subdirectory.

setwd("..")
ProjectTemplate::reload.project()
setwd("reports")

Load additional packages.

library(caret)
library(glmnet)
library(randomForest)

See also http://topepo.github.io/caret/

Settings

trellis.par.set(caretTheme())
trCntrl <- trainControl(
  method = "repeatedcv",  # repeated k-fold cross-validation
  number = 3,             # k = 3 folds
  repeats = 5,            # 5 repeats
  classProbs = TRUE,      # class probabilities, required by twoClassSummary
  summaryFunction = twoClassSummary)
tune_length <- 5  # number of candidate values per tuning parameter
n_keep <- 5000    # number of features to retain in the non-specific filtering

Prepare data

Initial steps

Non-specific filtering of features

## Remove probe sets lacking annotation and duplicates mapping to the same gene
eset <- genefilter::featureFilter(casecontstudy)
## Keep the n_keep most variable features
eset <- genefilter::varFilter(eset, var.cutoff = 1 - n_keep / nrow(eset))
## Turn probe identifiers into syntactically valid R names
featureNames(eset) <- make.names(fData(eset)$probeid)
print(eset)
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 5000 features, 768 samples 
##   element names: exprs 
## protocolData: none
## phenoData
##   sampleNames: BCPT0950_1002_1 BCPT0006_1002_0 ... BCPT0682_3163_1
##     (768 total)
##   varLabels: tumorid setnr ... HER2_amplicon_metagene (11 total)
##   varMetadata: labelDescription
## featureData
##   featureNames: X100127472_TGI_at X100140348_TGI_at ...
##     X100148801_TGI_at (5000 total)
##   fvarLabels: probeid entrezid symbol
##   fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation: HuRSTA2a520709
## Assemble the modelling data: outcome (casecontcd) plus the transposed
## expression matrix (samples as rows, features as columns)
all_data <- cbind(
  pData(eset)[, "casecontcd", drop = FALSE],
  as.data.frame(t(exprs(eset))))

Data splitting

The full study is split into a training set (2/3) and a test set (1/3), with the QC substudy forming part of the test set.

Step 1: Create an initial partition of study subjects into training and test sets that are disjoint with respect to case-control sets, but not necessarily at the individual tumour level.

all_casecontsets <- unique(casecontstudy$setnr)
casecontsets_not_in_qcsubstudy <- setdiff(all_casecontsets, 
  subset(pData(qcsubstudy), complete_casecontset)$setnr)

set.seed(20101216)
init_training_casecontsets <- sample(casecontsets_not_in_qcsubstudy, 
  size = length(all_casecontsets)*2/3)

init_training_subjids <- pData(casecontstudy) %>%
  filter(setnr %in% init_training_casecontsets) %>%
  dplyr::select(subjid) %>%
  unlist()
init_test_subjids <- setdiff(casecontstudy$subjid, init_training_subjids)
tumorids_qcsubstudy <- filter(pData(qcsubstudy), 
  complete_casecontset)$tumorid
tumorids_cases_init_training <- filter(pData(casecontstudy), 
  casecontcd == "case" & subjid %in% init_training_subjids)$tumorid
tumorids_cases_init_test <- filter(pData(casecontstudy), 
  casecontcd == "case" & subjid %in% init_test_subjids)$tumorid

Step 2: Exclude some study subjects corresponding to the same individual tumour such that the training and test sets also become disjoint at the individual tumour level.

## Identify tumours that correspond to study subjects in both training and test sets
tumorids_to_be_handled_tbl <- casecontstudy_design %>%
  mutate(
    init_partition = factor(
      subjid %in% init_training_subjids,
      levels = c(TRUE, FALSE),
      labels = c("init_training", "init_test"))
  ) %>%
  dplyr::select(tumorid, init_partition) %>%
  table() %>% 
  as.data.frame.matrix() %>%
  rownames_to_column(var = "tumorid") %>%
  as_tibble() %>%
  dplyr::rename(
    n_subj_init_training = init_training,
    n_subj_init_test = init_test
  ) %>%
  rowwise() %>%
  mutate(
    in_qcsubstudy = is.element(tumorid, tumorids_qcsubstudy),
    case_in_init_training = is.element(tumorid, tumorids_cases_init_training),
    case_in_init_test = is.element(tumorid, tumorids_cases_init_test)
  ) %>%
  filter(n_subj_init_training > 0 & n_subj_init_test > 0)
## Decide
tumorids_to_be_handled_tbl <- tumorids_to_be_handled_tbl %>%
  mutate(
    excl_subj_from_training = !case_in_init_training &
      (in_qcsubstudy | case_in_init_test | (n_subj_init_training <= n_subj_init_test)),
    excl_subj_from_test = !excl_subj_from_training
  )
## Assemble final training set
tumorids_to_be_excl_from_training <- tumorids_to_be_handled_tbl %>%
  filter(excl_subj_from_training) %>%
  dplyr::select(tumorid) %>%
  unlist()

subjids_to_be_excl_from_training <- casecontstudy_design %>%
  filter(tumorid %in% tumorids_to_be_excl_from_training) %>%
  dplyr::select(subjid) %>%
  unlist()

training_subjids <- 
  setdiff(init_training_subjids, subjids_to_be_excl_from_training)

training <- all_data[training_subjids, ]
## Assemble final test set
tumorids_to_be_excl_from_test <- tumorids_to_be_handled_tbl %>%
  filter(excl_subj_from_test) %>%
  dplyr::select(tumorid) %>%
  unlist()

subjids_to_be_excl_from_test <- casecontstudy_design %>%
  filter(tumorid %in% tumorids_to_be_excl_from_test) %>%
  dplyr::select(subjid) %>%
  unlist()

test_subjids <- 
  setdiff(init_test_subjids, subjids_to_be_excl_from_test)

testing <- all_data[test_subjids, ]
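
A quick check that the final partitions share no study subjects could be added at this point; a one-line sketch:

## Sanity check: training and test sets are disjoint at the subject level
stopifnot(length(intersect(training_subjids, test_subjids)) == 0)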

Table. Number of case-control sets in each partition of the full study.

              Training set   Test set   Sum
  QC substudy            0         40    40
  (rest)               126         24   150
  Sum                  126         64   190
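
The chunk that tallies these counts is not shown in the report; a minimal sketch that could reproduce the table, assuming the objects created in the data-splitting steps above:

## Tabulate case-control sets by stratum and partition (sketch)
set_tbl <- tibble(setnr = all_casecontsets) %>%
  mutate(
    partition = ifelse(setnr %in% init_training_casecontsets,
      "Training set", "Test set"),
    stratum = ifelse(
      setnr %in% subset(pData(qcsubstudy), complete_casecontset)$setnr,
      "QC substudy", "(rest)"))
addmargins(table(set_tbl$stratum, set_tbl$partition))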

Individual tumours:

##                       tumour_in_test_set
## tumour_in_training_set FALSE TRUE Sum
##                  FALSE     0  232 232
##                  TRUE    391    0 391
##                  Sum     391  232 623
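
The generating chunk is likewise not shown; a sketch of how such a cross-tabulation could be produced, assuming casecontstudy_design links tumorid to subjid as above:

## Per tumour, flag presence of its study subjects in each final partition
tumour_tbl <- casecontstudy_design %>%
  group_by(tumorid) %>%
  summarise(
    tumour_in_training_set = any(subjid %in% training_subjids),
    tumour_in_test_set = any(subjid %in% test_subjids))
addmargins(with(tumour_tbl,
  table(tumour_in_training_set, tumour_in_test_set)))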

Investigated models

Panel of models:

  • Linear Discriminant Analysis (LDA) with Selection by Univariate Filter
    • as an example of a simple – yet, for gene-expression data, often surprisingly powerful – approach
  • Logistic Regression with Elastic Net Regularisation
    • as an example of an advanced method with implicit feature selection
  • Random Forest with Selection by Univariate Filter
    • as an example of another advanced approach
  • (LDA based on the) HER2 amplicon metagene
    • for comparison and as a positive control

Compare with, for example, http://topepo.github.io/caret/feature-selection-overview.html#feature-selection-methods

Train models

See also http://topepo.github.io/caret/model-training-and-tuning.html

Initialise container to store models.

fit_list <- list()

Construct stratified cross-validation folds of study subjects that are disjoint with respect to case-control sets. However, the folds are not necessarily disjoint at the individual tumour level. Therefore, the performance as estimated by this cross-validation approach is most likely slightly over-optimistic.

createMultiGroupKFold <- function(group, k = 10, times = 5) {
  ## Adapted from caret::createMultiFolds
  pretty_nums <- paste0("Rep", gsub(" ", "0", format(1:times)))
  for (i in 1:times) {
    ## groupKFold keeps all observations sharing a group value in the same fold
    tmp <- groupKFold(group, k = k)
    names(tmp) <- paste0(
      "Fold", gsub(" ", "0", format(seq(along = tmp))), 
      ".", pretty_nums[i])
    out <- if (i == 1) 
      tmp
    else 
      c(out, tmp)
  }
  out
}

set.seed(20101216)
my_folds <- createMultiGroupKFold(
  group = pData(casecontstudy)[rownames(training), "setnr"],
  k = trCntrl$number, 
  times = trCntrl$repeats)

trCntrl$index <- my_folds
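
As a sanity check, one could verify that no case-control set is shared between the held-in and held-out parts of any resample; a minimal sketch, assuming the objects defined above:

## groupKFold returns held-in (training) indices; the held-out part is the complement
grp <- pData(casecontstudy)[rownames(training), "setnr"]
stopifnot(vapply(
  my_folds,
  function(idx) length(intersect(grp[idx], grp[-idx])) == 0,
  logical(1)))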

Model: HER2 amplicon metagene

fit_list[["HER2 amplicon metagene"]] <-
  train(casecontcd ~ HER2_amplicon_metagene,
    data = pData(eset)[rownames(training), c("casecontcd", "HER2_amplicon_metagene")],
    method = "lda",
    metric = "ROC",
    trControl = trCntrl)

print(fit_list[["HER2 amplicon metagene"]])
## Linear Discriminant Analysis 
## 
## 447 samples
##   1 predictor
##   2 classes: 'control', 'case' 
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold, repeated 5 times) 
## Summary of sample sizes: 298, 297, 299, 301, 293, 300, ... 
## Resampling results:
## 
##   ROC        Sens       Spec      
##   0.5703095  0.9831904  0.04206516

Model: Linear Discriminant Analysis with Selection by Univariate Filter

ldaSBF2 <- ldaSBF
## Score each feature with a two-sample t-test p-value
ldaSBF2$score <- function(x, y)
  t.test(x ~ y)$p.value
## Keep features with Bonferroni-adjusted p < 0.05
ldaSBF2$filter <- function(score, x, y)
  p.adjust(score, "bonferroni") < 0.05  # stringent criterion
ldaSBF2$summary <- twoClassSummary

## Reuse the resampling set-up, including the grouped folds, from trCntrl
filterCtrl <- sbfControl(
  functions = ldaSBF2,
  method = trCntrl$method,
  number = trCntrl$number,
  repeats = trCntrl$repeats, 
  index = trCntrl$index)

fit_list[["Linear Discriminant Analysis with Selection by Univariate Filter"]] <-
  sbf(casecontcd ~ ., data = training, sbfControl = filterCtrl)

print(fit_list[["Linear Discriminant Analysis with Selection by Univariate Filter"]])
## 
## Selection By Filter
## 
## Outer resampling method: Cross-Validated (3 fold, repeated 5 times) 
## 
## Resampling performance:
## 
##     ROC   Sens   Spec   ROCSD  SensSD SpecSD
##  0.8514 0.8679 0.6587 0.01524 0.04272 0.0843
## 
## Using the training set, 117 variables were selected:
##    X100162496_TGI_at, X100137305_TGI_at, X100121844_TGI_at, X100302080_TGI_at, X100150277_TGI_at...
## 
## During resampling, the top 5 selected variables (out of a possible 156):
##    X100121844_TGI_at (100%), X100123262_TGI_at (100%), X100125764_TGI_at (100%), X100126625_TGI_at (100%), X100126750_TGI_at (100%)
## 
## On average, 66.1 variables were selected (min = 46, max = 81)

Model: Logistic Regression with Elastic Net Regularisation

fit_list[["Logistic Regression with Elastic Net Regularisation"]] <-
  train(casecontcd ~ ., data = training,
    method = "glmnet",
    family = "binomial",
    metric = "ROC",
    trControl = trCntrl,
    tuneLength = tune_length)

plot(fit_list[["Logistic Regression with Elastic Net Regularisation"]])
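
The tuning-parameter combination that maximised the cross-validated ROC can be inspected via the fitted object's bestTune element, for example:

## Selected (alpha, lambda) combination for the elastic net
fit_list[["Logistic Regression with Elastic Net Regularisation"]]$bestTune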

Model: Random Forest with Selection by Univariate Filter

rfSBF2 <- rfSBF
## Score each feature with a two-sample t-test p-value
rfSBF2$score <- function(x, y)
  t.test(x ~ y)$p.value
## Keep features with Benjamini-Hochberg-adjusted p < 0.25
rfSBF2$filter <- function(score, x, y)
  p.adjust(score, "BH") < 0.25  # less stringent criterion
rfSBF2$summary <- twoClassSummary

fit_list[["Random Forest with Selection by Univariate Filter"]] <-
  sbf(casecontcd ~ ., data = training,
    sbfControl = sbfControl(
      functions = rfSBF2,
      method = trCntrl$method,
      number = trCntrl$number,
      repeats = trCntrl$repeats, 
      index = trCntrl$index))

print(fit_list[["Random Forest with Selection by Univariate Filter"]])
## 
## Selection By Filter
## 
## Outer resampling method: Cross-Validated (3 fold, repeated 5 times) 
## 
## Resampling performance:
## 
##     ROC   Sens   Spec   ROCSD  SensSD  SpecSD
##  0.8596 0.9262 0.4858 0.02245 0.04314 0.07062
## 
## Using the training set, 1566 variables were selected:
##    X100303123_TGI_at, X100125605_TGI_at, X100140461_TGI_at, X100141224_TGI_at, X100129538_TGI_at...
## 
## During resampling, the top 5 selected variables (out of a possible 2659):
##    X100121670_TGI_at (100%), X100121844_TGI_at (100%), X100121953_TGI_at (100%), X100122306_TGI_at (100%), X100122724_TGI_at (100%)
## 
## On average, 1012.7 variables were selected (min = 595, max = 1428)

Compare models

Again, the performance as estimated by this cross-validation approach is most likely slightly over-optimistic, since the cross-validation folds are not necessarily disjoint at the individual tumour level.

resamps <- resamples(fit_list)
parallelplot(resamps, metric = "ROC")

dotplot(resamps, metric = "ROC")
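
Formal pairwise comparisons of the resampled ROC values could also be made with caret's diff method for resamples objects, for example:

## Pairwise differences in resampled ROC, with t-tests (see ?diff.resamples)
difs <- diff(resamps, metric = "ROC")
summary(difs)
dotplot(difs, metric = "ROC")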

See also http://topepo.github.io/caret/model-training-and-tuning.html#between-models

Cache results
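
If the cache subdirectory does not yet exist, the save() call below would fail; a minimal guard, assuming the ProjectTemplate directory layout used above:

cache_dir <- file.path("..", "cache", "caret")
if (!dir.exists(cache_dir))
  dir.create(cache_dir, recursive = TRUE)  # create ../cache/caret if missing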

save(testing, training, fit_list, 
  file = file.path("..", "cache", "caret", "training.RData"))

R session information

print(sessionInfo(), locale = FALSE)
## R version 3.4.0 (2017-04-21)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Sierra 10.12.4
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] MASS_7.3-47             randomForest_4.6-12    
##  [3] glmnet_2.0-5            foreach_1.4.3          
##  [5] Matrix_1.2-9            caret_6.0-76           
##  [7] lattice_0.20-35         survival_2.41-3        
##  [9] HuRSTA2a520709.db_1.0.0 org.Hs.eg.db_3.4.1     
## [11] AnnotationDbi_1.38.0    IRanges_2.10.0         
## [13] S4Vectors_0.14.0        GEOquery_2.42.0        
## [15] Biobase_2.36.0          BiocGenerics_0.22.0    
## [17] dplyr_0.5.0             purrr_0.2.2            
## [19] readr_1.1.0             tidyr_0.6.1            
## [21] tibble_1.3.0            ggplot2_2.2.1          
## [23] tidyverse_1.1.1        
## 
## loaded via a namespace (and not attached):
##  [1] httr_1.2.1          jsonlite_1.4        splines_3.4.0      
##  [4] modelr_0.1.0        assertthat_0.2.0    highr_0.6          
##  [7] cellranger_1.1.0    yaml_2.1.14         RSQLite_1.1-2      
## [10] backports_1.0.5     quantreg_5.33       digest_0.6.12      
## [13] rvest_0.3.2         minqa_1.2.4         colorspace_1.3-2   
## [16] htmltools_0.3.5     plyr_1.8.4          psych_1.7.3.21     
## [19] XML_3.98-1.6        broom_0.4.2         SparseM_1.77       
## [22] haven_1.0.0         genefilter_1.58.0   xtable_1.8-2       
## [25] scales_0.4.1        MatrixModels_0.4-1  lme4_1.1-13        
## [28] annotate_1.54.0     mgcv_1.8-17         car_2.1-4          
## [31] nnet_7.3-12         lazyeval_0.2.0      pbkrtest_0.4-7     
## [34] mnormt_1.5-5        magrittr_1.5        readxl_1.0.0       
## [37] memoise_1.1.0       evaluate_0.10       nlme_3.1-131       
## [40] forcats_0.2.0       xml2_1.1.1          foreign_0.8-68     
## [43] tools_3.4.0         hms_0.3             ProjectTemplate_0.7
## [46] stringr_1.2.0       munsell_0.4.3       compiler_3.4.0     
## [49] grid_3.4.0          RCurl_1.95-4.8      nloptr_1.0.4       
## [52] iterators_1.0.8     bitops_1.0-6        rmarkdown_1.5      
## [55] gtable_0.2.0        ModelMetrics_1.1.0  codetools_0.2-15   
## [58] DBI_0.6-1           reshape2_1.4.2      R6_2.2.0           
## [61] lubridate_1.6.0     knitr_1.15.1        rprojroot_1.2      
## [64] stringi_1.1.5       Rcpp_0.12.10

© 2017 John Lövrot.
This work is licensed under a Creative Commons Attribution 4.0 International License.
The source code is available at github.com/lovrot/reproduce-cunha15canres.
Version 0.0.0.9005