Load project.
setwd("..")
ProjectTemplate::reload.project()
setwd("reports")
Load additional packages.
library(caret)
library(glmnet)
library(randomForest)
trellis.par.set(caretTheme())
trCntrl <- trainControl(
  method = "repeatedcv",
  number = 3,
  repeats = 5,
  classProbs = TRUE,
  summaryFunction = twoClassSummary)
tune_length <- 5
n_keep <- 5000
## Non-specific filtering: featureFilter() keeps annotated, non-duplicated
## probe sets; varFilter() then retains the n_keep most variable features
eset <- genefilter::featureFilter(casecontstudy)
eset <- genefilter::varFilter(eset, var.cutoff = 1 - n_keep / nrow(eset))
featureNames(eset) <- make.names(fData(eset)$probeid)
print(eset)
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 5000 features, 768 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: BCPT0950_1002_1 BCPT0006_1002_0 ... BCPT0682_3163_1
## (768 total)
## varLabels: tumorid setnr ... HER2_amplicon_metagene (11 total)
## varMetadata: labelDescription
## featureData
## featureNames: X100127472_TGI_at X100140348_TGI_at ...
## X100148801_TGI_at (5000 total)
## fvarLabels: probeid entrezid symbol
## fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation: HuRSTA2a520709
all_data <- cbind(
  pData(eset)[, "casecontcd", drop = FALSE],
  as.data.frame(t(exprs(eset))))
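As a quick sanity check (a sketch; the expected dimensions follow from the ExpressionSet printed above), the assembled data frame should have one row per sample and the outcome plus 5000 probe sets as columns.
## Sanity check (sketch): 768 samples x (1 outcome + 5000 probe sets)
dim(all_data)
stopifnot(identical(dim(all_data), c(768L, 5001L)))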
The full study is split into a training set (2/3) and a test set (1/3), with the QC substudy forming part of the test set.
Step 1: Create an initial partition of study subjects into training and test sets that are disjoint with respect to case-control sets, but not necessarily at the individual tumour level.
all_casecontsets <- unique(casecontstudy$setnr)
casecontsets_not_in_qcsubstudy <- setdiff(all_casecontsets,
  subset(pData(qcsubstudy), complete_casecontset)$setnr)
set.seed(20101216)
init_training_casecontsets <- sample(casecontsets_not_in_qcsubstudy,
  size = length(all_casecontsets) * 2/3)
init_training_subjids <- pData(casecontstudy) %>%
  filter(setnr %in% init_training_casecontsets) %>%
  dplyr::select(subjid) %>%
  unlist()
init_test_subjids <- setdiff(casecontstudy$subjid, init_training_subjids)
tumorids_qcsubstudy <- filter(pData(qcsubstudy),
  complete_casecontset)$tumorid
tumorids_cases_init_training <- filter(pData(casecontstudy),
  casecontcd == "case" & subjid %in% init_training_subjids)$tumorid
tumorids_cases_init_test <- filter(pData(casecontstudy),
  casecontcd == "case" & subjid %in% init_test_subjids)$tumorid
Step 2: Exclude some study subjects corresponding to the same individual tumour so that the training and test sets also become disjoint at the individual tumour level.
## Identify tumours that correspond to study subjects in both training and test sets
tumorids_to_be_handled_tbl <- casecontstudy_design %>%
  mutate(
    init_partition = factor(
      subjid %in% init_training_subjids,
      levels = c(TRUE, FALSE),
      labels = c("init_training", "init_test"))
  ) %>%
  dplyr::select(tumorid, init_partition) %>%
  table() %>%
  as.data.frame.matrix() %>%
  rownames_to_column(var = "tumorid") %>%
  as_tibble() %>%
  dplyr::rename(
    n_subj_init_training = init_training,
    n_subj_init_test = init_test
  ) %>%
  rowwise() %>%
  mutate(
    in_qcsubstudy = is.element(tumorid, tumorids_qcsubstudy),
    case_in_init_training = is.element(tumorid, tumorids_cases_init_training),
    case_in_init_test = is.element(tumorid, tumorids_cases_init_test)
  ) %>%
  filter(n_subj_init_training > 0 & n_subj_init_test > 0)
## Decide from which set each tumour's subjects should be excluded:
## keep cases in the training set where possible, and keep the QC
## substudy intact in the test set
tumorids_to_be_handled_tbl <- tumorids_to_be_handled_tbl %>%
  mutate(
    excl_subj_from_training = !case_in_init_training &
      (in_qcsubstudy | case_in_init_test | (n_subj_init_training <= n_subj_init_test)),
    excl_subj_from_test = !excl_subj_from_training
  )
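A small verification sketch: since `excl_subj_from_test` is defined as the complement of `excl_subj_from_training`, every handled tumour should be excluded from exactly one side.
## Verification sketch: each handled tumour is excluded from exactly one side
with(tumorids_to_be_handled_tbl,
  table(excl_subj_from_training, excl_subj_from_test))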
## Assemble final training set
tumorids_to_be_excl_from_training <- tumorids_to_be_handled_tbl %>%
  filter(excl_subj_from_training) %>%
  dplyr::select(tumorid) %>%
  unlist()
subjids_to_be_excl_from_training <- casecontstudy_design %>%
  filter(tumorid %in% tumorids_to_be_excl_from_training) %>%
  dplyr::select(subjid) %>%
  unlist()
training_subjids <-
  setdiff(init_training_subjids, subjids_to_be_excl_from_training)
training <- all_data[training_subjids, ]
## Assemble final test set
tumorids_to_be_excl_from_test <- tumorids_to_be_handled_tbl %>%
  filter(excl_subj_from_test) %>%
  dplyr::select(tumorid) %>%
  unlist()
subjids_to_be_excl_from_test <- casecontstudy_design %>%
  filter(tumorid %in% tumorids_to_be_excl_from_test) %>%
  dplyr::select(subjid) %>%
  unlist()
test_subjids <-
  setdiff(init_test_subjids, subjids_to_be_excl_from_test)
testing <- all_data[test_subjids, ]
Case-control sets:
| | Training set | Test set | Sum |
|---|---|---|---|
| QC substudy | 0 | 40 | 40 |
| (rest) | 126 | 24 | 150 |
| Sum | 126 | 64 | 190 |
Individual tumours:
##                        tumour_in_test_set
## tumour_in_training_set FALSE TRUE Sum
##                  FALSE     0  232 232
##                   TRUE   391    0 391
##                    Sum   391  232 623
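For reference, a sketch of how the tumour-level cross-tabulation above could be reproduced (assuming `casecontstudy_design` maps each `subjid` to its `tumorid`, as used in step 2):
## Sketch: cross-tabulate tumours by partition membership, with margins
tumour_partition_tbl <- casecontstudy_design %>%
  group_by(tumorid) %>%
  summarise(
    tumour_in_training_set = any(subjid %in% training_subjids),
    tumour_in_test_set = any(subjid %in% test_subjids))
addmargins(with(tumour_partition_tbl,
  table(tumour_in_training_set, tumour_in_test_set)))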
Panel of models:
Compare with, for example, http://topepo.github.io/caret/feature-selection-overview.html#feature-selection-methods
See also http://topepo.github.io/caret/model-training-and-tuning.html
Initialise container to store models.
fit_list <- list()
Construct stratified cross-validation folds of study subjects that are disjoint with respect to case-control sets. However, the folds are not necessarily disjoint at the individual tumour level. Therefore, the performance as estimated by this cross-validation approach is most likely slightly over-optimistic.
createMultiGroupKFold <- function(group, k = 10, times = 5) {
  ## Adapted from caret::createMultiFolds
  pretty_nums <- paste0("Rep", gsub(" ", "0", format(1:times)))
  for (i in 1:times) {
    tmp <- groupKFold(group, k = k)
    names(tmp) <- paste0(
      "Fold", gsub(" ", "0", format(seq_along(tmp))),
      ".", pretty_nums[i])
    out <- if (i == 1) tmp else c(out, tmp)
  }
  out
}
set.seed(20101216)
my_folds <- createMultiGroupKFold(
  group = pData(casecontstudy)[rownames(training), "setnr"],
  k = trCntrl$number,
  times = trCntrl$repeats)
trCntrl$index <- my_folds
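A sanity-check sketch: no case-control set should occur both among the training indices and the held-out samples of any fold.
## Sanity check (sketch): folds are disjoint w.r.t. case-control sets
grp <- pData(casecontstudy)[rownames(training), "setnr"]
stopifnot(vapply(
  my_folds,
  function(idx) length(intersect(grp[idx], grp[-idx])) == 0,
  logical(1)))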
fit_list[["HER2 amplicon metagene"]] <-
train(casecontcd ~ HER2_amplicon_metagene,
data = pData(eset)[rownames(training), c("casecontcd", "HER2_amplicon_metagene")],
method = "lda",
metric = "ROC",
trControl = trCntrl)
print(fit_list[["HER2 amplicon metagene"]])
## Linear Discriminant Analysis
##
## 447 samples
## 1 predictor
## 2 classes: 'control', 'case'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold, repeated 5 times)
## Summary of sample sizes: 298, 297, 299, 301, 293, 300, ...
## Resampling results:
##
## ROC Sens Spec
## 0.5703095 0.9831904 0.04206516
ldaSBF2 <- ldaSBF
## Score each feature by a two-sample t-test p-value
ldaSBF2$score <- function(x, y)
  t.test(x ~ y)$p.value
## Keep features significant after Bonferroni adjustment
ldaSBF2$filter <- function(score, x, y)
  p.adjust(score, "bonferroni") < 0.05  # stringent criterion
ldaSBF2$summary <- twoClassSummary
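To illustrate on hypothetical toy data (by default, caret applies `score` to one predictor at a time), the customised score is simply the two-sample t-test p-value, and the filter the Bonferroni-adjusted significance call:
## Toy illustration (hypothetical data)
set.seed(1)
toy_x <- c(rnorm(10), rnorm(10, mean = 2))
toy_y <- factor(rep(c("control", "case"), each = 10))
toy_p <- ldaSBF2$score(toy_x, toy_y)
toy_p                                # two-sample t-test p-value
ldaSBF2$filter(toy_p, toy_x, toy_y)  # TRUE if it survives Bonferroni at 0.05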
filterCtrl <- sbfControl(
  functions = ldaSBF2,
  method = trCntrl$method,
  number = trCntrl$number,
  repeats = trCntrl$repeats,
  index = trCntrl$index)
fit_list[["Linear Discriminant Analysis with Selection by Univariate Filter"]] <-
sbf(casecontcd ~ ., data = training, sbfControl = filterCtrl)
print(fit_list[["Linear Discriminant Analysis with Selection by Univariate Filter"]])
##
## Selection By Filter
##
## Outer resampling method: Cross-Validated (3 fold, repeated 5 times)
##
## Resampling performance:
##
## ROC Sens Spec ROCSD SensSD SpecSD
## 0.8514 0.8679 0.6587 0.01524 0.04272 0.0843
##
## Using the training set, 117 variables were selected:
## X100162496_TGI_at, X100137305_TGI_at, X100121844_TGI_at, X100302080_TGI_at, X100150277_TGI_at...
##
## During resampling, the top 5 selected variables (out of a possible 156):
## X100121844_TGI_at (100%), X100123262_TGI_at (100%), X100125764_TGI_at (100%), X100126625_TGI_at (100%), X100126750_TGI_at (100%)
##
## On average, 66.1 variables were selected (min = 46, max = 81)
fit_list[["Logistic Regression with Elastic Net Regularisation"]] <-
train(casecontcd ~ ., data = training,
method = "glmnet",
family = "binomial",
metric = "ROC",
trControl = trCntrl,
tuneLength = tune_length,
NULL)
plot(fit_list[["Logistic Regression with Elastic Net Regularisation"]])
rfSBF2 <- rfSBF
## Score each feature by a two-sample t-test p-value
rfSBF2$score <- function(x, y)
  t.test(x ~ y)$p.value
## Keep features passing a Benjamini-Hochberg FDR threshold
rfSBF2$filter <- function(score, x, y)
  p.adjust(score, "BH") < 0.25  # less stringent criterion
rfSBF2$summary <- twoClassSummary
fit_list[["Random Forest with Selection by Univariate Filter"]] <-
sbf(casecontcd ~ ., data = training,
sbfControl = sbfControl(
functions = rfSBF2,
method = trCntrl$method,
number = trCntrl$number,
repeats = trCntrl$repeats,
index = trCntrl$index))
print(fit_list[["Random Forest with Selection by Univariate Filter"]])
##
## Selection By Filter
##
## Outer resampling method: Cross-Validated (3 fold, repeated 5 times)
##
## Resampling performance:
##
## ROC Sens Spec ROCSD SensSD SpecSD
## 0.8596 0.9262 0.4858 0.02245 0.04314 0.07062
##
## Using the training set, 1566 variables were selected:
## X100303123_TGI_at, X100125605_TGI_at, X100140461_TGI_at, X100141224_TGI_at, X100129538_TGI_at...
##
## During resampling, the top 5 selected variables (out of a possible 2659):
## X100121670_TGI_at (100%), X100121844_TGI_at (100%), X100121953_TGI_at (100%), X100122306_TGI_at (100%), X100122724_TGI_at (100%)
##
## On average, 1012.7 variables were selected (min = 595, max = 1428)
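The probe sets retained by the filter on the full training set can be listed with the standard `predictors()` accessor, for example:
## Sketch: list filter-selected probe sets; symbols can be looked up in fData(eset)
head(predictors(fit_list[["Random Forest with Selection by Univariate Filter"]]))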
Again, the performance as estimated by this cross-validation approach is most likely slightly over-optimistic, because the cross-validation folds are not necessarily disjoint at the individual tumour level.
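One way to remove this source of optimism (a hypothetical sketch, not run here) would be to group the folds by tumour instead of by case-control set, at the price of splitting case-control sets across folds:
## Hypothetical alternative: tumour-disjoint folds
## (assumes casecontstudy_design maps every training subjid to its tumorid)
tumour_of_subj <- casecontstudy_design$tumorid[
  match(rownames(training), casecontstudy_design$subjid)]
tumour_folds <- createMultiGroupKFold(
  group = tumour_of_subj,
  k = trCntrl$number,
  times = trCntrl$repeats)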
resamps <- resamples(fit_list)
parallelplot(resamps, metric = "ROC")
dotplot(resamps, metric = "ROC")
See also http://topepo.github.io/caret/model-training-and-tuning.html#between-models
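For a formal between-model comparison (see the caret link above), the standard `diff()` method for resamples objects gives paired differences in resampled ROC:
## Sketch: paired differences in resampled ROC between models
summary(diff(resamps, metric = "ROC"))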
save(testing, training, fit_list,
  file = file.path("..", "cache", "caret", "training.RData"))
print(sessionInfo(), locale = FALSE)
## R version 3.4.0 (2017-04-21)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Sierra 10.12.4
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] MASS_7.3-47 randomForest_4.6-12
## [3] glmnet_2.0-5 foreach_1.4.3
## [5] Matrix_1.2-9 caret_6.0-76
## [7] lattice_0.20-35 survival_2.41-3
## [9] HuRSTA2a520709.db_1.0.0 org.Hs.eg.db_3.4.1
## [11] AnnotationDbi_1.38.0 IRanges_2.10.0
## [13] S4Vectors_0.14.0 GEOquery_2.42.0
## [15] Biobase_2.36.0 BiocGenerics_0.22.0
## [17] dplyr_0.5.0 purrr_0.2.2
## [19] readr_1.1.0 tidyr_0.6.1
## [21] tibble_1.3.0 ggplot2_2.2.1
## [23] tidyverse_1.1.1
##
## loaded via a namespace (and not attached):
## [1] httr_1.2.1 jsonlite_1.4 splines_3.4.0
## [4] modelr_0.1.0 assertthat_0.2.0 highr_0.6
## [7] cellranger_1.1.0 yaml_2.1.14 RSQLite_1.1-2
## [10] backports_1.0.5 quantreg_5.33 digest_0.6.12
## [13] rvest_0.3.2 minqa_1.2.4 colorspace_1.3-2
## [16] htmltools_0.3.5 plyr_1.8.4 psych_1.7.3.21
## [19] XML_3.98-1.6 broom_0.4.2 SparseM_1.77
## [22] haven_1.0.0 genefilter_1.58.0 xtable_1.8-2
## [25] scales_0.4.1 MatrixModels_0.4-1 lme4_1.1-13
## [28] annotate_1.54.0 mgcv_1.8-17 car_2.1-4
## [31] nnet_7.3-12 lazyeval_0.2.0 pbkrtest_0.4-7
## [34] mnormt_1.5-5 magrittr_1.5 readxl_1.0.0
## [37] memoise_1.1.0 evaluate_0.10 nlme_3.1-131
## [40] forcats_0.2.0 xml2_1.1.1 foreign_0.8-68
## [43] tools_3.4.0 hms_0.3 ProjectTemplate_0.7
## [46] stringr_1.2.0 munsell_0.4.3 compiler_3.4.0
## [49] grid_3.4.0 RCurl_1.95-4.8 nloptr_1.0.4
## [52] iterators_1.0.8 bitops_1.0-6 rmarkdown_1.5
## [55] gtable_0.2.0 ModelMetrics_1.1.0 codetools_0.2-15
## [58] DBI_0.6-1 reshape2_1.4.2 R6_2.2.0
## [61] lubridate_1.6.0 knitr_1.15.1 rprojroot_1.2
## [64] stringi_1.1.5 Rcpp_0.12.10
© 2017 John Lövrot.
This work is licensed under a Creative Commons Attribution 4.0 International License.
The source code is available at github.com/lovrot/reproduce-cunha15canres.
Version 0.0.0.9005