I had some success in classifying the TMRC2 samples by strain via ML and want to try something more difficult. Thus I will use the normalized gene expression data and try classifying it by cure/fail.
All analyses in this document heavily use caret (Kuhn (2008)) and, like the sister TMRC2 document, follow fairly closely the paths laid out in:
https://topepo.github.io/caret/data-splitting.html#simple-splitting-based-on-the-outcome
and
https://github.com/compgenomr/book/blob/master/05-supervisedLearning.Rmd
In the strain classifier I used the normalized variants. Here I plan to use normalized expression, which explicitly limits me to ~ 20k variables (down significantly from the ~1.6M variants used there).
In addition, caret expects the data as (rows == samples) and (columns == variables) where each element is one observation. Thus we will need to transpose the expression matrix.
input_data <- subset_expt(tc_clinical, subset="batch!='biopsy'") %>%
normalize_expt(transform = "log2", convert = "cpm")
## subset_expt(): There were 184, now there are 166 samples.
## transform_counts: Found 932911 values equal to 0, adding 1 to the matrix.
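For reference, a minimal sketch of the orientation caret expects; the real transpose happens further below, after filtering, and this simply assumes the Biobase-style exprs() accessor used throughout:

## caret wants samples as rows and genes as columns, so flip the expression matrix:
texprs_preview <- t(exprs(input_data))
dim(texprs_preview)  ## 166 samples by ~20k genes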
ref_col <- "finaloutcome"
outcome_factor <- as.factor(as.character(pData(input_data)[[ref_col]]))
comparison_n <- 200
## Setting this up in case Najib and Maria Adelaida request a Tumaco set of exploratory analyses.
t_input_data <- subset_expt(tc_clinical, subset="batch!='biopsy'") %>%
normalize_expt(transform = "log2", convert = "cpm")
## subset_expt(): There were 184, now there are 166 samples.
## transform_counts: Found 932911 values equal to 0, adding 1 to the matrix.
t_outcome_factor <- as.factor(as.character(pData(t_input_data)[[ref_col]]))

Given that we have separated the various analyses, it will take me a minute to figure out where I saved the relevant differential expression analysis. I do not actually save the various DE results to rda files by default, opting instead to send them to xlsx files to share. Recall, if you will, that the data I think might be used for the paper also does not go into the default excel directory, but instead mirrors the box organization scheme.
Thus, I think the most relevant file is: “analyses/3_cali_and_tumaco/DE_Cure_vs_Fail/All_Samples/tc_cf_clinical_table_sva-v202207.xlsx”
and
“analyses/4_tumaco/DE_Cure_vs_Fail/All_Samples/t_cf_clinical_table_sva-v202207.xlsx”
Note: that xlsx file contains three primary worksheets (v1cf, v2cf, v3cf), which comprise all samples across cell types for each visit and compare cure and fail. The following block, however, reads the corresponding 'sig' workbook and pulls sheets #3 and #4, which the code treats as the up- and down-significant genes of the cure/fail comparison.
input_xlsx <- glue("analyses/3_cali_and_tumaco/DE_Cure_Fail/Clinical_Samples/tc_clinical_cf_sig_sva-v{ver}.xlsx")
all_de_cf <- openxlsx::readWorkbook(input_xlsx, sheet = 3, startRow = 2)
rownames(all_de_cf) <- all_de_cf[["row.names"]]
all_de_cf[["row.names"]] <- NULL
deseq_de_cf_up <- all_de_cf[, c("deseq_logfc", "deseq_adjp", "deseq_basemean", "deseq_lfcse")]
all_de_cf <- openxlsx::readWorkbook(input_xlsx, sheet = 4, startRow = 2)
rownames(all_de_cf) <- all_de_cf[["row.names"]]
all_de_cf[["row.names"]] <- NULL
deseq_de_cf_down <- all_de_cf[, c("deseq_logfc", "deseq_adjp", "deseq_basemean", "deseq_lfcse")]
deseq_de_cf <- rbind.data.frame(deseq_de_cf_up, deseq_de_cf_down)
tumaco_xlsx <- glue("analyses/4_tumaco/DE_Cure_Fail/t_all_visitcf_sig_sva-v{ver}.xlsx")
t_de_cf <- openxlsx::readWorkbook(tumaco_xlsx, sheet = 3, startRow = 2)
rownames(t_de_cf) <- t_de_cf[["row.names"]]
t_de_cf[["row.names"]] <- NULL
t_de_cf_up <- t_de_cf[, c("deseq_logfc", "deseq_adjp", "deseq_basemean", "deseq_lfcse")]
t_de_cf <- openxlsx::readWorkbook(tumaco_xlsx, sheet = 4, startRow = 2)
rownames(t_de_cf) <- t_de_cf[["row.names"]]
t_de_cf[["row.names"]] <- NULL
t_de_cf_down <- t_de_cf[, c("deseq_logfc", "deseq_adjp", "deseq_basemean", "deseq_lfcse")]
t_de_cf <- rbind.data.frame(t_de_cf_up, t_de_cf_down)
top_bottom_ids <- rownames(t_de_cf)

The following block contains a few lists which will hold comparisons between DESeq2 and the various methods which follow, some parameters, and a function to compare the methods and DESeq.
Presumably both DESeq and the models are responding to variance in the data, for which the logFC values, p-values, mean values, or standard errors are the most readily accessible proxies. So let us pull the top/bottom n genes according to each of those categories and see what happens.
Here is a little function to compare an ML result to the above DE table. It should provide the importance plot, a Venn diagram of the two methods, and the gene sets (with annotations) unique to or shared between this ML method and the DE table. Note: for now this just uses the highest/lowest logFC values and ignores the p-value.
comparison_lfc <- list()
comparison_adjp <- list()
comparison_wgcna <- list()
annotations <- fData(t_monocytes) ## Arbitrarily taken
compare_ml_de <- function(importance, sig_table, annotations, n = comparison_n, plot_n = 15,
according_to = "deseq_logfc", column = "Overall") {
ml_df <- importance[["importance"]] %>%
arrange(desc(!!sym(column)))
importance_plot <- plot(importance, top = plot_n)
top_ml <- head(rownames(ml_df), n = n)
de_df <- sig_table %>%
arrange(desc(deseq_logfc))
top_de <- head(rownames(de_df), n = n)
bottom_de <- tail(rownames(de_df), n = n)
top_bottom_ids <- c(top_de, bottom_de)
comparison <- list("de" = top_bottom_ids, "ml" = top_ml)
comparison_venn <- Vennerable::Venn(comparison)
shared_ids <- comparison_venn@IntersectionSets["11"][[1]]
shared_annot <- annotations[shared_ids, ]
de_ids <- comparison_venn@IntersectionSets["10"][[1]]
ml_ids <- comparison_venn@IntersectionSets["01"][[1]]
only_de <- annotations[de_ids, ]
only_ml <- annotations[ml_ids, ]
comparison_plot <- Vennerable::plot(comparison_venn, doWeights = FALSE, type = "circles")
retlist <- list(
"importance_plot" = importance_plot,
"comparison" = comparison,
"comparison_venn" = comparison_venn,
"comparison_plot" = comparison_plot,
"shared" = shared_annot,
"only_de" = only_de,
"only_ml" = only_ml)
return(retlist)
}

The ML text I am reading provides some neat examples of how one might filter the data to make it more suitable for model creation.
The first filter I was introduced to is quite familiar from our sequencing data: the removal of features with near-zero variance. Indeed, I am pretty certain that normalize_expt() can do this equivalently and significantly faster than caret::preProcess().
nrow(exprs(input_data))
## [1] 19952
system.time({
equivalent <- normalize_expt(input_data, filter = "cv", cv_min = 0.1)
})
## Removing 3619 low-count genes (16333 remaining).
## user system elapsed
## 1.059 0.236 1.295
dim(exprs(equivalent))
## [1] 16333 166
## Given this large amount of data, this step is slow, taking > 10 minutes.
## Yeah seriously, the following three lines get 16,723 genes in 10 minutes while
## the normalize_expt() call above gets 16,749 genes in 2.4 seconds.
#system.time({
# nzv <- preProcess(texprs, method="nzv", uniqueCut=15)
# nzv_texprs <- predict(nzv, texprs)
# dim(nzv_texprs)
#})
nzv_texprs <- t(exprs(equivalent))

I am a bit confused by the following filter; one would think that the nzv filter above, if applied correctly, should give exactly this.
For the moment, I am excluding the following block in order to see how much time/memory keeping these variables costs. If I recall properly, the model of the top-2k variant positions cost ~ 1-4G of memory. I hope that this scales linearly, but I am thinking it might not.
standard_devs <- apply(nzv_texprs, 2, sd)
top_predictors <- order(standard_devs, decreasing = TRUE)[1:3000]
nzv_texprs <- nzv_texprs[, top_predictors]

I think centering may not be needed for this data, but here is how:
nzv_center <- preProcess(nzv_texprs, method = "center")
nzv_texprs <- predict(nzv_center, nzv_texprs)

There are a few metadata factors which might prove of interest for classification; the most obvious are outcome, clinic, donor, visit, and cell type. For the moment I am only likely to focus on outcome. AFAICT I can only include one of these at a time in the data, which is a shame.
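Also, note that the ml_df construction below uses nzv_uncorr, which is never defined above; presumably a correlation filter produced it, as in the combined runs near the end of this document. A minimal sketch of that missing step, assuming caret's preProcess() "corr" method with the same cutoff used later:

## Drop highly correlated predictors so that `nzv_uncorr` exists before ml_df is built.
nzv_corr <- preProcess(nzv_texprs, method = "corr", cutoff = 0.95)
nzv_uncorr <- predict(nzv_corr, nzv_texprs)
dim(nzv_uncorr)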
interesting_meta <- pData(input_data)[, c("finaloutcome", "donor", "persistence",
"visitnumber", "selectionmethod",
"typeofcells", "time")]
ml_df <- as.data.frame(cbind(outcome_factor, as.data.frame(nzv_uncorr)))
ml_df[["outcome_factor"]] <- as.factor(ml_df[["outcome_factor"]])
dim(ml_df)
## [1] 166 965
caret provides nice functionality for splitting up the data. I suspect there are many more fun knobs I can play with for instances where I need to exclude some levels of a factor and such. In this case I just want to split by outcome.
ml_df <- as.data.frame(cbind(outcome_factor, as.data.frame(nzv_uncorr)))
datasets <- create_partitions(nzv_uncorr, interesting_meta,
outcome_factor = outcome_factor)

There are a few likely sampling methods: cross-validation, bootstrapping, and jackknifing. I will try those out later.
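create_partitions() is a local wrapper, so its exact return structure is particular to this codebase; for reference, a plain-caret sketch of an outcome-stratified split (p = 0.4 is an assumption based on the 67-train / 99-test split seen below):

set.seed(2022)
## createDataPartition() stratifies on the outcome factor so cure/fail
## proportions are preserved in the training subset.
train_idx_sketch <- createDataPartition(outcome_factor, p = 0.4, list = FALSE)
train_sketch <- ml_df[train_idx_sketch, ]
test_sketch <- ml_df[-train_idx_sketch, ]
table(train_sketch[["outcome_factor"]])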
My goal from here on is to get the beginnings of a sense of the various methods I can use to create models from the training data and predict the outcome on the test data. I also hope to pick up some idea of what the various arguments mean along the way.
k-nearest neighbors is somewhat similar to a k-means estimate; thus the primary argument is ‘k’.
split <- 1
train_all <- datasets[["trainers"]][[split]]
train_df <- datasets[["trainers_stripped"]][[split]]
train_idx <- datasets[["train_idx"]][[split]]
train_outcomes <- datasets[["trainer_outcomes"]][[split]]
test_df <- datasets[["testers"]][[split]]
test_idx <- datasets[["test_idx"]][[split]]
test_outcomes <- datasets[["tester_outcomes"]][[split]]
knn_fit <- knn3(x = train_df,
y = train_outcomes,
k = 3)
knn_predict_trained <- predict(knn_fit, train_df, type = "prob")
knn_train_evaluated <- self_evaluate_model(knn_predict_trained, datasets,
which = split, type = "train")
## Setting levels: control = cure, case = failure
## Setting direction: controls > cases
knn_train_evaluated
## The summary of the (in)correct calls is:
## Mode FALSE TRUE
## logical 7 60
## The missed samples are:
## [1] "TMRC30178" "TMRC30224" "TMRC30119" "TMRC30037" "TMRC30070" "TMRC30145"
## [7] "TMRC30220"
## The confusion matrix is:
## Confusion Matrix and Statistics
##
## Reference
## Prediction cure failure
## cure 42 2
## failure 5 18
##
## Accuracy : 0.896
## 95% CI : (0.797, 0.957)
## No Information Rate : 0.701
## P-Value [Acc > NIR] : 0.000143
##
## Kappa : 0.761
##
## Mcnemar's Test P-Value : 0.449692
##
## Sensitivity : 0.894
## Specificity : 0.900
## Pos Pred Value : 0.955
## Neg Pred Value : 0.783
## Precision : 0.955
## Recall : 0.894
## F1 : 0.923
## Prevalence : 0.701
## Detection Rate : 0.627
## Detection Prevalence : 0.657
## Balanced Accuracy : 0.897
##
## 'Positive' Class : cure
##
## The ROC AUC is: 0.95800395256917.
As the confusion matrix shows, this misclassified a few samples. Let us change k and see whether it improves.
Here is a table of false positives/negatives for a few values of ‘k’; in this context a false positive is calling a known cure a failure, and a false negative is calling a known failure a cure.
| k | fp | fn |
|---|----|----|
| 2 | 0  | 8  |
| 3 | 5  | 5  |
| 4 | 8  | 9  |
| 5 | 11 | 7  |
| 6 | 15 | 8  |
Note: this depends on the luck of rand(), so the above numbers shift moderately from one run to the next. Thus I think I will just use 2 or 3.
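The loop that generated the table above is not shown; here is a sketch of how it could be produced with the same training objects (the exact loop originally used may differ):

k_sweep <- data.frame(k = 2:6, fp = NA, fn = NA)
for (i in seq_len(nrow(k_sweep))) {
  fit <- knn3(x = train_df, y = train_outcomes, k = k_sweep[i, "k"])
  calls <- predict(fit, train_df, type = "class")
  ## fp: known cures called failure; fn: known failures called cure.
  k_sweep[i, "fp"] <- sum(calls == "failure" & train_outcomes == "cure")
  k_sweep[i, "fn"] <- sum(calls == "cure" & train_outcomes == "failure")
}
k_sweep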
knn_fit2 <- knn3(x = train_df,
y = train_outcomes,
k = 5)
knn_predict_trained2 <- predict(knn_fit2, train_df, type = "prob")
knn_train_evaluated2 <- self_evaluate_model(knn_predict_trained2, datasets,
which = split, type = "train")
## Setting levels: control = cure, case = failure
## Setting direction: controls > cases
knn_train_evaluated2
## The summary of the (in)correct calls is:
## Mode FALSE TRUE
## logical 10 57
## The missed samples are:
## [1] "TMRC30178" "TMRC30224" "TMRC30080" "TMRC30119" "TMRC30103" "TMRC30037"
## [7] "TMRC30070" "TMRC30183" "TMRC30145" "TMRC30220"
## The confusion matrix is:
## Confusion Matrix and Statistics
##
## Reference
## Prediction cure failure
## cure 39 5
## failure 5 18
##
## Accuracy : 0.851
## 95% CI : (0.743, 0.926)
## No Information Rate : 0.657
## P-Value [Acc > NIR] : 0.00032
##
## Kappa : 0.669
##
## Mcnemar's Test P-Value : 1.00000
##
## Sensitivity : 0.886
## Specificity : 0.783
## Pos Pred Value : 0.886
## Neg Pred Value : 0.783
## Precision : 0.886
## Recall : 0.886
## F1 : 0.886
## Prevalence : 0.657
## Detection Rate : 0.582
## Detection Prevalence : 0.657
## Balanced Accuracy : 0.834
##
## 'Positive' Class : cure
##
## The ROC AUC is: 0.919466403162055.
knn_predict_test <- predict(knn_fit, test_df)
knn_test_evaluated <- self_evaluate_model(knn_predict_test, datasets,
which = split, type = "test")
## Setting levels: control = cure, case = failure
## Setting direction: controls < cases
knn_test_evaluated
## The summary of the (in)correct calls is:
## Mode FALSE TRUE
## logical 33 66
## The missed samples are:
## [1] "TMRC30179" "TMRC30221" "TMRC30222" "TMRC30223" "TMRC30071" "TMRC30094"
## [7] "TMRC30169" "TMRC30096" "TMRC30118" "TMRC30121" "TMRC30030" "TMRC30194"
## [13] "TMRC30166" "TMRC30048" "TMRC30054" "TMRC30191" "TMRC30139" "TMRC30157"
## [19] "TMRC30184" "TMRC30129" "TMRC30172" "TMRC30174" "TMRC30142" "TMRC30198"
## [25] "TMRC30200" "TMRC30202" "TMRC30238" "TMRC30074" "TMRC30208" "TMRC30077"
## [31] "TMRC30218" "TMRC30079" "TMRC30265"
## The confusion matrix is:
## Confusion Matrix and Statistics
##
## Reference
## Prediction cure failure
## cure 53 12
## failure 21 13
##
## Accuracy : 0.667
## 95% CI : (0.565, 0.758)
## No Information Rate : 0.747
## P-Value [Acc > NIR] : 0.973
##
## Kappa : 0.211
##
## Mcnemar's Test P-Value : 0.164
##
## Sensitivity : 0.716
## Specificity : 0.520
## Pos Pred Value : 0.815
## Neg Pred Value : 0.382
## Precision : 0.815
## Recall : 0.716
## F1 : 0.763
## Prevalence : 0.747
## Detection Rate : 0.535
## Detection Prevalence : 0.657
## Balanced Accuracy : 0.618
##
## 'Positive' Class : cure
##
## The ROC AUC is: 0.290723981900453.
knn_predict_test2 <- predict(knn_fit2, test_df)
knn_test_evaluated2 <- self_evaluate_model(knn_predict_test2, datasets,
which = split, type = "test")
## Setting levels: control = cure, case = failure
## Setting direction: controls > cases
knn_test_evaluated2
## The summary of the (in)correct calls is:
## Mode FALSE TRUE
## logical 36 63
## The missed samples are:
## [1] "TMRC30179" "TMRC30221" "TMRC30222" "TMRC30223" "TMRC30071" "TMRC30094"
## [7] "TMRC30122" "TMRC30169" "TMRC30096" "TMRC30083" "TMRC30118" "TMRC30121"
## [13] "TMRC30030" "TMRC30194" "TMRC30166" "TMRC30048" "TMRC30054" "TMRC30191"
## [19] "TMRC30139" "TMRC30132" "TMRC30157" "TMRC30184" "TMRC30129" "TMRC30172"
## [25] "TMRC30174" "TMRC30142" "TMRC30198" "TMRC30200" "TMRC30202" "TMRC30238"
## [31] "TMRC30074" "TMRC30208" "TMRC30077" "TMRC30218" "TMRC30079" "TMRC30265"
## The confusion matrix is:
## Confusion Matrix and Statistics
##
## Reference
## Prediction cure failure
## cure 52 13
## failure 23 11
##
## Accuracy : 0.636
## 95% CI : (0.534, 0.731)
## No Information Rate : 0.758
## P-Value [Acc > NIR] : 0.998
##
## Kappa : 0.133
##
## Mcnemar's Test P-Value : 0.134
##
## Sensitivity : 0.693
## Specificity : 0.458
## Pos Pred Value : 0.800
## Neg Pred Value : 0.324
## Precision : 0.800
## Recall : 0.693
## F1 : 0.743
## Prevalence : 0.758
## Detection Rate : 0.525
## Detection Prevalence : 0.657
## Balanced Accuracy : 0.576
##
## 'Positive' Class : cure
##
## The ROC AUC is: 0.655429864253394.
The cross-validation method of repeatedly resampling the data is handled entirely within the train() function. With that in mind, here it is operating with the knn method.
When train() is called with trControl and tuneGrid, we can control how the knn training is repeated; in this case it will iterate over k from 1 to 10.
This currently fails due to a stack overflow…
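One workaround worth trying (a sketch, not run here): train() also accepts an x/y interface, which skips building the enormous model formula and may avoid the overflow.

cv_control <- trainControl(method = "cv", number = 10)
## x/y interface: no formula expansion over thousands of gene columns.
knn_cv_fit <- train(x = train_df, y = train_outcomes,
                    method = "knn", trControl = cv_control,
                    tuneGrid = data.frame(k = 1:10))
knn_cv_fit[["bestTune"]]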
cv_control <- trainControl(method = "cv", number = 10)
knn_train_fit <- train(outcome_factor ~ ., data = train_df,
method = "knn",
trControl = cv_control,
tuneGrid = data.frame(k = 1:10))
knn_train_fit[["bestTune"]]
plot(x = 1:10, 1 - knn_train_fit$results[, 2], pch = 19,
ylab = "prediction error", xlab = "k")
lines(loess.smooth(x = 1:10, 1 - knn_train_fit$results[, 2],degree = 2),
col = "#CC0000")

boot_control <- trainControl(method = "boot", number = 20,
returnResamp = "all")
knn_train_fit <- train(outcome ~ ., data = train_all,
method = "knn",
trControl = boot_control,
tuneGrid = data.frame(k = 1:10))
knn_train_fit[["bestTune"]]
## k
## 1 1
plot(x = 1:10, 1 - knn_train_fit$results[, 2], pch = 19,
ylab = "prediction error", xlab = "k")
lines(loess.smooth(x = 1:10, 1 - knn_train_fit$results[, 2],degree = 2),
col = "#CC0000")

In this instance we will search for genes which were important for the model’s creation.
The DALEX package provides a function, feature_importance(), which uses a series of other methods to extract the features (in this case, genes) which do a good job of explaining the result produced by the model. For a dataset like this one, with thousands of features, this does not appear to end well.
explainer_knn <- DALEX::explain(knn_fit, label = "knn",
data = train_df,
y = as.numeric(train_outcomes))
## AFAICT the following will take forever unless we drastically reduce the complexity of the model.
## yeah, I let it run for a week.
## features <- feature_importance(explainer_knn, n_sample = 50, type = "difference")

Conversely, we can use the varImp() function…
knn_variable_importance <- varImp(knn_train_fit)
plot(knn_variable_importance, top = 15)
knn_variables <- knn_variable_importance[["importance"]] %>%
arrange(desc(cure))

Given that, we can ask for the similarity to the DESeq results…

knn_comparison <- compare_ml_de(knn_variable_importance, deseq_de_cf, annotations,
n = comparison_n, plot_n = 15, column = "cure")
knn_comparison[["comparison_plot"]]
comparison_lfc[["knn"]] <- knn_comparison[["shared"]]

I think R/caret provides a few implementations of the forest family of methods; this one uses ranger (Wright and Ziegler (2017)).
The parameter ‘mtry’ is often important; if I read the text correctly, it controls how many variables to sample at each split of the tree. Higher numbers should presumably make the model more specific, at the risk of overfitting.
Setting min.node.size sets the minimum size of terminal nodes in each tree; each increment up speeds the algorithm.
I am going to use my boot control trainer from above and see how it goes.
rf_train_fit <- train(outcome ~ ., data = train_all,
method = "ranger", trControl = boot_control,
importance = "permutation",
tuneGrid = data.frame(
mtry = 200,
min.node.size = 1,
splitrule = "gini"),
verbose = TRUE)
rf_train_fit[["finalModel"]][["prediction.error"]]
## [1] 0.1791
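The fit above pins mtry at 200 and min.node.size at 1; to let train() actually sweep them, one could hand tuneGrid a grid rather than a single row. A sketch with arbitrary values, not run here:

rf_grid <- expand.grid(
  mtry = c(50, 200, 500),      ## variables sampled at each split
  min.node.size = c(1, 5),     ## minimum terminal node size
  splitrule = "gini")
rf_sweep <- train(outcome ~ ., data = train_all,
                  method = "ranger", trControl = boot_control,
                  importance = "permutation", tuneGrid = rf_grid,
                  verbose = TRUE)
rf_sweep[["bestTune"]]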
rf_variable_importance <- varImp(rf_train_fit)
plot(rf_variable_importance, top = 15)
rf_variables <- rf_variable_importance[["importance"]] %>%
arrange(desc(Overall))
rf_predict_trained <- predict(rf_train_fit, train_df)
rf_predict_evaluated <- self_evaluate_model(rf_predict_trained, datasets,
which = split, type = "train")
## Setting levels: control = cure, case = failure
## Setting direction: controls < cases
rf_predict_evaluated
## The summary of the (in)correct calls is:
## Mode TRUE
## logical 67
## The missed samples are:
## character(0)
## The confusion matrix is:
## Confusion Matrix and Statistics
##
## Reference
## Prediction cure failure
## cure 44 0
## failure 0 23
##
## Accuracy : 1
## 95% CI : (0.946, 1)
## No Information Rate : 0.657
## P-Value [Acc > NIR] : 5.81e-13
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.000
## Specificity : 1.000
## Pos Pred Value : 1.000
## Neg Pred Value : 1.000
## Precision : 1.000
## Recall : 1.000
## F1 : 1.000
## Prevalence : 0.657
## Detection Rate : 0.657
## Detection Prevalence : 0.657
## Balanced Accuracy : 1.000
##
## 'Positive' Class : cure
##
## The ROC AUC is: 1.
A couple of months ago I spent a little time attempting to recapitulate Alejandro’s WGCNA results. I did so mostly by copy/pasting his work, adding some commentary, and tweaking parts so that it was easier for me to read/understand. In the process, I generated a series of modules which looked similar/identical to his. Unfortunately, I did not add sections to record the genes/modules to output files. I am therefore going back to do that now, in the hopes that I can compare those modules to the results produced by the classifiers.
It seems that the interest in this comparison has waned, so I am just going to disable it.
wgcna_result <- openxlsx::readWorkbook(glue("excel/wgcna_interesting_genes-v{ver}.xlsx"))
rownames(wgcna_result) <- wgcna_result[["row.names"]]
wgcna_result[["row.names"]] <- NULL
top_ml <- rownames(head(rf_variables, n = comparison_n))
comparison <- list("wgcna" = rownames(wgcna_result), "ml" = top_ml)
comparison_venn <- Vennerable::Venn(comparison)
comparison_wgcna[["rf"]] <- annotations[comparison_venn@IntersectionSets["11"][[1]], ]
Vennerable::plot(comparison_venn, doWeights = FALSE, type = "circles")

Let us take a moment and see if the top-n genes returned by varImp() have some meaning which jumps out. One might assume, given our extant Differential Expression results, that the interleukin response will be a likely candidate.
importance_gp <- simple_gprofiler(rownames(head(rf_variables, n = comparison_n)))
importance_gp
## A set of ontologies produced by gprofiler using 200
## genes against the hsapiens annotations and significance cutoff 0.05.
## There are:
## 3 MF
## 53 BP
## 0 KEGG
## 0 REAC
## 1 WP
## 3 TF
## 1 MIRNA
## 0 HPA
## 0 CORUM
## 0 HP hits.
rf_predict_test <- predict(rf_train_fit, test_df)
rf_predict_test_evaluated <- self_evaluate_model(rf_predict_test, datasets,
which = split, type = "test")
## Setting levels: control = cure, case = failure
## Setting direction: controls < cases
rf_predict_test_evaluated
## The summary of the (in)correct calls is:
## Mode FALSE TRUE
## logical 30 69
## The missed samples are:
## [1] "TMRC30179" "TMRC30221" "TMRC30222" "TMRC30223" "TMRC30176" "TMRC30071"
## [7] "TMRC30094" "TMRC30122" "TMRC30096" "TMRC30083" "TMRC30118" "TMRC30121"
## [13] "TMRC30194" "TMRC30048" "TMRC30054" "TMRC30139" "TMRC30157" "TMRC30184"
## [19] "TMRC30129" "TMRC30142" "TMRC30198" "TMRC30200" "TMRC30202" "TMRC30204"
## [25] "TMRC30238" "TMRC30074" "TMRC30208" "TMRC30077" "TMRC30218" "TMRC30079"
## The confusion matrix is:
## Confusion Matrix and Statistics
##
## Reference
## Prediction cure failure
## cure 58 7
## failure 23 11
##
## Accuracy : 0.697
## 95% CI : (0.596, 0.785)
## No Information Rate : 0.818
## P-Value [Acc > NIR] : 0.99888
##
## Kappa : 0.243
##
## Mcnemar's Test P-Value : 0.00617
##
## Sensitivity : 0.716
## Specificity : 0.611
## Pos Pred Value : 0.892
## Neg Pred Value : 0.324
## Precision : 0.892
## Recall : 0.716
## F1 : 0.795
## Prevalence : 0.818
## Detection Rate : 0.586
## Detection Prevalence : 0.657
## Balanced Accuracy : 0.664
##
## 'Positive' Class : cure
##
## The ROC AUC is: 0.607918552036199.
Logistic regression is a statistical method for binary responses, though it can handle multiple classes as well. The general idea is to find parameters which maximize the likelihood that the observed data were sampled from the statistical distribution of interest. The transformations and linear-regression-like steps involved are confusing, but once they are done, the task becomes setting the model’s (fitting) parameters to values which make the statistical model look as much like the actual training data as possible, so that new samples return similar values. If the distribution to fit is Gaussian, we want to transform/normalize the mean/variance of our variables so that they resemble the normal distribution we are using. Logistic regression instead uses a binomial distribution (like our raw sequencing data!) whose predicted outcome is bounded between 0 and 1.
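For concreteness, the binomial GLM fit below models the log-odds of the outcome as a linear function of expression; this is the standard logistic formulation rather than anything specific to this dataset:

## logit link used by glm/train with family = "binomial":
##   log(p / (1 - p)) = b0 + b1 * expression
## so the predicted probability is the sigmoid of the linear predictor:
sigmoid <- function(eta) 1 / (1 + exp(-eta))
sigmoid(0)  ## 0.5, i.e. expression sitting on the decision boundary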
Let us take the most important gene observed in one of our previous training sets: ENSG00000248405 PRR5-ARHGAP8
gene_id <- "ENSG00000248405"
single_fit <- train(
outcome ~ ENSG00000248405, data = train_all,
method = "glm", family = "binomial", trControl = trainControl("none"))
tt <- data.frame("ENSG00000248405" = seq(min(train_df[[gene_id]]),
max(train_df[[gene_id]]), len = 100))
## predict probabilities for the simulated data
tt$subtype = predict(single_fit, newdata = tt, type="prob")[, 1]
## plot the sigmoid curve and the training data
plot(ifelse(outcome == "cure", 1, 0) ~ ENSG00000248405,
data = train_all, col = "red4",
ylab = "CF as 0 or 1", xlab = "favorite gene expression")
lines(subtype ~ ENSG00000248405, tt, col = "green4", lwd = 2)
plot_df <- train_all[, c("outcome", "ENSG00000248405")]
ggbetweenstats(plot_df, "outcome", "ENSG00000248405")

Having tried with 1 gene, let us extend this to all genes. In my first try of this, it took a long time.
glm_train_fit <- train(outcome ~ ., data = train_all,
trControl = boot_control,
method = "glm", family = "binomial")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases
## (the warning above was repeated 20 times, once per bootstrap resample)
The previous block does not take our parameter sweep into account, so it is not theoretically very useful for this series of methodological tests. As a result, the following block is also not particularly helpful – though I suspect the results are identical to what follows.
glm_variable_importance <- varImp(glm_train_fit)
## Oh, this only produces 100 entries -- so me getting the top 400 is silly.
glm_variables <- glm_variable_importance[["importance"]] %>%
arrange(desc(Overall))
tt = plot(glm_variable_importance, top = 15)
simple_gprofiler(rownames(head(glm_variables, n = comparison_n)))
## A set of ontologies produced by gprofiler using 66
## genes against the hsapiens annotations and significance cutoff 0.05.
## There are:
## 3 MF
## 163 BP
## 3 KEGG
## 8 REAC
## 8 WP
## 2 TF
## 0 MIRNA
## 4 HPA
## 0 CORUM
## 0 HP hits.
top_glm <- rownames(glm_variables)
comparison <- list("de" = top_bottom_ids, "ml" = top_glm)
comparison_venn <- Vennerable::Venn(comparison)
comparison_lfc[["glm"]] <- fData(c_monocytes)[comparison_venn@IntersectionSets["11"][[1]], ]
Vennerable::plot(comparison_venn, doWeights = FALSE, type = "circles")

I am dropping the adjp and wgcna comparisons; they show little overlap.
comparison <- list("de" = lowest_adjp, "ml" = top_glm)
comparison_venn <- Vennerable::Venn(comparison)
comparison_adjp[["glm"]] <- fData(c_monocytes)[comparison_venn@IntersectionSets["11"][[1]], ]
Vennerable::plot(comparison_venn, doWeights = FALSE, type = "circles")
comparison <- list("de" = highest_exprs, "ml" = top_glm)
comparison_venn <- Vennerable::Venn(comparison)
Vennerable::plot(comparison_venn, doWeights = FALSE, type = "circles")

In this block, I repeat the above with the tuneGrid option set so that the training performs a sweep of putatively helpful parameters; as a result it should provide a more appropriate (but quite possibly identical) trainer.
##rf_method <- trainControl(method = "ranger", number = 10, verbose = TRUE)
## train_method <- trainControl(method = "cv", number = 10)
glm_fit <- train(outcome ~ ., data = train_all, method = "glmnet",
trControl = boot_control, importance = "permutation",
tuneGrid = data.frame(
alpha = 0.5,
lambda = seq(0.1, 0.7, 0.05)),
verbose = TRUE)
glm_fit
## glmnet
##
## 67 samples
## 964 predictors
## 2 classes: 'cure', 'failure'
##
## No pre-processing
## Resampling: Bootstrapped (20 reps)
## Summary of sample sizes: 67, 67, 67, 67, 67, 67, ...
## Resampling results across tuning parameters:
##
## lambda Accuracy Kappa
## 0.10 0.7661 0.44993
## 0.15 0.7394 0.38191
## 0.20 0.7270 0.33173
## 0.25 0.7290 0.32021
## 0.30 0.7286 0.28591
## 0.35 0.7127 0.22504
## 0.40 0.7053 0.16911
## 0.45 0.6780 0.02789
## 0.50 0.6759 0.00000
## 0.55 0.6759 0.00000
## 0.60 0.6759 0.00000
## 0.65 0.6759 0.00000
## 0.70 0.6759 0.00000
##
## Tuning parameter 'alpha' was held constant at a value of 0.5
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were alpha = 0.5 and lambda = 0.1.
glm_predict_trained <- predict(glm_fit, train_df)
glm_train_eval <- self_evaluate_model(glm_predict_trained, datasets,
which = split, type = "train")
## Setting levels: control = cure, case = failure
## Setting direction: controls < cases
glm_train_eval
## The summary of the (in)correct calls is:
## Mode FALSE TRUE
## logical 3 64
## The missed samples are:
## [1] "TMRC30224" "TMRC30145" "TMRC30220"
## The confusion matrix is:
## Confusion Matrix and Statistics
##
## Reference
## Prediction cure failure
## cure 43 1
## failure 2 21
##
## Accuracy : 0.955
## 95% CI : (0.875, 0.991)
## No Information Rate : 0.672
## P-Value [Acc > NIR] : 1.61e-08
##
## Kappa : 0.9
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.956
## Specificity : 0.955
## Pos Pred Value : 0.977
## Neg Pred Value : 0.913
## Precision : 0.977
## Recall : 0.956
## F1 : 0.966
## Prevalence : 0.672
## Detection Rate : 0.642
## Detection Prevalence : 0.657
## Balanced Accuracy : 0.955
##
## 'Positive' Class : cure
##
## The ROC AUC is: 0.945158102766798.
glm_predict_test <- predict(glm_fit, test_df)
glm_fit_eval_test <- self_evaluate_model(glm_predict_test, datasets,
which = split, type = "test")
## Setting levels: control = cure, case = failure
## Setting direction: controls < cases
glm_fit_eval_test
## The summary of the (in)correct calls is:
## Mode FALSE TRUE
## logical 18 81
## The missed samples are:
## [1] "TMRC30179" "TMRC30222" "TMRC30094" "TMRC30122" "TMRC30083" "TMRC30118"
## [7] "TMRC30121" "TMRC30041" "TMRC30171" "TMRC30139" "TMRC30142" "TMRC30200"
## [13] "TMRC30202" "TMRC30204" "TMRC30238" "TMRC30208" "TMRC30218" "TMRC30265"
## The confusion matrix is:
## Confusion Matrix and Statistics
##
## Reference
## Prediction cure failure
## cure 61 4
## failure 14 20
##
## Accuracy : 0.818
## 95% CI : (0.728, 0.889)
## No Information Rate : 0.758
## P-Value [Acc > NIR] : 0.0958
##
## Kappa : 0.566
##
## Mcnemar's Test P-Value : 0.0339
##
## Sensitivity : 0.813
## Specificity : 0.833
## Pos Pred Value : 0.938
## Neg Pred Value : 0.588
## Precision : 0.938
## Recall : 0.813
## F1 : 0.871
## Prevalence : 0.758
## Detection Rate : 0.616
## Detection Prevalence : 0.657
## Balanced Accuracy : 0.823
##
## 'Positive' Class : cure
##
## The ROC AUC is: 0.763348416289593.
In the following block I extract the top and bottom DE genes vs. the top set of genes in the glm_variables ordered by the ‘Overall’ column.
I should make this a function; there is no way this will be kept correct across multiple iterations of the ml/wgcna/etc comparisons.
I am going to stop trying to compare these results across WGCNA/adjp/expression, those comparisons are all basically null.
glm_variable_importance <- varImp(glm_fit)
glm_comparison <- compare_ml_de(glm_variable_importance, deseq_de_cf, annotations,
n = comparison_n, plot_n = 15)
comparison_lfc[["glm"]] <- glm_comparison[["shared"]]

At this point I basically have a feeling for how one may use tuneGrid to apply a parameter sweep along with the various (in this case cv) sampling methods. Thus, the following is basically identical to the previous blocks, except that it uses xgbTree (caret's interface to xgboost, extreme gradient boosting; Chen and Guestrin (2016)).
##rf_method <- trainControl(method = "ranger", number = 10, verbose = TRUE)
train_method <- trainControl(method = "cv", number = 10)
gb_fit <- train(outcome ~ ., data = train_all,
method = "xgbTree", trControl = train_method,
tuneGrid = data.frame(
nrounds = 200,
eta = c(0.05, 0.1, 0.3),
max_depth = 4,
gamma = 0,
colsample_bytree = 1,
subsample = 0.5,
min_child_weight = 1),
verbose = TRUE)
gb_predict_trained <- predict(gb_fit, train_df)
gb_predict_trained
## [1] failure failure cure cure cure cure cure cure cure
## [10] cure cure cure cure cure cure cure cure cure
## [19] cure cure cure cure failure cure failure failure cure
## [28] failure cure cure cure cure failure cure cure cure
## [37] cure cure failure failure failure failure failure cure cure
## [46] cure cure cure failure failure cure failure failure cure
## [55] cure cure cure failure cure cure failure failure failure
## [64] failure failure failure cure
## Levels: cure failure
gb_train_eval <- self_evaluate_model(gb_predict_trained, datasets,
which = split, type = "train")
## Setting levels: control = cure, case = failure
## Setting direction: controls < cases
gb_train_eval
## The summary of the (in)correct calls is:
## Mode TRUE
## logical 67
## The missed samples are:
## character(0)
## The confusion matrix is:
## Confusion Matrix and Statistics
##
## Reference
## Prediction cure failure
## cure 44 0
## failure 0 23
##
## Accuracy : 1
## 95% CI : (0.946, 1)
## No Information Rate : 0.657
## P-Value [Acc > NIR] : 5.81e-13
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.000
## Specificity : 1.000
## Pos Pred Value : 1.000
## Neg Pred Value : 1.000
## Precision : 1.000
## Recall : 1.000
## F1 : 1.000
## Prevalence : 0.657
## Detection Rate : 0.657
## Detection Prevalence : 0.657
## Balanced Accuracy : 1.000
##
## 'Positive' Class : cure
##
## The ROC AUC is: 1.
gb_predict_test <- predict(gb_fit, test_df)
gb_predict_test_evaluated <- self_evaluate_model(gb_predict_test, datasets,
which = split, type = "test")
## Setting levels: control = cure, case = failure
## Setting direction: controls < cases
gb_predict_test_evaluated
## The summary of the (in)correct calls is:
## Mode FALSE TRUE
## logical 18 81
## The missed samples are:
## [1] "TMRC30179" "TMRC30221" "TMRC30222" "TMRC30223" "TMRC30176" "TMRC30094"
## [7] "TMRC30122" "TMRC30083" "TMRC30121" "TMRC30048" "TMRC30054" "TMRC30139"
## [13] "TMRC30198" "TMRC30200" "TMRC30202" "TMRC30204" "TMRC30238" "TMRC30265"
## The confusion matrix is:
## Confusion Matrix and Statistics
##
## Reference
## Prediction cure failure
## cure 63 2
## failure 16 18
##
## Accuracy : 0.818
## 95% CI : (0.728, 0.889)
## No Information Rate : 0.798
## P-Value [Acc > NIR] : 0.36186
##
## Kappa : 0.553
##
## Mcnemar's Test P-Value : 0.00218
##
## Sensitivity : 0.797
## Specificity : 0.900
## Pos Pred Value : 0.969
## Neg Pred Value : 0.529
## Precision : 0.969
## Recall : 0.797
## F1 : 0.875
## Prevalence : 0.798
## Detection Rate : 0.636
## Detection Prevalence : 0.657
## Balanced Accuracy : 0.849
##
## 'Positive' Class : cure
##
## The ROC AUC is: 0.749321266968326.
gb_variable_importance <- varImp(gb_fit)
gb_variables <- gb_variable_importance[["importance"]] %>%
arrange(desc(Overall))
plot(gb_variable_importance, top = 15)
gb_comparison <- compare_ml_de(gb_variable_importance, deseq_de_cf, annotations, n = comparison_n, plot_n = 15)
comparison_lfc[["gb"]] <- gb_comparison[["shared"]]

Doing all of the above for each method and collecting the results is super fiddly, so I wrote a little function to try to make that easier.
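classify_n_times() and write_classifier_summary() live in my helper code and are not reproduced here; purely as an illustration of the shape of the repeated split/train/evaluate loop, here is a hypothetical skeleton (the real function almost certainly differs):

classify_n_times_sketch <- function(texprs, meta, outcome_column = "finaloutcome",
                                    method = "knn", sampler = "cv",
                                    iterations = 10, p = 0.4) {
  outcomes <- as.factor(as.character(meta[[outcome_column]]))
  ctrl <- trainControl(method = sampler, number = 10)
  results <- list()
  for (i in seq_len(iterations)) {
    ## New stratified split each iteration, then train and score the held-out samples.
    idx <- createDataPartition(outcomes, p = p, list = FALSE)
    fit <- train(x = texprs[idx, ], y = outcomes[idx],
                 method = method, trControl = ctrl)
    preds <- predict(fit, texprs[-idx, ])
    results[[i]] <- confusionMatrix(preds, outcomes[-idx])
  }
  results
}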
WARNING: Turning on the following block takes ~ 2 days on our pretty nice server. Your mileage may vary.
There are four versions of the data that Maria Adelaida and Najib are interested in: both clinics with all visits, both clinics with only visit 1, Tumaco alone with all visits, and Tumaco alone with only visit 1.
Ergo, let us set up an appropriate naming convention to make these iterative runs more coherent: I will continue using ‘tc’ and ‘t’ as prefixes for the two-clinic and Tumaco-only data, followed by ‘vall’ vs. ‘v1’.
tc_vall_input <- subset_expt(tc_clinical, subset = "batch!='biopsy'")
## subset_expt(): There were 184, now there are 166 samples.
t_vall_input <- subset_expt(tc_vall_input, subset = "clinic=='tumaco'")
## subset_expt(): There were 166, now there are 109 samples.
tc_v1_input <- subset_expt(tc_vall_input, subset = "visitbipart=='v1'")
## subset_expt(): There were 166, now there are 65 samples.
t_v1_input <- subset_expt(t_vall_input, subset = "visitbipart=='v1'")
## subset_expt(): There were 109, now there are 40 samples.
tc_vall_input <- normalize_expt(tc_vall_input, transform = "log2", convert = "cpm")
## transform_counts: Found 932911 values equal to 0, adding 1 to the matrix.
t_vall_input <- normalize_expt(t_vall_input, transform = "log2", convert = "cpm")
## transform_counts: Found 587592 values equal to 0, adding 1 to the matrix.
tc_v1_input <- normalize_expt(tc_v1_input, transform = "log2", convert = "cpm")
## transform_counts: Found 369543 values equal to 0, adding 1 to the matrix.
t_v1_input <- normalize_expt(t_v1_input, transform = "log2", convert = "cpm")
## transform_counts: Found 216947 values equal to 0, adding 1 to the matrix.
tc_vall_outcome_factor <- as.factor(as.character(pData(tc_vall_input)[[ref_col]]))
t_vall_outcome_factor <- as.factor(as.character(pData(t_vall_input)[[ref_col]]))
tc_v1_outcome_factor <- as.factor(as.character(pData(tc_v1_input)[[ref_col]]))
t_v1_outcome_factor <- as.factor(as.character(pData(t_v1_input)[[ref_col]]))

In the following block I will use the same normalizations and filters as above on each of the four inputs.
tc_vall_norm <- normalize_expt(tc_vall_input, filter = "cv", cv_min = 0.1)
## Removing 3619 low-count genes (16333 remaining).
tc_v1_norm <- normalize_expt(tc_v1_input, filter = "cv", cv_min = 0.1)
## Removing 3968 low-count genes (15984 remaining).
t_vall_norm <- normalize_expt(t_vall_input, filter = "cv", cv_min = 0.1)
## Removing 3785 low-count genes (16167 remaining).
t_v1_norm <- normalize_expt(t_v1_input, filter = "cv", cv_min = 0.1)
## Removing 4109 low-count genes (15843 remaining).
tc_vall_texprs <- t(exprs(tc_vall_norm))
tc_v1_texprs <- t(exprs(tc_v1_norm))
t_vall_texprs <- t(exprs(t_vall_norm))
t_v1_texprs <- t(exprs(t_v1_norm))
sd_genes <- 3000
tc_vall_stdev <- apply(tc_vall_texprs, 2, sd)
tc_vall_pred <- order(tc_vall_stdev, decreasing = TRUE)[1:sd_genes]
tc_vall_texprs <- tc_vall_texprs[, tc_vall_pred]
tc_v1_stdev <- apply(tc_v1_texprs, 2, sd)
tc_v1_pred <- order(tc_v1_stdev, decreasing = TRUE)[1:sd_genes]
tc_v1_texprs <- tc_v1_texprs[, tc_v1_pred]
t_vall_stdev <- apply(t_vall_texprs, 2, sd)
t_vall_pred <- order(t_vall_stdev, decreasing = TRUE)[1:sd_genes]
t_vall_texprs <- t_vall_texprs[, t_vall_pred]
t_v1_stdev <- apply(t_v1_texprs, 2, sd)
t_v1_pred <- order(t_v1_stdev, decreasing = TRUE)[1:sd_genes]
t_v1_texprs <- t_v1_texprs[, t_v1_pred]
tc_vall_preproc <- preProcess(tc_vall_texprs, method = "center")
tc_vall_texprs <- predict(tc_vall_preproc, tc_vall_texprs)
tc_v1_preproc <- preProcess(tc_v1_texprs, method = "center")
tc_v1_texprs <- predict(tc_v1_preproc, tc_v1_texprs)
t_vall_preproc <- preProcess(t_vall_texprs, method = "center")
t_vall_texprs <- predict(t_vall_preproc, t_vall_texprs)
t_v1_preproc <- preProcess(t_v1_texprs, method = "center")
t_v1_texprs <- predict(t_v1_preproc, t_v1_texprs)
tc_vall_preproc <- preProcess(tc_vall_texprs, method = "corr", cutoff = 0.95)
tc_vall_texprs <- predict(tc_vall_preproc, tc_vall_texprs)
tc_v1_preproc <- preProcess(tc_v1_texprs, method = "corr", cutoff = 0.95)
tc_v1_texprs <- predict(tc_v1_preproc, tc_v1_texprs)
t_vall_preproc <- preProcess(t_vall_texprs, method = "corr", cutoff = 0.95)
t_vall_texprs <- predict(t_vall_preproc, t_vall_texprs)
t_v1_preproc <- preProcess(t_v1_texprs, method = "corr", cutoff = 0.95)
t_v1_texprs <- predict(t_v1_preproc, t_v1_texprs)
chosen_factors <- c("finaloutcome", "donor", "persistence", "visitnumber",
"selectionmethod", "typeofcells", "time")
tc_vall_meta <- pData(tc_vall_norm)[, chosen_factors]
tc_v1_meta <- pData(tc_v1_norm)[, chosen_factors]
t_vall_meta <- pData(t_vall_norm)[, chosen_factors]
t_v1_meta <- pData(t_v1_norm)[, chosen_factors]
ref_col <- "finaloutcome"
tc_vall_outcome <- as.factor(as.character(pData(tc_vall_norm)[[ref_col]]))
tc_v1_outcome <- as.factor(as.character(pData(tc_v1_norm)[[ref_col]]))
t_vall_outcome <- as.factor(as.character(pData(t_vall_norm)[[ref_col]]))
t_v1_outcome <- as.factor(as.character(pData(t_v1_norm)[[ref_col]]))
tc_vall_ml_df <- cbind.data.frame(tc_vall_outcome, as.data.frame(tc_vall_texprs))
tc_vall_ml_df[[ref_col]] <- as.factor(tc_vall_ml_df[["tc_vall_outcome"]])
tc_v1_ml_df <- cbind.data.frame(tc_v1_outcome, as.data.frame(tc_v1_texprs))
tc_v1_ml_df[[ref_col]] <- as.factor(tc_v1_ml_df[["tc_v1_outcome"]])
t_vall_ml_df <- cbind.data.frame(t_vall_outcome, as.data.frame(t_vall_texprs))
t_vall_ml_df[[ref_col]] <- as.factor(t_vall_ml_df[["t_vall_outcome"]])
t_v1_ml_df <- cbind.data.frame(t_v1_outcome, as.data.frame(t_v1_texprs))
t_v1_ml_df[[ref_col]] <- as.factor(t_v1_ml_df[["t_v1_outcome"]])

tc_vall_summary_xlsx <- glue("excel/tc_vall_ml_summary-v{ver}.xlsx")
tc_vall_knn <- classify_n_times(tc_vall_texprs, tc_vall_meta,
outcome_column = ref_col,
method = "knn", sampler = "cv")
written <- write_classifier_summary(tc_vall_knn, excel = tc_vall_summary_xlsx)
tc_vall_gb <- classify_n_times(tc_vall_texprs, tc_vall_meta,
outcome_column = ref_col,
method = "xgbTree", sampler = "cv")
written <- write_classifier_summary(tc_vall_gb, excel = written[["wb"]])
tc_vall_glm <- classify_n_times(tc_vall_texprs, tc_vall_meta,
outcome_column = ref_col,
method = "glmnet", sampler = "cv")
written <- write_classifier_summary(tc_vall_glm, excel = written[["wb"]])
tc_vall_rf <- classify_n_times(tc_vall_texprs, tc_vall_meta,
outcome_column = ref_col,
method = "ranger", sampler = "cv")
written <- write_classifier_summary(tc_vall_rf, excel = written[["wb"]])
openxlsx::saveWorkbook(written[["wb"]], file = tc_vall_summary_xlsx)

tc_v1_summary_xlsx <- glue("excel/tc_v1_ml_summary-v{ver}.xlsx")
tc_v1_knn <- classify_n_times(tc_v1_texprs, tc_v1_meta,
outcome_column = ref_col,
method = "knn", sampler = "cv")
written <- write_classifier_summary(tc_v1_knn, excel = tc_v1_summary_xlsx)
tc_v1_gb <- classify_n_times(tc_v1_texprs, tc_v1_meta,
outcome_column = ref_col,
method = "xgbTree", sampler = "cv")
written <- write_classifier_summary(tc_v1_gb, excel = written[["wb"]])
tc_v1_glm <- classify_n_times(tc_v1_texprs, tc_v1_meta,
outcome_column = ref_col,
method = "glmnet", sampler = "cv")
written <- write_classifier_summary(tc_v1_glm, excel = written[["wb"]])
tc_v1_rf <- classify_n_times(tc_v1_texprs, tc_v1_meta,
outcome_column = ref_col,
method = "ranger", sampler = "cv")
written <- write_classifier_summary(tc_v1_rf, excel = written[["wb"]])
openxlsx::saveWorkbook(written[["wb"]], file = tc_v1_summary_xlsx)

t_vall_summary_xlsx <- glue("excel/t_vall_ml_summary-v{ver}.xlsx")
t_vall_knn <- classify_n_times(t_vall_texprs, t_vall_meta,
outcome_column = ref_col,
method = "knn", sampler = "cv")
written <- write_classifier_summary(t_vall_knn, excel = t_vall_summary_xlsx)
t_vall_gb <- classify_n_times(t_vall_texprs, t_vall_meta,
outcome_column = ref_col,
method = "xgbTree", sampler = "cv")
written <- write_classifier_summary(t_vall_gb, excel = written[["wb"]])
t_vall_glm <- classify_n_times(t_vall_texprs, t_vall_meta,
outcome_column = ref_col,
method = "glmnet", sampler = "cv")
written <- write_classifier_summary(t_vall_glm, excel = written[["wb"]])
t_vall_rf <- classify_n_times(t_vall_texprs, t_vall_meta,
outcome_column = ref_col,
method = "ranger", sampler = "cv")
written <- write_classifier_summary(t_vall_rf, excel = written[["wb"]])
openxlsx::saveWorkbook(written[["wb"]], file = t_vall_summary_xlsx)

t_v1_summary_xlsx <- glue("excel/t_v1_ml_summary-v{ver}.xlsx")
t_v1_knn <- classify_n_times(t_v1_texprs, t_v1_meta,
outcome_column = ref_col,
method = "knn", sampler = "cv")
written <- write_classifier_summary(t_v1_knn, excel = t_v1_summary_xlsx)
t_v1_gb <- classify_n_times(t_v1_texprs, t_v1_meta,
outcome_column = ref_col,
method = "xgbTree", sampler = "cv")
written <- write_classifier_summary(t_v1_gb, excel = written[["wb"]])
t_v1_glm <- classify_n_times(t_v1_texprs, t_v1_meta,
outcome_column = ref_col,
method = "glmnet", sampler = "cv")
written <- write_classifier_summary(t_v1_glm, excel = written[["wb"]])
t_v1_rf <- classify_n_times(t_v1_texprs, t_v1_meta,
outcome_column = ref_col,
method = "ranger", sampler = "cv")
written <- write_classifier_summary(t_v1_rf, excel = written[["wb"]])
openxlsx::saveWorkbook(written[["wb"]], file = t_v1_summary_xlsx)