I am using Alejandro’s document to get a feeling for how WGCNA is getting modules relevant to cure/fail.
I am reasonably certain he is in turn using this document as his input:
https://bioinformaticsworkbook.org/tutorials/wgcna.html#gsc.tab=0
<- WGCNA::enableWGCNAThreads(nThreads = 8) threads
## Allowing parallel execution with up to 8 working processes.
The input used is the cell-type specific, tmaco-only, rpkm, modified by sva.
In the following block I am breaking down what Alejandro did into smaller pieces so that I can make certain I understand what happened.
<- normalize_expt(t_monocytes, filter = TRUE, batch = "svaseq") %>%
input normalize_expt(convert = "rpkm", column = "mean_cds_len", na_to_zero = TRUE)
## Removing 9090 low-count genes (10862 remaining).
## Setting 295 low elements to zero.
<- as.data.frame(exprs(input))
wgcna_input "ENSEMBLID"]] <- rownames(wgcna_input)
wgcna_input[[
<- wgcna_input %>%
wgcna_melted gather(key = "samples", value = "counts", -ENSEMBLID)
<- wgcna_melted %>%
wgcna_with_meta inner_join(., pData(input), by = c("samples" = "tmrcidentifier"))
<- wgcna_with_meta %>%
wgcna_selected select("ENSEMBLID", "samples", "counts") %>%
spread(key = "samples", value = "counts")
Unless I am mistaken, the above just converted the matrix of counts into a merged/melted copy of same with the metadata, then removed the metadata and returned us back to the original state? hmmm…
<- WGCNA::goodSamplesGenes(t(exprs(input))) good_samples_genes
## Flagging genes and samples with too many missing values...
## ..step 1
summary(good_samples_genes)
## Length Class Mode
## goodGenes 10862 -none- logical
## goodSamples 42 -none- logical
## allOK 1 -none- logical
"allOK"]] good_samples_genes[[
## [1] TRUE
<- normalize_expt(input, transform = "log2") l2input
## transform_counts: Found 295 values equal to 0, adding 1 to the matrix.
<- c(c(1:10), seq(from = 12, to = 20, by = 2))
power_test <- WGCNA::pickSoftThreshold(
threshold_search t(exprs(l2input)), powerVector = power_test,
networkType = "signed", verbose = 5)
## pickSoftThreshold: will use block size 4118.
## pickSoftThreshold: calculating connectivity for given powers...
## ..working on genes 1 through 4118 of 10862
## ..working on genes 4119 through 8236 of 10862
## ..working on genes 8237 through 10862 of 10862
## Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
## 1 1 0.2150 7.550 0.906 5610.0 5630.00 6290
## 2 2 0.1310 2.470 0.912 3170.0 3130.00 4000
## 3 3 0.0158 -0.437 0.844 1910.0 1860.00 2840
## 4 4 0.2350 -1.190 0.834 1210.0 1160.00 2150
## 5 5 0.5270 -1.560 0.892 807.0 748.00 1710
## 6 6 0.6860 -1.680 0.924 556.0 495.00 1400
## 7 7 0.7720 -1.760 0.942 396.0 337.00 1170
## 8 8 0.8120 -1.770 0.945 289.0 233.00 996
## 9 9 0.8410 -1.780 0.949 216.0 164.00 858
## 10 10 0.8670 -1.770 0.961 165.0 118.00 748
## 11 12 0.8660 -1.790 0.948 101.0 62.70 583
## 12 14 0.8610 -1.800 0.942 65.5 35.00 467
## 13 16 0.8770 -1.770 0.951 44.3 20.40 381
## 14 18 0.8860 -1.750 0.959 31.0 12.30 316
## 15 20 0.8750 -1.750 0.952 22.4 7.61 265
<- ggplot(threshold_search[["fitIndices"]], aes(Power, SFT.R.sq, label = Power)) +
a1 geom_point() +
geom_text(nudge_y = 0.1) +
geom_hline(yintercept = 0.8, color = 'red') +
labs(x = 'Power', y = 'Scale free topology model fit, signed R^2')
a1
<- ggplot(threshold_search[["fitIndices"]], aes(Power, mean.k., label = Power)) +
a2 geom_point() +
geom_text(nudge_y = 0.1) +
labs(x = 'Power', y = 'Mean Connectivity')
a2
<- 8
chosen_power ## WGCNA calls cor() without specifying its own namespace, so overwrite cor for the moment.
<- WGCNA::cor
cor <- WGCNA::blockwiseModules(
initial_modules t(exprs(l2input)), maxBlockSize = 11000, TOMType = "signed",
power = chosen_power, mergeCutHeight = 0.25, numericLabels = FALSE,
verbose = 3)
## Calculating module eigengenes block-wise from all genes
## Flagging genes and samples with too many missing values...
## ..step 1
## ..Working on block 1 .
## TOM calculation: adjacency..
## ..will use 8 parallel threads.
## Fraction of slow calculations: 0.000000
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
## ....clustering..
## ....detecting modules..
## ....calculating module eigengenes..
## ....checking kME in modules..
## ..removing 17 genes from module 1 because their KME is too low.
## ..removing 4 genes from module 2 because their KME is too low.
## ..removing 6 genes from module 3 because their KME is too low.
## ..removing 1 genes from module 4 because their KME is too low.
## ..removing 1 genes from module 7 because their KME is too low.
## ..removing 1 genes from module 9 because their KME is too low.
## ..removing 1 genes from module 14 because their KME is too low.
## ..reassigning 20 genes from module 1 to modules with higher KME.
## ..reassigning 4 genes from module 2 to modules with higher KME.
## ..reassigning 14 genes from module 3 to modules with higher KME.
## ..reassigning 6 genes from module 4 to modules with higher KME.
## ..reassigning 2 genes from module 5 to modules with higher KME.
## ..reassigning 3 genes from module 6 to modules with higher KME.
## ..reassigning 1 genes from module 7 to modules with higher KME.
## ..reassigning 1 genes from module 9 to modules with higher KME.
## ..reassigning 5 genes from module 14 to modules with higher KME.
## ..reassigning 2 genes from module 17 to modules with higher KME.
## ..reassigning 1 genes from module 20 to modules with higher KME.
## ..reassigning 1 genes from module 37 to modules with higher KME.
## ..reassigning 1 genes from module 41 to modules with higher KME.
## ..merging modules that are too close..
## mergeCloseModules: Merging modules whose distance is less than 0.25
## Calculating new MEs...
<- stats::cor
cor
<- initial_modules[["MEs"]] initial_eigen
<- WGCNA::labels2colors(initial_modules[["colors"]])
network_colors ::plotDendroAndColors(
WGCNA"dendrograms"]][[1]],
initial_modules[["blockGenes"]][[1]]],
network_colors[initial_modules[["Modules",
dendroLabels = FALSE,
hang = 0.03,
addGuide = TRUE,
guideHang = 0.05)
::plotDendroAndColors(
WGCNA"dendrograms"]][[1]],
initial_modules[[cbind(initial_modules[["unmergedColors"]], initial_modules[["colors"]]),
c("unmerged", "merged"),
dendroLabels = FALSE,
addGuide = TRUE,
hang = 0.03,
guideHang = 0.05)
::plotDendroAndColors(
WGCNA"dendrograms"]][[1]],
initial_modules[["colors"]],
initial_modules[["ME",
dendroLabels = FALSE,
addGuide = TRUE,
hang= 0.03,
cex.colorLabels = 2,
marAll = c(1, 5, 3, 1),
main = ("WGCNA Cluster Dendrogram"),
guideHang = 0.05)
This does not appear to work, FIXME
<- WGCNA::consensusOrderMEs(
initial_reorder useAbs = FALSE,
initial_eigen, useSets = NULL, greyLast = TRUE,
greyName = paste(WGCNA::moduleColor.getMEprefix(), "grey", sep = ""),
method = "consensus")
<- data.frame(
meta_numeric "cf_numeric" = as.numeric(as.factor(pData(l2input)[["finaloutcome"]])),
"visit_numeric" = as.numeric(as.factor(pData(l2input)[["visitnumber"]])))
rownames(meta_numeric) <- rownames(pData(l2input))
<- pData(l2input)[, c("finaloutcome", "visitnumber")]
meta_factors <- merge(initial_eigen, meta_factors, by = "row.names")
meta_eigen rownames(meta_eigen) <- meta_eigen[["Row.names"]]
"Row.names"]] <- NULL
meta_eigen[[<- irr::kappam.fleiss(meta_eigen)
kappa
<- stats::cor(initial_eigen, meta_numeric, use = "p")
module_trait_corr module_trait_corr
## cf_numeric visit_numeric
## MEsalmon -0.164780 -0.0957133
## MEmagenta 0.050974 -0.2070258
## MEyellow 0.205038 -0.0381654
## MEgrey60 -0.149312 0.1647037
## MEred 0.343408 0.1883602
## MEturquoise 0.440232 0.0112585
## MEgreenyellow -0.258766 0.0778174
## MEblack 0.019155 0.1988695
## MEgreen -0.307949 0.2907309
## MElightyellow -0.352589 0.0926494
## MEtan 0.004369 -0.0252043
## MEblue -0.231199 0.1616756
## MElightcyan -0.236288 0.0004079
## MEbrown 0.026864 0.2638062
## MEcyan -0.229113 0.2260581
## MEmidnightblue 0.038247 0.0447213
## MElightgreen 0.343593 -0.0219696
## MEpink 0.519836 -0.0327724
## MEpurple 0.184913 -0.0301122
## MEgrey 0.184723 0.1002500
<- WGCNA::corPvalueStudent(module_trait_corr, nrow(meta_numeric))
module_trait_pvalues module_trait_pvalues
## cf_numeric visit_numeric
## MEsalmon 0.2970327 0.54653
## MEmagenta 0.7485238 0.18834
## MEyellow 0.1927163 0.81036
## MEgrey60 0.3452960 0.29726
## MEred 0.0259760 0.23224
## MEturquoise 0.0035281 0.94359
## MEgreenyellow 0.0979842 0.62425
## MEblack 0.9041618 0.20674
## MEgreen 0.0472540 0.06178
## MElightyellow 0.0220110 0.55951
## MEtan 0.9780913 0.87411
## MEblue 0.1407068 0.30635
## MElightcyan 0.1319310 0.99795
## MEbrown 0.8658948 0.09139
## MEcyan 0.1444260 0.15000
## MEmidnightblue 0.8099612 0.77854
## MElightgreen 0.0258906 0.89016
## MEpink 0.0004183 0.83676
## MEpurple 0.2410557 0.84986
## MEgrey 0.2415476 0.52759
On my computer at least, there seems to be difficulty installing the CorLvelPlot library, so I will just remove this piece of Alejandro’s code for now.
<- merge(initial_eigen, meta_numeric, by = "row.names")
module_trait_merged rownames(module_trait_merged) <- module_trait_merged[["Row.names"]]
"Row.names"]] <- NULL
module_trait_merged[[
#CorLevelPlot::CorLevelPlot(
# module_trait_merged,
# x = names(module_trait_merged)[1:18],
# rotLabX = 90,
# y = names(module_trait_merged)[19:20],
# posColKey = "top",
# col = c("blue1", "skyblue", "white", "pink", "red"))
It appears that the modules ‘turqoise’ and ‘pink’ are likely the most interesting. We can extract the genes from them:
<- initial_modules[["colors"]] == "turqoise" | initial_modules[["colors"]] == "pink"
wanted sum(wanted)
## [1] 187
<- names(initial_modules[["colors"]])[wanted]
interesting_genes
fData(l2input)[interesting_genes, "hgnc_symbol"]
## [1] "LAP3" "CASP10" "CD38" "CYB561" "QPCTL" "EXTL3"
## [7] "SAMD4A" "OSBPL5" "VRK2" "GABARAPL2" "PSMA4" "CUL1"
## [13] "EIF2AK2" "PARP12" "MTHFD2" "DIP2B" "DHRS9" "SP140"
## [19] "EPB41L2" "HSP90AA1" "DELE1" "EPB41L3" "SSH1" "LPCAT2"
## [25] "C3orf18" "FXYD5" "BRPF3" "JAK2" "SYNGR1" "SPTLC2"
## [31] "MTHFD1" "SMCHD1" "RBBP8" "TNFSF13B" "N4BP1" "GPI"
## [37] "ARRDC2" "ZC3HAV1" "TFEC" "C1GALT1" "DDX58" "KRT23"
## [43] "PMP22" "IL23A" "PARP11" "NCOA7" "MTRF1L" "TENT5A"
## [49] "XRN1" "MOB1A" "STEAP3" "IFIH1" "FANCL" "STAT1"
## [55] "ST3GAL5" "GADD45A" "ZNF684" "GBP3" "GBP1" "RCAN3"
## [61] "ACYP1" "CD274" "MASTL" "TRIM25" "TNFSF10" "NT5C3A"
## [67] "ACO1" "NMI" "DNPEP" "ZBP1" "HELB" "APOL3"
## [73] "APOL2" "CAMSAP1" "GCH1" "XAF1" "ALOX5AP" "EPSTI1"
## [79] "LRRCC1" "NHSL1" "SP110" "STX17" "DERL1" "CMTR1"
## [85] "TPMT" "FAM8A1" "DDX60" "IFI44L" "IFI44" "PNPT1"
## [91] "MYOF" "STAMBPL1" "DUSP5" "SSB" "PARP9" "SEMA7A"
## [97] "HERC5" "BRCA2" "WARS1" "HAPLN3" "NLRC5" "CMTM2"
## [103] "RAB40B" "ZCCHC2" "MISP3" "ADAMTS10" "RERE" "CELSR2"
## [109] "ATP1B1" "RGL1" "ASB6" "RNF144A" "ASAP2" "RABGAP1L"
## [115] "TMEM123" "RASGRP3" "ANKRD22" "SMARCA5" "PITPNC1" "GBP5"
## [121] "ADAMTS5" "PPP4R1" "SSBP3" "FMNL2" "B4GALT5" "CDA"
## [127] "CPAMD8" "SLC37A1" "PDXK" "ADAR" "GBP2" "GBP4"
## [133] "SGO2" "IFI16" "PPM1K" "HESX1" "DTX3L" "ARHGEF3"
## [139] "KIAA1958" "UBTD1" "HTRA1" "RIMKLB" "SLFN5" "LTBP3"
## [145] "DHRSX" "ITGAM" "STAT2" "TANC2" "ZNF318" "FRMD3"
## [151] "AFF1" "PARP14" "PC" "RNF213" "FGFBP3" "TP53I11"
## [157] "RMI2" "SAMD9L" "ZC3H12D" "P4HTM" "RAB39A" "CIITA"
## [163] "OLFML1" "ACTG1" "ADSS1" "ANKFY1" "CEACAM19" "PLSCR1"
## [169] "ZNF573" "TDRD7" "TCF4" "HSH2D" "PGAP1" "IL27"
## [175] "PDCD1LG2" "CD2AP" "ARMH1" "TMEM184B" "TBKBP1" "SAMD9"
## [181] "KCTD11" "SMTNL1" "APOL6" "TAPBP" "" "MARCKS"
## [187] "SCO2"
<- write_xlsx(fData(l2input)[interesting_genes, ],
written_interesting excel = glue("excel/wgcna_interesting_genes-v{ver}.xlsx"))
## Note that we can do similarity matrices on the samples too in order to get
## dendrograms which may get interesting groups of samples?
<- initial_modules[["colors"]] != "grey"
not_grey <- t(exprs(l2input))[, not_grey]
not_grey_exprs dim(not_grey_exprs)
## [1] 42 8219
<- colnames(not_grey_exprs)
not_grey_genes <- 1 - WGCNA::TOMsimilarityFromExpr(
dist_tom
not_grey_exprs,power = chosen_power)
## TOM calculation: adjacency..
## ..will use 8 parallel threads.
## Fraction of slow calculations: 0.000000
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
colnames(dist_tom) <- not_grey_genes
rownames(dist_tom) <- colnames(dist_tom)
<- flashClust::flashClust(as.dist(dist_tom), method = "average")
similarity_cluster
::plotDendroAndColors(
WGCNA
similarity_cluster,colors = initial_modules[["colors"]][not_grey_genes],
"ME",
dendroLabels = FALSE,
hang = 0.03,
addGuide = TRUE,
guideHang = 0.05,
cex.colorLabels = 1.5,
main = ("Cluster Dendrogram (WGCNA)"))
## In my initial pass, I did the clustering on the samples instead of genes and got
## two primary groups:
## 26,6,7,12,4,27,3,13,9,30,17,22,18,42,15,31,8,25,5,16,14,20,10,38
## 1,2,23,11,40,34,33,35,37,24,41,19,29,28,36,21,32,39
## I assume that these two groups will have some meaning vis a vis the monocyte samples?
<- c(26,6,7,12,4,27,3,13,9,30,17,22,18,42,15,31,8,25,5,16,14,20,10,38)
first_group <- c(1,2,23,11,40,34,33,35,37,24,41,19,29,28,36,21,32,39)
second_group unique(pData(l2input)[first_group, "tubelabelorigin"])
## [1] "su2168" "su2066" "su2071" "su2065" "su2172" "su2068" "su2173" "su2161"
## [9] "su2167" "su2162" "su2201" "su2073" "su2188"
unique(pData(l2input)[second_group, "tubelabelorigin"])
## [1] "su2052" "su2167" "su2068" "su2190" "su2183" "su2184" "su2168" "su2162"
## [9] "su2172" "su2173"
## So, they are two distinct groups of donors...
table(pData(l2input)[first_group, "finaloutcome"])
##
## cure failure
## 14 10
table(pData(l2input)[second_group, "finaloutcome"])
##
## cure failure
## 7 11
table(pData(l2input)[first_group, "visitnumber"])
##
## 3 2 1
## 5 8 11
table(pData(l2input)[second_group, "visitnumber"])
##
## 3 2 1
## 8 5 5
::pander(sessionInfo()) pander
R version 4.3.3 (2024-02-29)
Platform: x86_64-conda-linux-gnu (64-bit)
locale: C
attached base packages: stats4, stats, graphics, grDevices, utils, datasets, methods and base
other attached packages: ruv(v.0.9.7.1), lubridate(v.1.9.3), stringr(v.1.5.1), purrr(v.1.0.2), readr(v.2.1.5), tidyr(v.1.3.1), tibble(v.3.2.1), ggplot2(v.3.5.1), tidyverse(v.2.0.0), forcats(v.1.0.0), dplyr(v.1.1.4), hpgltools(v.1.0), Matrix(v.1.6-5), glue(v.1.7.0), SummarizedExperiment(v.1.32.0), GenomicRanges(v.1.54.1), GenomeInfoDb(v.1.38.8), IRanges(v.2.36.0), S4Vectors(v.0.40.2), MatrixGenerics(v.1.14.0), matrixStats(v.1.3.0), Biobase(v.2.62.0) and BiocGenerics(v.0.48.1)
loaded via a namespace (and not attached): splines(v.4.3.3), later(v.1.3.2), BiocIO(v.1.12.0), bitops(v.1.0-8), filelock(v.1.0.3), preprocessCore(v.1.64.0), graph(v.1.80.0), XML(v.3.99-0.17), rpart(v.4.1.23), lifecycle(v.1.0.4), fastcluster(v.1.2.6), edgeR(v.4.0.16), doParallel(v.1.0.17), lattice(v.0.22-6), flashClust(v.1.01-2), backports(v.1.5.0), magrittr(v.2.0.3), openxlsx(v.4.2.6.1), limma(v.3.58.1), Hmisc(v.5.1-3), plotly(v.4.10.4), sass(v.0.4.9), rmarkdown(v.2.27), jquerylib(v.0.1.4), yaml(v.2.3.10), httpuv(v.1.6.15), zip(v.2.3.1), cowplot(v.1.1.3), DBI(v.1.2.3), abind(v.1.4-5), zlibbioc(v.1.48.2), RCurl(v.1.98-1.16), yulab.utils(v.0.1.5), nnet(v.7.3-19), sva(v.3.50.0), GenomeInfoDbData(v.1.2.11), genefilter(v.1.84.0), annotate(v.1.80.0), codetools(v.0.2-20), DelayedArray(v.0.28.0), DOSE(v.3.28.2), tidyselect(v.1.2.1), farver(v.2.1.2), BiocFileCache(v.2.10.2), dynamicTreeCut(v.1.63-1), base64enc(v.0.1-3), GenomicAlignments(v.1.38.2), jsonlite(v.1.8.8), Formula(v.1.2-5), survival(v.3.7-0), iterators(v.1.0.14), foreach(v.1.5.2), tools(v.4.3.3), Rcpp(v.1.0.13), gridExtra(v.2.3), SparseArray(v.1.2.4), xfun(v.0.46), mgcv(v.1.9-1), qvalue(v.2.34.0), withr(v.3.0.1), fastmap(v.1.2.0), fansi(v.1.0.6), digest(v.0.6.36), timechange(v.0.3.0), R6(v.2.5.1), mime(v.0.12), colorspace(v.2.1-1), GO.db(v.3.18.0), lpSolve(v.5.6.20), RSQLite(v.2.3.7), utf8(v.1.2.4), generics(v.0.1.3), data.table(v.1.15.4), rtracklayer(v.1.62.0), httr(v.1.4.7), htmlwidgets(v.1.6.4), S4Arrays(v.1.2.1), pkgconfig(v.2.0.3), gtable(v.0.3.5), blob(v.1.2.4), impute(v.1.76.0), XVector(v.0.42.0), htmltools(v.0.5.8.1), fgsea(v.1.28.0), GSEABase(v.1.64.0), scales(v.1.3.0), png(v.0.1-8), knitr(v.1.48), rstudioapi(v.0.16.0), tzdb(v.0.4.0), reshape2(v.1.4.4), rjson(v.0.2.21), checkmate(v.2.3.2), nlme(v.3.1-165), curl(v.5.2.1), cachem(v.1.1.0), parallel(v.4.3.3), HDO.db(v.0.99.1), foreign(v.0.8-87), AnnotationDbi(v.1.64.1), restfulr(v.0.0.15), pillar(v.1.9.0), grid(v.4.3.3), vctrs(v.0.6.5), promises(v.1.3.0), dbplyr(v.2.5.0), xtable(v.1.8-4), cluster(v.2.1.6), htmlTable(v.2.4.3), evaluate(v.0.24.0), cli(v.3.6.3), locfit(v.1.5-9.10), compiler(v.4.3.3), Rsamtools(v.2.18.0), rlang(v.1.1.4), crayon(v.1.5.3), labeling(v.0.4.3), plyr(v.1.8.9), fs(v.1.6.4), pander(v.0.6.5), stringi(v.1.8.4), viridisLite(v.0.4.2), WGCNA(v.1.72-5), BiocParallel(v.1.36.0), munsell(v.0.5.1), Biostrings(v.2.70.3), lazyeval(v.0.2.2), GOSemSim(v.2.28.1), hms(v.1.1.3), bit64(v.4.0.5), varhandle(v.2.0.6), KEGGREST(v.1.42.0), statmod(v.1.5.0), shiny(v.1.9.1), highr(v.0.11), memoise(v.2.0.1), bslib(v.0.8.0), fastmatch(v.1.1-4), bit(v.4.0.5) and irr(v.0.84.1)
message("This is hpgltools commit: ", get_git_commit())
## If you wish to reproduce this exact build of hpgltools, invoke the following:
## > git clone http://github.com/abelew/hpgltools.git
## > git reset
## This is hpgltools commit:
message("Saving to ", savefile)
## Saving to 05wgcna.rda.xz
# tmp <- sm(saveme(filename = savefile))