Sparse Estimation of Correlations among Microbiomes (SECOM) (Lin, Eggesbø, and Peddada 2022) is a methodology that aims to detect both linear and nonlinear relationships between a pair of taxa within an ecosystem (e.g., gut) or across ecosystems (e.g., gut and tongue). SECOM corrects both sample-specific and taxon-specific biases and obtains a consistent estimator for the correlation matrix of microbial absolute abundances while maintaining the underlying true sparsity. For more details, please refer to the SECOM paper.
Download package.
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("ANCOMBC")
Load the package.
The HITChip Atlas dataset contains genus-level microbiota profiling with HITChip for 1006 western adults with no reported health complications, reported in (Lahti et al. 2014). The dataset is also available via the microbiome R package (Lahti et al. 2017) in phyloseq (McMurdie and Holmes 2013) format.
data(atlas1006)
# Subset to baseline
tse = atlas1006[, atlas1006$time == 0]
# Re-code the bmi group
tse$bmi = recode(tse$bmi_group,
obese = "obese",
severeobese = "obese",
morbidobese = "obese")
# Subset to lean, overweight, and obese subjects
tse = tse[, tse$bmi %in% c("lean", "overweight", "obese")]
# Create the region variable
tse$region = recode(as.character(tse$nationality),
Scandinavia = "NE", UKIE = "NE", SouthEurope = "SE",
CentralEurope = "CE", EasternEurope = "EE",
.missing = "unknown")
# Discard "EE" as it contains only 1 subject
# Discard subjects with missing values of region
tse = tse[, ! tse$region %in% c("EE", "unknown")]
print(tse)
class: TreeSummarizedExperiment
dim: 130 873
metadata(0):
assays(1): counts
rownames(130): Actinomycetaceae Aerococcus ... Xanthomonadaceae
Yersinia et rel.
rowData names(3): Phylum Family Genus
colnames(873): Sample-1 Sample-2 ... Sample-1005 Sample-1006
colData names(12): age sex ... bmi region
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):
rowLinks: NULL
rowTree: NULL
colLinks: NULL
colTree: NULL
set.seed(123)
# Linear relationships
res_linear = secom_linear(data = list(tse), assay_name = "counts",
tax_level = "Phylum", pseudo = 0,
prv_cut = 0.5, lib_cut = 1000, corr_cut = 0.5,
wins_quant = c(0.05, 0.95), method = "pearson",
soft = FALSE, thresh_len = 20, n_cv = 10,
thresh_hard = 0.3, max_p = 0.005, n_cl = 2)
# Nonlinear relationships
res_dist = secom_dist(data = list(tse), assay_name = "counts",
tax_level = "Phylum", pseudo = 0,
prv_cut = 0.5, lib_cut = 1000, corr_cut = 0.5,
wins_quant = c(0.05, 0.95), R = 1000,
thresh_hard = 0.3, max_p = 0.005, n_cl = 2)
corr_linear = res_linear$corr_th
cooccur_linear = res_linear$mat_cooccur
# Filter by co-occurrence
overlap = 10
corr_linear[cooccur_linear < overlap] = 0
df_linear = data.frame(get_upper_tri(corr_linear)) %>%
rownames_to_column("var1") %>%
pivot_longer(cols = -var1, names_to = "var2", values_to = "value") %>%
filter(!is.na(value)) %>%
mutate(value = round(value, 2))
tax_name = sort(union(df_linear$var1, df_linear$var2))
df_linear$var1 = factor(df_linear$var1, levels = tax_name)
df_linear$var2 = factor(df_linear$var2, levels = tax_name)
heat_linear_th = df_linear %>%
ggplot(aes(var2, var1, fill = value)) +
geom_tile(color = "black") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white", na.value = "grey",
midpoint = 0, limit = c(-1,1), space = "Lab",
name = NULL) +
scale_x_discrete(drop = FALSE) +
scale_y_discrete(drop = FALSE) +
geom_text(aes(var2, var1, label = value), color = "black", size = 4) +
labs(x = NULL, y = NULL, title = "Pearson (Thresholding)") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1,
face = "italic"),
axis.text.y = element_text(size = 12, face = "italic"),
strip.text.x = element_text(size = 14),
strip.text.y = element_text(size = 14),
legend.text = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 15),
panel.grid.major = element_blank(),
axis.ticks = element_blank(),
legend.position = "none") +
coord_fixed()
heat_linear_th
corr_linear = res_linear$corr_fl
cooccur_linear = res_linear$mat_cooccur
# Filter by co-occurrence
overlap = 10
corr_linear[cooccur_linear < overlap] = 0
df_linear = data.frame(get_upper_tri(corr_linear)) %>%
rownames_to_column("var1") %>%
pivot_longer(cols = -var1, names_to = "var2", values_to = "value") %>%
filter(!is.na(value)) %>%
mutate(value = round(value, 2))
tax_name = sort(union(df_linear$var1, df_linear$var2))
df_linear$var1 = factor(df_linear$var1, levels = tax_name)
df_linear$var2 = factor(df_linear$var2, levels = tax_name)
heat_linear_fl = df_linear %>%
ggplot(aes(var2, var1, fill = value)) +
geom_tile(color = "black") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white", na.value = "grey",
midpoint = 0, limit = c(-1,1), space = "Lab",
name = NULL) +
scale_x_discrete(drop = FALSE) +
scale_y_discrete(drop = FALSE) +
geom_text(aes(var2, var1, label = value), color = "black", size = 4) +
labs(x = NULL, y = NULL, title = "Pearson (Filtering)") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1,
face = "italic"),
axis.text.y = element_text(size = 12, face = "italic"),
strip.text.x = element_text(size = 14),
strip.text.y = element_text(size = 14),
legend.text = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 15),
panel.grid.major = element_blank(),
axis.ticks = element_blank(),
legend.position = "none") +
coord_fixed()
heat_linear_fl
corr_dist = res_dist$dcorr_fl
cooccur_dist = res_dist$mat_cooccur
# Filter by co-occurrence
overlap = 10
corr_dist[cooccur_dist < overlap] = 0
df_dist = data.frame(get_upper_tri(corr_dist)) %>%
rownames_to_column("var1") %>%
pivot_longer(cols = -var1, names_to = "var2", values_to = "value") %>%
filter(!is.na(value)) %>%
mutate(value = round(value, 2))
tax_name = sort(union(df_dist$var1, df_dist$var2))
df_dist$var1 = factor(df_dist$var1, levels = tax_name)
df_dist$var2 = factor(df_dist$var2, levels = tax_name)
heat_dist_fl = df_dist %>%
ggplot(aes(var2, var1, fill = value)) +
geom_tile(color = "black") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white", na.value = "grey",
midpoint = 0, limit = c(-1,1), space = "Lab",
name = NULL) +
scale_x_discrete(drop = FALSE) +
scale_y_discrete(drop = FALSE) +
geom_text(aes(var2, var1, label = value), color = "black", size = 4) +
labs(x = NULL, y = NULL, title = "Distance (Filtering)") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1,
face = "italic"),
axis.text.y = element_text(size = 12, face = "italic"),
strip.text.x = element_text(size = 14),
strip.text.y = element_text(size = 14),
legend.text = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 15),
panel.grid.major = element_blank(),
axis.ticks = element_blank(),
legend.position = "none") +
coord_fixed()
heat_dist_fl
To compute correlations whithin and across different ecosystems, one needs to make sure that there are samples in common across these ecosystems.
# Select subjects from "CE" and "NE"
tse1 = tse[, tse$region == "CE"]
tse2 = tse[, tse$region == "NE"]
# Rename samples to ensure there is an overlap of samples between CE and NE
colnames(tse1) = paste0("Sample-", seq_len(ncol(tse1)))
colnames(tse2) = paste0("Sample-", seq_len(ncol(tse2)))
print(tse1)
class: TreeSummarizedExperiment
dim: 130 578
metadata(0):
assays(1): counts
rownames(130): Actinomycetaceae Aerococcus ... Xanthomonadaceae
Yersinia et rel.
rowData names(3): Phylum Family Genus
colnames(578): Sample-1 Sample-2 ... Sample-577 Sample-578
colData names(12): age sex ... bmi region
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):
rowLinks: NULL
rowTree: NULL
colLinks: NULL
colTree: NULL
class: TreeSummarizedExperiment
dim: 130 181
metadata(0):
assays(1): counts
rownames(130): Actinomycetaceae Aerococcus ... Xanthomonadaceae
Yersinia et rel.
rowData names(3): Phylum Family Genus
colnames(181): Sample-1 Sample-2 ... Sample-180 Sample-181
colData names(12): age sex ... bmi region
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):
rowLinks: NULL
rowTree: NULL
colLinks: NULL
colTree: NULL
set.seed(123)
# Linear relationships
res_linear = secom_linear(data = list(CE = tse1, NE = tse2),
assay_name = c("counts", "counts"),
tax_level = c("Phylum", "Phylum"), pseudo = 0,
prv_cut = 0.5, lib_cut = 1000, corr_cut = 0.5,
wins_quant = c(0.05, 0.95), method = "pearson",
soft = FALSE, thresh_len = 20, n_cv = 10,
thresh_hard = 0.3, max_p = 0.005, n_cl = 2)
# Nonlinear relationships
res_dist = secom_dist(data = list(CE = tse1, NE = tse2),
assay_name = c("counts", "counts"),
tax_level = c("Phylum", "Phylum"), pseudo = 0,
prv_cut = 0.5, lib_cut = 1000, corr_cut = 0.5,
wins_quant = c(0.05, 0.95), R = 1000,
thresh_hard = 0.3, max_p = 0.005, n_cl = 2)
corr_linear = res_linear$corr_th
cooccur_linear = res_linear$mat_cooccur
# Filter by co-occurrence
overlap = 10
corr_linear[cooccur_linear < overlap] = 0
df_linear = data.frame(get_upper_tri(corr_linear)) %>%
rownames_to_column("var1") %>%
pivot_longer(cols = -var1, names_to = "var2", values_to = "value") %>%
filter(!is.na(value)) %>%
mutate(var2 = gsub("\\...", " - ", var2),
value = round(value, 2))
tax_name = sort(union(df_linear$var1, df_linear$var2))
df_linear$var1 = factor(df_linear$var1, levels = tax_name)
df_linear$var2 = factor(df_linear$var2, levels = tax_name)
txt_color = ifelse(grepl("CE", tax_name), "#1B9E77", "#D95F02")
heat_linear_th = df_linear %>%
ggplot(aes(var2, var1, fill = value)) +
geom_tile(color = "black") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
na.value = "grey", midpoint = 0, limit = c(-1,1),
space = "Lab", name = NULL) +
scale_x_discrete(drop = FALSE) +
scale_y_discrete(drop = FALSE) +
geom_text(aes(var2, var1, label = value), color = "black", size = 4) +
labs(x = NULL, y = NULL, title = "Pearson (Thresholding)") +
theme_bw() +
geom_vline(xintercept = 6.5, color = "blue", linetype = "dashed") +
geom_hline(yintercept = 6.5, color = "blue", linetype = "dashed") +
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1,
face = "italic", color = txt_color),
axis.text.y = element_text(size = 12, face = "italic",
color = txt_color),
strip.text.x = element_text(size = 14),
strip.text.y = element_text(size = 14),
legend.text = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 15),
panel.grid.major = element_blank(),
axis.ticks = element_blank(),
legend.position = "none") +
coord_fixed()
heat_linear_th
corr_linear = res_linear$corr_th
cooccur_linear = res_linear$mat_cooccur
# Filter by co-occurrence
overlap = 10
corr_linear[cooccur_linear < overlap] = 0
df_linear = data.frame(get_upper_tri(corr_linear)) %>%
rownames_to_column("var1") %>%
pivot_longer(cols = -var1, names_to = "var2", values_to = "value") %>%
filter(!is.na(value)) %>%
mutate(var2 = gsub("\\...", " - ", var2),
value = round(value, 2))
tax_name = sort(union(df_linear$var1, df_linear$var2))
df_linear$var1 = factor(df_linear$var1, levels = tax_name)
df_linear$var2 = factor(df_linear$var2, levels = tax_name)
txt_color = ifelse(grepl("CE", tax_name), "#1B9E77", "#D95F02")
heat_linear_fl = df_linear %>%
ggplot(aes(var2, var1, fill = value)) +
geom_tile(color = "black") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
na.value = "grey", midpoint = 0, limit = c(-1,1),
space = "Lab", name = NULL) +
scale_x_discrete(drop = FALSE) +
scale_y_discrete(drop = FALSE) +
geom_text(aes(var2, var1, label = value), color = "black", size = 4) +
labs(x = NULL, y = NULL, title = "Pearson (Filtering)") +
theme_bw() +
geom_vline(xintercept = 6.5, color = "blue", linetype = "dashed") +
geom_hline(yintercept = 6.5, color = "blue", linetype = "dashed") +
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1,
face = "italic", color = txt_color),
axis.text.y = element_text(size = 12, face = "italic",
color = txt_color),
strip.text.x = element_text(size = 14),
strip.text.y = element_text(size = 14),
legend.text = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 15),
panel.grid.major = element_blank(),
axis.ticks = element_blank(),
legend.position = "none") +
coord_fixed()
heat_linear_fl
corr_dist = res_dist$dcorr_fl
cooccur_dist = res_dist$mat_cooccur
# Filter by co-occurrence
overlap = 10
corr_dist[cooccur_dist < overlap] = 0
df_dist = data.frame(get_upper_tri(corr_dist)) %>%
rownames_to_column("var1") %>%
pivot_longer(cols = -var1, names_to = "var2", values_to = "value") %>%
filter(!is.na(value)) %>%
mutate(var2 = gsub("\\...", " - ", var2),
value = round(value, 2))
tax_name = sort(union(df_dist$var1, df_dist$var2))
df_dist$var1 = factor(df_dist$var1, levels = tax_name)
df_dist$var2 = factor(df_dist$var2, levels = tax_name)
txt_color = ifelse(grepl("CE", tax_name), "#1B9E77", "#D95F02")
heat_dist_fl = df_dist %>%
ggplot(aes(var2, var1, fill = value)) +
geom_tile(color = "black") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
na.value = "grey", midpoint = 0, limit = c(-1,1),
space = "Lab", name = NULL) +
scale_x_discrete(drop = FALSE) +
scale_y_discrete(drop = FALSE) +
geom_text(aes(var2, var1, label = value), color = "black", size = 4) +
labs(x = NULL, y = NULL, title = "Distance (Filtering)") +
theme_bw() +
geom_vline(xintercept = 6.5, color = "blue", linetype = "dashed") +
geom_hline(yintercept = 6.5, color = "blue", linetype = "dashed") +
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1,
face = "italic", color = txt_color),
axis.text.y = element_text(size = 12, face = "italic",
color = txt_color),
strip.text.x = element_text(size = 14),
strip.text.y = element_text(size = 14),
legend.text = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 15),
panel.grid.major = element_blank(),
axis.ticks = element_blank(),
legend.position = "none") +
coord_fixed()
heat_dist_fl
R version 4.2.1 (2022-06-23)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 20.04.5 LTS
Matrix products: default
BLAS: /home/biocbuild/bbs-3.16-bioc/R/lib/libRblas.so
LAPACK: /home/biocbuild/bbs-3.16-bioc/R/lib/libRlapack.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_GB LC_COLLATE=C
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] caret_6.0-93 lattice_0.20-45 DT_0.26 forcats_0.5.2
[5] stringr_1.4.1 dplyr_1.0.10 purrr_0.3.5 readr_2.1.3
[9] tidyr_1.2.1 tibble_3.1.8 ggplot2_3.4.0 tidyverse_1.3.2
[13] ANCOMBC_2.0.1
loaded via a namespace (and not attached):
[1] utf8_1.2.2 tidyselect_1.2.0
[3] lme4_1.1-31 RSQLite_2.2.18
[5] htmlwidgets_1.5.4 grid_4.2.1
[7] BiocParallel_1.32.0 gmp_0.6-7
[9] pROC_1.18.0 munsell_0.5.0
[11] ScaledMatrix_1.6.0 codetools_0.2-18
[13] interp_1.1-3 future_1.28.0
[15] withr_2.5.0 colorspace_2.0-3
[17] Biobase_2.58.0 energy_1.7-10
[19] highr_0.9 knitr_1.40
[21] rstudioapi_0.14 stats4_4.2.1
[23] SingleCellExperiment_1.20.0 DescTools_0.99.47
[25] listenv_0.8.0 labeling_0.4.2
[27] MatrixGenerics_1.10.0 Rdpack_2.4
[29] emmeans_1.8.2 GenomeInfoDbData_1.2.9
[31] farver_2.1.1 bit64_4.0.5
[33] parallelly_1.32.1 coda_0.19-4
[35] vctrs_0.5.0 treeio_1.22.0
[37] generics_0.1.3 TH.data_1.1-1
[39] ipred_0.9-13 xfun_0.34
[41] R6_2.5.1 doParallel_1.0.17
[43] GenomeInfoDb_1.34.1 ggbeeswarm_0.6.0
[45] rsvd_1.0.5 bitops_1.0-7
[47] cachem_1.0.6 DelayedArray_0.24.0
[49] assertthat_0.2.1 scales_1.2.1
[51] multcomp_1.4-20 nnet_7.3-18
[53] googlesheets4_1.0.1 beeswarm_0.4.0
[55] rootSolve_1.8.2.3 gtable_0.3.1
[57] beachmat_2.14.0 globals_0.16.1
[59] lmom_2.9 sandwich_3.0-2
[61] timeDate_4021.106 rlang_1.0.6
[63] splines_4.2.1 lazyeval_0.2.2
[65] ModelMetrics_1.2.2.2 gargle_1.2.1
[67] broom_1.0.1 checkmate_2.1.0
[69] modelr_0.1.9 yaml_2.3.6
[71] reshape2_1.4.4 crosstalk_1.2.0
[73] backports_1.4.1 Hmisc_4.7-1
[75] lava_1.7.0 tools_4.2.1
[77] ellipsis_0.3.2 decontam_1.18.0
[79] jquerylib_0.1.4 RColorBrewer_1.1-3
[81] proxy_0.4-27 BiocGenerics_0.44.0
[83] MultiAssayExperiment_1.24.0 Rcpp_1.0.9
[85] plyr_1.8.7 base64enc_0.1-3
[87] sparseMatrixStats_1.10.0 zlibbioc_1.44.0
[89] RCurl_1.98-1.9 rpart_4.1.19
[91] TreeSummarizedExperiment_2.6.0 deldir_1.0-6
[93] viridis_0.6.2 S4Vectors_0.36.0
[95] zoo_1.8-11 SummarizedExperiment_1.28.0
[97] haven_2.5.1 ggrepel_0.9.1
[99] cluster_2.1.4 fs_1.5.2
[101] DECIPHER_2.26.0 magrittr_2.0.3
[103] data.table_1.14.4 lmerTest_3.1-3
[105] reprex_2.0.2 googledrive_2.0.0
[107] mvtnorm_1.1-3 matrixStats_0.62.0
[109] gsl_2.1-7.1 hms_1.1.2
[111] evaluate_0.17 xtable_1.8-4
[113] jpeg_0.1-9 readxl_1.4.1
[115] IRanges_2.32.0 gridExtra_2.3
[117] compiler_4.2.1 scater_1.26.0
[119] crayon_1.5.2 minqa_1.2.5
[121] htmltools_0.5.3 tzdb_0.3.0
[123] mgcv_1.8-41 Formula_1.2-4
[125] expm_0.999-6 Exact_3.2
[127] lubridate_1.8.0 DBI_1.1.3
[129] dbplyr_2.2.1 MASS_7.3-58.1
[131] boot_1.3-28 Matrix_1.5-1
[133] permute_0.9-7 cli_3.4.1
[135] rbibutils_2.2.9 gower_1.0.0
[137] parallel_4.2.1 GenomicRanges_1.50.0
[139] pkgconfig_2.0.3 numDeriv_2016.8-1.1
[141] foreign_0.8-83 recipes_1.0.2
[143] scuttle_1.8.0 xml2_1.3.3
[145] foreach_1.5.2 hardhat_1.2.0
[147] vipor_0.4.5 bslib_0.4.1
[149] DirichletMultinomial_1.40.0 rngtools_1.5.2
[151] XVector_0.38.0 prodlim_2019.11.13
[153] estimability_1.4.1 mia_1.6.0
[155] rvest_1.0.3 CVXR_1.0-11
[157] doRNG_1.8.2 yulab.utils_0.0.5
[159] digest_0.6.30 vegan_2.6-4
[161] Biostrings_2.66.0 rmarkdown_2.17
[163] cellranger_1.1.0 tidytree_0.4.1
[165] htmlTable_2.4.1 gld_2.6.6
[167] DelayedMatrixStats_1.20.0 nloptr_2.0.3
[169] lifecycle_1.0.3 nlme_3.1-160
[171] jsonlite_1.8.3 BiocNeighbors_1.16.0
[173] viridisLite_0.4.1 fansi_1.0.3
[175] pillar_1.8.1 fastmap_1.1.0
[177] httr_1.4.4 survival_3.4-0
[179] glue_1.6.2 png_0.1-7
[181] iterators_1.0.14 bit_4.0.4
[183] class_7.3-20 stringi_1.7.8
[185] sass_0.4.2 blob_1.2.3
[187] BiocSingular_1.14.0 latticeExtra_0.6-30
[189] memoise_2.0.1 Rmpfr_0.8-9
[191] future.apply_1.9.1 irlba_2.3.5.1
[193] e1071_1.7-12 ape_5.6-2
Lahti, Leo, Jarkko Salojärvi, Anne Salonen, Marten Scheffer, and Willem M De Vos. 2014. “Tipping Elements in the Human Intestinal Ecosystem.” Nature Communications 5 (1): 1–10.
Lahti, Leo, Sudarshan Shetty, T Blake, J Salojarvi, and others. 2017. “Tools for Microbiome Analysis in R.” Version 1: 10013.
Lin, Huang, Merete Eggesbø, and Shyamal Das Peddada. 2022. “Linear and Nonlinear Correlation Estimators Unveil Undescribed Taxa Interactions in Microbiome Data.” Nature Communications 13 (1): 1–16.
McMurdie, Paul J, and Susan Holmes. 2013. “Phyloseq: An R Package for Reproducible Interactive Analysis and Graphics of Microbiome Census Data.” PloS One 8 (4): e61217.