vignettes/articles/Stammler_2016_16S_spikein.Rmd
Stammler_2016_16S_spikein.Rmd
library(MicrobiomeBenchmarkDataAnalyses)
library(MicrobiomeBenchmarkData)
library(dplyr)
library(tibble)
library(tidyr)
library(biobroom)
library(ggplot2)
library(purrr)
The objective of this vignette is to compare the coefficient of variation (CV) of the three spike-in bacteria in the Stammler_2016_16S_spikein dataset using relative abundance (TSS) and centered-log-ratio transformation (CLR). However, since CLR is log transformed and TSS is not, a geometric mean normalization (GMN) method will be used instead of CLR.
S. ruber will be used for re-calibrating the abundance data (counts). This is referred to as SCML. The data will also be normalized with TSS (TSS) or GMN (GMN). The CV of the abundance data of the three spike-in bacteria across samples using these two normalization methods will be calculated and compared.
tse <- getBenchmarkData('Stammler_2016_16S_spikein', dryrun = FALSE)[[1]]
spk_bac <- c(
`S. ruber` = 'AF323500XXXX',
`R. radiobacter` = 'AB247615XXXX',
`A. acidiphilus` = 'AB076660XXXX'
)
counts <- assay(tse, 'counts')
s_ruber <- counts[spk_bac['S. ruber'], ]
size_factor <- s_ruber/mean(s_ruber)
SCML_data <- counts
for(i in seq(ncol(SCML_data))){
SCML_data[,i] <- round(SCML_data[,i] / size_factor[i])
}
assay(tse, 'SCML') <- SCML_data
tss_fun <- function(x) (x + 1) / sum((x + 1))
# tss_fun <- function(x) log((x + 1) / sum((x + 1)))
# tss_fun <- function(x) (x) / sum((x))
gmn_fun <- function(x) (x + 1) / exp(mean(log((x + 1))))
# gnm_fun <- function(x) (x + 1) / prod((x + 1)^(1 / length(x)))
# gmn_fun <- function(x) log((x + 1) / exp(mean(log((x + 1)))))
assay(tse, "TSS") <- apply(assay(tse, 'counts'), 2, tss_fun)
assay(tse, "GMN") <- apply(assay(tse, 'counts'), 2, gmn_fun)
spk_bac_tse <- tse[spk_bac,]
rownames(spk_bac_tse) <- names(spk_bac)
spk_bac_tse
#> class: TreeSummarizedExperiment
#> dim: 3 17
#> metadata(0):
#> assays(4): counts SCML TSS GMN
#> rownames(3): S. ruber R. radiobacter A. acidiphilus
#> rowData names(1): taxonomy
#> colnames(17): MID26 MID27 ... MID42 MID43
#> colData names(12): dataset subject_id ... country description
#> reducedDimNames(0):
#> mainExpName: NULL
#> altExpNames(0):
#> rowLinks: NULL
#> rowTree: NULL
#> colLinks: NULL
#> colTree: NULL
data <- spk_bac_tse |>
assays() |>
names() |>
map({
~ tidy.RangedSummarizedExperiment(spk_bac_tse, assay = .x) |>
magrittr::set_colnames(c("taxon", "sample", .x))
}) |>
reduce(.f = \(.x, .y) left_join(.x, .y, by = c("taxon", "sample")))
DT::datatable(data, filter = 'top')
get_cv <- function(x) {
cv <- function(x, n) { sd(x[n]) / abs(mean(x[n])) }
boot::boot(x, cv, R = 1000) |>
broom::tidy() |>
dplyr::rename(cv = statistic)
}
cv_res <- data %>%
group_by(taxon) %>%
summarize(across(.cols = counts:last_col(), .fns = get_cv)) %>%
pivot_longer(
cols = 2:last_col(), names_to = 'norm', values_to = 'cv_res'
) %>%
unnest(cols = 'cv_res')
DT::datatable(cv_res, filter = 'top')
Table in wider format:
cv_res |>
rename(
Species = taxon, `Normalization method` = norm,
CV = cv, SE = std.error
) |>
filter(`Normalization method` %in% c("GMN", "TSS")) |>
select(-bias) |>
mutate(
CV = round(CV, 2), SE = round(SE, 2)
) |>
mutate(
`Normalization method` = ifelse(
test = `Normalization method` == "TSS",
yes = "Relative abundance",
no = `Normalization method`
)
) |>
DT::datatable(
extensions = 'Buttons',,
filter = "top",
options = list(
dom = 'Bfrtip',
buttons = list(
list(
extend = 'copy',
text = 'Copy '
)
)
)
)
cv_res |>
filter(norm != 'counts') |>
mutate(
norm = factor(norm, levels = c(
'counts', 'SCML', 'TSS', 'GMN'
)
)
) %>%
mutate(taxon = forcats::fct_relevel(taxon, 'S. ruber')) |>
ggplot(aes(reorder(norm, cv), cv)) +
geom_point(aes(color = norm), size = 2) +
geom_errorbar(
aes(ymin = cv - std.error, ymax = cv + std.error, color = norm),
width = 0.4, size = 0.5
) +
scale_color_brewer(type = 'qual', palette = 'Set2') +
facet_wrap(~taxon) +
labs(
y = 'Coefficient of variation across all samples',
x = 'Data transformation'
) +
theme_bw() +
theme(
# axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.x = element_blank(),
strip.text = element_text(face = 'italic'),
legend.position = 'none'
)
#> Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
#> ℹ Please use `linewidth` instead.
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
#> generated.
TSS normalizaion has lower CV values than GMN. This could indicate that it introduces less bias than GMN, which is related to CLR.
sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#> setting value
#> version R version 4.4.1 (2024-06-14)
#> os Ubuntu 22.04.4 LTS
#> system x86_64, linux-gnu
#> ui X11
#> language en
#> collate en_US.UTF-8
#> ctype en_US.UTF-8
#> tz Etc/UTC
#> date 2024-09-24
#> pandoc 3.2 @ /usr/bin/ (via rmarkdown)
#>
#> ─ Packages ───────────────────────────────────────────────────────────────────
#> package * version date (UTC) lib source
#> abind 1.4-8 2024-09-12 [1] RSPM (R 4.4.0)
#> ape 5.8 2024-04-11 [1] RSPM (R 4.4.0)
#> backports 1.5.0 2024-05-23 [1] RSPM (R 4.4.0)
#> Biobase * 2.64.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)
#> biobroom * 1.36.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)
#> BiocFileCache 2.12.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)
#> BiocGenerics * 0.50.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)
#> BiocParallel 1.38.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)
#> Biostrings * 2.72.1 2024-06-02 [1] Bioconductor 3.19 (R 4.4.1)
#> bit 4.5.0 2024-09-20 [1] RSPM (R 4.4.0)
#> bit64 4.5.2 2024-09-22 [1] RSPM (R 4.4.0)
#> blob 1.2.4 2023-03-17 [1] RSPM (R 4.4.0)
#> boot 1.3-30 2024-02-26 [2] CRAN (R 4.4.1)
#> broom * 1.0.6 2024-05-17 [1] RSPM (R 4.4.0)
#> bslib 0.8.0 2024-07-29 [1] RSPM (R 4.4.0)
#> cachem 1.1.0 2024-05-16 [1] RSPM (R 4.4.0)
#> cli 3.6.3 2024-06-21 [1] RSPM (R 4.4.0)
#> codetools 0.2-20 2024-03-31 [2] CRAN (R 4.4.1)
#> coin 1.4-3 2023-09-27 [1] RSPM (R 4.4.0)
#> colorspace 2.1-1 2024-07-26 [1] RSPM (R 4.4.0)
#> crayon 1.5.3 2024-06-20 [1] RSPM (R 4.4.0)
#> crosstalk 1.2.1 2023-11-23 [1] RSPM (R 4.4.0)
#> curl 5.2.3 2024-09-20 [1] RSPM (R 4.4.0)
#> DBI 1.2.3 2024-06-02 [1] RSPM (R 4.4.0)
#> dbplyr 2.5.0 2024-03-19 [1] RSPM (R 4.4.0)
#> DelayedArray 0.30.1 2024-05-07 [1] Bioconductor 3.19 (R 4.4.1)
#> desc 1.4.3 2023-12-10 [1] RSPM (R 4.4.0)
#> digest 0.6.37 2024-08-19 [1] RSPM (R 4.4.0)
#> dplyr * 1.1.4 2023-11-17 [1] RSPM (R 4.4.0)
#> DT 0.33 2024-04-04 [1] RSPM (R 4.4.0)
#> evaluate 1.0.0 2024-09-17 [1] RSPM (R 4.4.0)
#> fansi 1.0.6 2023-12-08 [1] RSPM (R 4.4.0)
#> farver 2.1.2 2024-05-13 [1] RSPM (R 4.4.0)
#> fastmap 1.2.0 2024-05-15 [1] RSPM (R 4.4.0)
#> filelock 1.0.3 2023-12-11 [1] RSPM (R 4.4.0)
#> forcats 1.0.0 2023-01-29 [1] RSPM (R 4.4.0)
#> fs 1.6.4 2024-04-25 [1] RSPM (R 4.4.0)
#> generics 0.1.3 2022-07-05 [1] RSPM (R 4.4.0)
#> GenomeInfoDb * 1.40.1 2024-05-24 [1] Bioconductor 3.19 (R 4.4.1)
#> GenomeInfoDbData 1.2.12 2024-06-25 [1] Bioconductor
#> GenomicRanges * 1.56.1 2024-06-12 [1] Bioconductor 3.19 (R 4.4.1)
#> ggplot2 * 3.5.1 2024-04-23 [1] RSPM (R 4.4.0)
#> glue 1.7.0 2024-01-09 [1] RSPM (R 4.4.0)
#> gtable 0.3.5 2024-04-22 [1] RSPM (R 4.4.0)
#> highr 0.11 2024-05-26 [1] RSPM (R 4.4.0)
#> htmltools 0.5.8.1 2024-04-04 [1] RSPM (R 4.4.0)
#> htmlwidgets 1.6.4 2023-12-06 [1] RSPM (R 4.4.0)
#> httr 1.4.7 2023-08-15 [1] RSPM (R 4.4.0)
#> IRanges * 2.38.1 2024-07-03 [1] Bioconductor 3.19 (R 4.4.1)
#> jquerylib 0.1.4 2021-04-26 [1] RSPM (R 4.4.0)
#> jsonlite 1.8.9 2024-09-20 [1] RSPM (R 4.4.0)
#> knitr 1.48 2024-07-07 [1] RSPM (R 4.4.0)
#> labeling 0.4.3 2023-08-29 [1] RSPM (R 4.4.0)
#> lattice 0.22-6 2024-03-20 [2] CRAN (R 4.4.1)
#> lazyeval 0.2.2 2019-03-15 [1] RSPM (R 4.4.0)
#> libcoin 1.0-10 2023-09-27 [1] RSPM (R 4.4.0)
#> lifecycle 1.0.4 2023-11-07 [1] RSPM (R 4.4.0)
#> magrittr 2.0.3 2022-03-30 [1] RSPM (R 4.4.0)
#> MASS 7.3-61 2024-06-13 [2] RSPM (R 4.4.0)
#> Matrix 1.7-0 2024-04-26 [2] CRAN (R 4.4.1)
#> MatrixGenerics * 1.16.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)
#> matrixStats * 1.4.1 2024-09-08 [1] RSPM (R 4.4.0)
#> memoise 2.0.1 2021-11-26 [1] RSPM (R 4.4.0)
#> MicrobiomeBenchmarkData * 1.6.0 2024-05-02 [1] Bioconductor 3.19 (R 4.4.1)
#> MicrobiomeBenchmarkDataAnalyses * 0.99.11 2024-09-24 [1] local
#> modeltools 0.2-23 2020-03-05 [1] RSPM (R 4.4.0)
#> multcomp 1.4-26 2024-07-18 [1] RSPM (R 4.4.0)
#> munsell 0.5.1 2024-04-01 [1] RSPM (R 4.4.0)
#> mvtnorm 1.3-1 2024-09-03 [1] RSPM (R 4.4.0)
#> nlme 3.1-165 2024-06-06 [2] RSPM (R 4.4.0)
#> pillar 1.9.0 2023-03-22 [1] RSPM (R 4.4.0)
#> pkgconfig 2.0.3 2019-09-22 [1] RSPM (R 4.4.0)
#> pkgdown 2.1.1 2024-09-17 [1] RSPM (R 4.4.0)
#> purrr * 1.0.2 2023-08-10 [1] RSPM (R 4.4.0)
#> R6 2.5.1 2021-08-19 [1] RSPM (R 4.4.0)
#> ragg 1.3.3 2024-09-11 [1] RSPM (R 4.4.0)
#> RColorBrewer 1.1-3 2022-04-03 [1] RSPM (R 4.4.0)
#> Rcpp 1.0.13 2024-07-17 [1] RSPM (R 4.4.0)
#> rlang 1.1.4 2024-06-04 [1] RSPM (R 4.4.0)
#> rmarkdown 2.28 2024-08-17 [1] RSPM (R 4.4.0)
#> RSQLite 2.3.7 2024-05-27 [1] RSPM (R 4.4.0)
#> S4Arrays 1.4.1 2024-05-20 [1] Bioconductor 3.19 (R 4.4.1)
#> S4Vectors * 0.42.1 2024-07-03 [1] Bioconductor 3.19 (R 4.4.1)
#> sandwich 3.1-1 2024-09-15 [1] RSPM (R 4.4.0)
#> sass 0.4.9 2024-03-15 [1] RSPM (R 4.4.0)
#> scales 1.3.0 2023-11-28 [1] RSPM (R 4.4.0)
#> sessioninfo 1.2.2 2021-12-06 [1] RSPM (R 4.4.0)
#> SingleCellExperiment * 1.26.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)
#> SparseArray 1.4.8 2024-05-24 [1] Bioconductor 3.19 (R 4.4.1)
#> SummarizedExperiment * 1.34.0 2024-05-01 [1] Bioconductor 3.19 (R 4.4.1)
#> survival 3.7-0 2024-06-05 [2] RSPM (R 4.4.0)
#> systemfonts 1.1.0 2024-05-15 [1] RSPM (R 4.4.0)
#> textshaping 0.4.0 2024-05-24 [1] RSPM (R 4.4.0)
#> TH.data 1.1-2 2023-04-17 [1] RSPM (R 4.4.0)
#> tibble * 3.2.1 2023-03-20 [1] RSPM (R 4.4.0)
#> tidyr * 1.3.1 2024-01-24 [1] RSPM (R 4.4.0)
#> tidyselect 1.2.1 2024-03-11 [1] RSPM (R 4.4.0)
#> tidytree 0.4.6 2023-12-12 [1] RSPM (R 4.4.0)
#> treeio 1.28.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)
#> TreeSummarizedExperiment * 2.12.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)
#> UCSC.utils 1.0.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)
#> utf8 1.2.4 2023-10-22 [1] RSPM (R 4.4.0)
#> vctrs 0.6.5 2023-12-01 [1] RSPM (R 4.4.0)
#> withr 3.0.1 2024-07-31 [1] RSPM (R 4.4.0)
#> xfun 0.47 2024-08-17 [1] RSPM (R 4.4.0)
#> XVector * 0.44.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)
#> yaml 2.3.10 2024-07-26 [1] RSPM (R 4.4.0)
#> yulab.utils 0.1.7 2024-08-26 [1] RSPM (R 4.4.0)
#> zlibbioc 1.50.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)
#> zoo 1.8-12 2023-04-13 [1] RSPM (R 4.4.0)
#>
#> [1] /usr/local/lib/R/site-library
#> [2] /usr/local/lib/R/library
#>
#> ──────────────────────────────────────────────────────────────────────────────