fieldworkanalysis_samara.Rmd
install.packages(c("devtools", "tidyverse", "kableExtra"))
devtools::install_github("waldronlab/bugSigSimple")
devtools::install_github("waldronlab/BugSigDBStats")
devtools::install_github("waldronlab/bugsigdbr")
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.5 ✔ purrr 0.3.4
## ✔ tibble 3.1.6 ✔ dplyr 1.0.8
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
dat <- bugsigdbr::importBugSigDB(cache= TRUE)
## Using cached version from 2022-03-31 12:38:44
dim(dat)
## [1] 2098 48
names(dat)
## [1] "Study" "Study design"
## [3] "PMID" "DOI"
## [5] "URL" "Authors"
## [7] "Title" "Journal"
## [9] "Year" "Experiment"
## [11] "Location of subjects" "Host species"
## [13] "Body site" "UBERON ID"
## [15] "Condition" "EFO ID"
## [17] "Group 0 name" "Group 1 name"
## [19] "Group 1 definition" "Group 0 sample size"
## [21] "Group 1 sample size" "Antibiotics exclusion"
## [23] "Sequencing type" "16S variable region"
## [25] "Sequencing platform" "Statistical test"
## [27] "Significance threshold" "MHT correction"
## [29] "LDA Score above" "Matched on"
## [31] "Confounders controlled for" "Pielou"
## [33] "Shannon" "Chao1"
## [35] "Simpson" "Inverse Simpson"
## [37] "Richness" "Signature page name"
## [39] "Source" "Curated date"
## [41] "Curator" "Revision editor"
## [43] "Description" "Abundance in Group 1"
## [45] "MetaPhlAn taxon names" "NCBI Taxonomy IDs"
## [47] "State" "Reviewer"
Subsetting only studies done on humans - 6 of 17 studies were excluded because they were either done on mice or not statistically reliable (Cregger et. al)
subset.dat <-
dat[which(
dat$PMID == "30778155" |
dat$PMID == "32192080" |
dat$PMID == "31087436" |
dat$PMID == "26901400" |
dat$PMID == "33839907" |
dat$PMID == "32046455" |
dat$PMID == "33925708" |
dat$PMID == "32299442" |
dat$PMID == "33313185" |
dat$PMID == "34268384" | dat$PMID == "33660232"
), ]
Summary of studies and most frequent taxa increased and decreased in endometriosis patients for all studies
bugSigSimple::createStudyTable(subset.dat)
## # A tibble: 11 × 5
## Study Condition Cases Controls `Study Design`
## <chr> <chr> <dbl> <dbl> <chr>
## 1 Akiyama 2019 endometriosis 30 39 cross-sectional observational, n…
## 2 Ata 2019 endometriosis 14 14 prospective cohort
## 3 Chao 2021 endometriosis 37 66 case-control
## 4 Chen 2020 endometriosis 14 67 cross-sectional observational, n…
## 5 Hernandes 2020 endometriosis 10 11 case-control
## 6 Khan 2016 endometriosis 32 32 case-control
## 7 Lee 2021 endometriosis 45 45 case-control
## 8 Perrotta 2020 endometriosis 14 21 cross-sectional observational, n…
## 9 Shan 2021 endometriosis 12 12 case-control
## 10 Svensson 2021 endometriosis 66 198 case-control
## 11 Wei 2020 endometriosis 36 14 cross-sectional observational, n…
getMostFrequentTaxa(subset.dat,n=30)
##
## k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Atopobiaceae|g__Atopobium
## 7
## k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Streptococcaceae|g__Streptococcus
## 5
## k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacterales|f__Enterobacteriaceae
## 5
## k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Coriobacteriaceae
## 4
## k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Streptococcaceae
## 4
## k__Bacteria|p__Proteobacteria|c__Epsilonproteobacteria|o__Campylobacterales|f__Campylobacteraceae|g__Campylobacter
## 4
## k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacterales|f__Enterobacteriaceae|s__Escherichia/Shigella sp.
## 4
## k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas viridiflava
## 4
## k__Bacteria|p__Actinobacteria|c__Actinomycetia|o__Bifidobacteriales|f__Bifidobacteriaceae|g__Gardnerella
## 3
## k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus iners
## 3
## k__Bacteria|p__Firmicutes|c__Clostridia|o__Eubacteriales|f__Clostridiaceae|g__Clostridium
## 3
## k__Bacteria|p__Firmicutes|c__Clostridia|o__Eubacteriales|f__Lachnospiraceae|g__Lachnospira
## 3
## k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae
## 3
## k__Bacteria|p__Actinobacteria
## 2
## k__Bacteria|p__Actinobacteria|c__Actinomycetia|o__Actinomycetales|f__Actinomycetaceae|g__Actinomyces
## 2
## k__Bacteria|p__Actinobacteria|c__Actinomycetia|o__Actinomycetales|f__Actinomycetaceae|g__Mobiluncus
## 2
## k__Bacteria|p__Actinobacteria|c__Actinomycetia|o__Bifidobacteriales|f__Bifidobacteriaceae|g__Bifidobacterium
## 2
## k__Bacteria|p__Actinobacteria|c__Actinomycetia|o__Micrococcales|f__Micrococcaceae
## 2
## k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o__Bacteroidales
## 2
## k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o__Bacteroidales|f__Prevotellaceae|g__Prevotella
## 2
## k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Staphylococcaceae
## 2
## k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|g__Gemella
## 2
## k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Aerococcaceae|g__Aerococcus
## 2
## k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Enterococcaceae|g__Enterococcus
## 2
## k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae
## 2
## k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus
## 2
## k__Bacteria|p__Firmicutes|c__Clostridia|o__Eubacteriales|f__Lachnospiraceae|g__Mediterraneibacter|s__[Ruminococcus] gnavus
## 2
## k__Bacteria|p__Firmicutes|c__Clostridia|o__Eubacteriales|f__Oscillospiraceae|g__Faecalibacterium
## 2
## k__Bacteria|p__Firmicutes|c__Clostridia|o__Eubacteriales|f__Oscillospiraceae|g__Oscillospira
## 2
## k__Bacteria|p__Firmicutes|c__Clostridia|o__Eubacteriales|f__Oscillospiraceae|g__Ruminiclostridium
## 2
getMostFrequentTaxa(subset.dat,sig.type="increased")
##
## k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Streptococcaceae|g__Streptococcus
## 5
## k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacterales|f__Enterobacteriaceae
## 5
## k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Streptococcaceae
## 4
## k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacterales|f__Enterobacteriaceae|s__Escherichia/Shigella sp.
## 4
## k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Coriobacteriaceae
## 3
## k__Bacteria|p__Firmicutes|c__Clostridia|o__Eubacteriales|f__Clostridiaceae|g__Clostridium
## 3
## k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae
## 3
## k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas viridiflava
## 3
## k__Bacteria|p__Actinobacteria|c__Actinomycetia|o__Bifidobacteriales|f__Bifidobacteriaceae|g__Bifidobacterium
## 2
## k__Bacteria|p__Actinobacteria|c__Actinomycetia|o__Bifidobacteriales|f__Bifidobacteriaceae|g__Gardnerella
## 2
getMostFrequentTaxa(subset.dat,sig.type="decreased")
##
## k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Atopobiaceae|g__Atopobium
## 5
## k__Bacteria|p__Actinobacteria|c__Actinomycetia|o__Actinomycetales|f__Actinomycetaceae|g__Mobiluncus
## 2
## k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o__Bacteroidales
## 2
## k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o__Bacteroidales|f__Prevotellaceae|g__Prevotella
## 2
## k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus iners
## 2
## k__Bacteria|p__Firmicutes|c__Clostridia|o__Eubacteriales|f__Lachnospiraceae|g__Lachnospira
## 2
## k__Bacteria|p__Firmicutes|c__Negativicutes|o__Veillonellales|f__Veillonellaceae|g__Megasphaera
## 2
## k__Bacteria|p__Fusobacteria|c__Fusobacteriia|o__Fusobacteriales|f__Leptotrichiaceae|g__Sneathia
## 2
## k__Bacteria|p__Proteobacteria|c__Epsilonproteobacteria|o__Campylobacterales|f__Campylobacteraceae|g__Campylobacter
## 2
## k__Bacteria|p__Actinobacteria
## 1
Summary of studies and most frequent taxa in only samples from female reproductive tract, excluding feces samples
subset.dat2 <-
dat[which(
dat$PMID == "30778155" |
dat$PMID == "32192080" |
dat$PMID == "31087436" |
dat$PMID == "26901400" |
dat$PMID == "32046455" |
dat$PMID == "33925708" |
dat$PMID == "32299442" |
dat$PMID == "33313185" | dat$PMID == "34268384"
), ]
reproductive_sigs <-
subset.dat2[which(subset.dat2$`Body site` != "feces" |
is.na(subset.dat2$`Body site`)), ]
bugSigSimple::createStudyTable(reproductive_sigs)
## # A tibble: 9 × 5
## Study Condition Cases Controls `Study Design`
## <chr> <chr> <dbl> <dbl> <chr>
## 1 Akiyama 2019 endometriosis 30 39 cross-sectional observational, no…
## 2 Ata 2019 endometriosis 14 14 prospective cohort
## 3 Chao 2021 endometriosis 37 66 case-control
## 4 Chen 2020 endometriosis 14 67 cross-sectional observational, no…
## 5 Hernandes 2020 endometriosis 10 11 case-control
## 6 Khan 2016 endometriosis 32 32 case-control
## 7 Lee 2021 endometriosis 45 45 case-control
## 8 Perrotta 2020 endometriosis 14 21 cross-sectional observational, no…
## 9 Wei 2020 endometriosis 36 14 cross-sectional observational, no…
allfreqs <- bugSigSimple::createTaxonTable(reproductive_sigs, n = 20) %>% #could change number
arrange(I(decreased_signatures - increased_signatures))
incfreqs <- filter(allfreqs, I(increased_signatures - decreased_signatures) > 0)
decfreqs <- filter(allfreqs, I(increased_signatures - decreased_signatures) < 0)
kableExtra::kbl(allfreqs) %>%
kable_paper("hover", full_width = FALSE)
Taxon Name | Taxonomic Level | total_signatures | increased_signatures | decreased_signatures | Binomial Test pval | kingdom | phylum | class | order | family | genus | species | n_signatures | metaphlan_name |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Streptococcus | genus | 4 | 4 | 0 | 0.12 | Bacteria | Firmicutes | Bacilli | Lactobacillales | Streptococcaceae | Streptococcus | NA | 4 | k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Streptococcaceae|g__Streptococcus |
Escherichia/Shigella sp. | species | 4 | 4 | 0 | 0.12 | Bacteria | Proteobacteria | Gammaproteobacteria | Enterobacterales | Enterobacteriaceae | Escherichia/Shigella sp. | NA | 4 | k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacterales|f__Enterobacteriaceae|s__Escherichia/Shigella sp. |
Coriobacteriaceae | family | 3 | 3 | 0 | 0.25 | Bacteria | Actinobacteria | Coriobacteriia | Coriobacteriales | Coriobacteriaceae | NA | NA | 3 | k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Coriobacteriaceae |
Streptococcaceae | family | 3 | 3 | 0 | 0.25 | Bacteria | Firmicutes | Bacilli | Lactobacillales | Streptococcaceae | NA | NA | 7 | k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Streptococcaceae |
Enterobacteriaceae | family | 3 | 3 | 0 | 0.25 | Bacteria | Proteobacteria | Gammaproteobacteria | Enterobacterales | Enterobacteriaceae | NA | NA | 7 | k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacterales|f__Enterobacteriaceae |
Pseudomonadaceae | family | 3 | 3 | 0 | 0.25 | Bacteria | Proteobacteria | Gammaproteobacteria | Pseudomonadales | Pseudomonadaceae | NA | NA | 7 | k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae |
Pseudomonas viridiflava | species | 4 | 3 | 1 | 0.63 | Bacteria | Proteobacteria | Gammaproteobacteria | Pseudomonadales | Pseudomonadaceae | Pseudomonas | Pseudomonas viridiflava | 4 | k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas viridiflava |
Gardnerella | genus | 2 | 2 | 0 | 0.50 | Bacteria | Actinobacteria | Actinomycetia | Bifidobacteriales | Bifidobacteriaceae | Gardnerella | NA | 2 | k__Bacteria|p__Actinobacteria|c__Actinomycetia|o__Bifidobacteriales|f__Bifidobacteriaceae|g__Gardnerella |
Staphylococcaceae | family | 2 | 2 | 0 | 0.50 | Bacteria | Firmicutes | Bacilli | Bacillales | Staphylococcaceae | NA | NA | 4 | k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Staphylococcaceae |
Clostridium | genus | 2 | 2 | 0 | 0.50 | Bacteria | Firmicutes | Clostridia | Eubacteriales | Clostridiaceae | Clostridium | NA | 2 | k__Bacteria|p__Firmicutes|c__Clostridia|o__Eubacteriales|f__Clostridiaceae|g__Clostridium |
Faecalibacterium | genus | 2 | 2 | 0 | 0.50 | Bacteria | Firmicutes | Clostridia | Eubacteriales | Oscillospiraceae | Faecalibacterium | NA | 2 | k__Bacteria|p__Firmicutes|c__Clostridia|o__Eubacteriales|f__Oscillospiraceae|g__Faecalibacterium |
Campylobacter | genus | 4 | 2 | 2 | 1.00 | Bacteria | Proteobacteria | Epsilonproteobacteria | Campylobacterales | Campylobacteraceae | Campylobacter | NA | 4 | k__Bacteria|p__Proteobacteria|c__Epsilonproteobacteria|o__Campylobacterales|f__Campylobacteraceae|g__Campylobacter |
Micrococcaceae | family | 2 | 1 | 1 | 1.00 | Bacteria | Actinobacteria | Actinomycetia | Micrococcales | Micrococcaceae | NA | NA | 3 | k__Bacteria|p__Actinobacteria|c__Actinomycetia|o__Micrococcales|f__Micrococcaceae |
Gemella | genus | 2 | 1 | 1 | 1.00 | Bacteria | Firmicutes | Bacilli | Bacillales | Gemella | NA | NA | 2 | k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|g__Gemella |
Aerococcus | genus | 2 | 1 | 1 | 1.00 | Bacteria | Firmicutes | Bacilli | Lactobacillales | Aerococcaceae | Aerococcus | NA | 2 | k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Aerococcaceae|g__Aerococcus |
Erysipelotrichaceae | family | 2 | 1 | 1 | 1.00 | Bacteria | Firmicutes | Erysipelotrichia | Erysipelotrichales | Erysipelotrichaceae | NA | NA | 2 | k__Bacteria|p__Firmicutes|c__Erysipelotrichia|o__Erysipelotrichales|f__Erysipelotrichaceae |
Lactobacillus iners | species | 3 | 1 | 2 | 1.00 | Bacteria | Firmicutes | Bacilli | Lactobacillales | Lactobacillaceae | Lactobacillus | Lactobacillus iners | 3 | k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus iners |
Mobiluncus | genus | 2 | 0 | 2 | 0.50 | Bacteria | Actinobacteria | Actinomycetia | Actinomycetales | Actinomycetaceae | Mobiluncus | NA | 2 | k__Bacteria|p__Actinobacteria|c__Actinomycetia|o__Actinomycetales|f__Actinomycetaceae|g__Mobiluncus |
Prevotella | genus | 2 | 0 | 2 | 0.50 | Bacteria | Bacteroidetes | Bacteroidia | Bacteroidales | Prevotellaceae | Prevotella | NA | 3 | k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o__Bacteroidales|f__Prevotellaceae|g__Prevotella |
Atopobium | genus | 7 | 2 | 5 | 0.45 | Bacteria | Actinobacteria | Coriobacteriia | Coriobacteriales | Atopobiaceae | Atopobium | NA | 7 | k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Atopobiaceae|g__Atopobium |
suppressPackageStartupMessages(library(curatedMetagenomicData))
# hmp_vagina_metadata <- filter(sampleMetadata, study_name == "HMP_2012" & body_site == "vagina")
all_healthy_adult_vagina_metadata <- filter(sampleMetadata, body_site == "vagina" & disease == "healthy" & age_category == "adult")
se <-
curatedMetagenomicData::returnSamples(all_healthy_adult_vagina_metadata, dataType = "relative_abundance", counts = FALSE)
Get matrices of species and genus relative abundance
allranks <- mia::splitByRanks(se)
#species_relab <- t(assay(allranks[["species"]]))
genus_relab <- t(assay(allranks[["genus"]]))
#family_relab <- t(assay(allranks[["family"]]))
Centered log-ratio transformation on genus level relative abundance:
genus_relab_clr <- Hotelling::clr(genus_relab + 1)
df <- dplyr::select(data.frame(genus_relab_clr), c("Streptococcus", "Lactobacillus"))
df$age <- se$age
df$BMI <- se$BMI
fit <- lm(Streptococcus ~ Lactobacillus + age + BMI, data = df)
summary(fit)
##
## Call:
## lm(formula = Streptococcus ~ Lactobacillus + age + BMI, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.6728 -0.1875 -0.0264 0.0893 3.6729
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.386413 0.436694 -0.885 0.378987
## Lactobacillus -0.078095 0.047933 -1.629 0.107344
## age 0.035738 0.010073 3.548 0.000665 ***
## BMI -0.009094 0.014295 -0.636 0.526581
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4857 on 77 degrees of freedom
## (14 observations deleted due to missingness)
## Multiple R-squared: 0.1799, Adjusted R-squared: 0.1479
## F-statistic: 5.629 on 3 and 77 DF, p-value: 0.00153
##
## Call:
## lm(formula = Streptococcus ~ Lactobacillus, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.6006 -0.0609 -0.0462 -0.0440 4.1909
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.55246 0.20193 2.736 0.00745 **
## Lactobacillus -0.11409 0.04751 -2.401 0.01832 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5642 on 93 degrees of freedom
## Multiple R-squared: 0.05839, Adjusted R-squared: 0.04826
## F-statistic: 5.767 on 1 and 93 DF, p-value: 0.01832
wilcox.test(df$Streptococcus, df$Lactobacillus)
##
## Wilcoxon rank sum test with continuity correction
##
## data: df$Streptococcus and df$Lactobacillus
## W = 213.5, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
t.test(df$Streptococcus, df$Lactobacillus)
##
## Welch Two Sample t-test
##
## data: df$Streptococcus and df$Lactobacillus
## t = -28.666, df = 133.93, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -4.258800 -3.709054
## sample estimates:
## mean of x mean of y
## 0.0879008 4.0718280
secount <-
curatedMetagenomicData::returnSamples(all_healthy_adult_vagina_metadata, dataType = "relative_abundance", counts = TRUE)
## snapshotDate(): 2021-10-19
##
## $`2021-03-31.FerrettiP_2018.relative_abundance`
## dropping rows without rowTree matches:
## k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Coriobacteriaceae|g__Collinsella|s__Collinsella_stercoris
## k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Coriobacteriaceae|g__Enorma|s__[Collinsella]_massiliensis
## k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Bacillales_unclassified|g__Gemella|s__Gemella_bergeri
## k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Carnobacteriaceae|g__Granulicatella|s__Granulicatella_elegans
## k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Ruminococcaceae|g__Ruminococcus|s__Ruminococcus_champanellensis
## k__Bacteria|p__Firmicutes|c__Erysipelotrichia|o__Erysipelotrichales|f__Erysipelotrichaceae|g__Bulleidia|s__Bulleidia_extructa
## k__Bacteria|p__Proteobacteria|c__Betaproteobacteria|o__Burkholderiales|f__Sutterellaceae|g__Sutterella|s__Sutterella_parvirubra
## k__Bacteria|p__Spirochaetes|c__Spirochaetia|o__Leptospirales|f__Leptospiraceae|g__Leptospira|s__Leptospira_hartskeerlii
## k__Bacteria|p__Synergistetes|c__Synergistia|o__Synergistales|f__Synergistaceae|g__Cloacibacillus|s__Cloacibacillus_evryensis
## $`2021-03-31.HMP_2012.relative_abundance`
## dropping rows without rowTree matches:
## k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Atopobiaceae|g__Olsenella|s__Olsenella_profusa
## k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Coriobacteriaceae|g__Collinsella|s__Collinsella_stercoris
## k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Coriobacteriaceae|g__Enorma|s__[Collinsella]_massiliensis
## k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Bacillales_unclassified|g__Gemella|s__Gemella_bergeri
## k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Carnobacteriaceae|g__Granulicatella|s__Granulicatella_elegans
## k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Ruminococcaceae|g__Ruminococcus|s__Ruminococcus_champanellensis
## k__Bacteria|p__Firmicutes|c__Erysipelotrichia|o__Erysipelotrichales|f__Erysipelotrichaceae|g__Bulleidia|s__Bulleidia_extructa
## k__Bacteria|p__Proteobacteria|c__Betaproteobacteria|o__Burkholderiales|f__Sutterellaceae|g__Sutterella|s__Sutterella_parvirubra
## k__Bacteria|p__Synergistetes|c__Synergistia|o__Synergistales|f__Synergistaceae|g__Cloacibacillus|s__Cloacibacillus_evryensis
## $`2021-03-31.WampachL_2018.relative_abundance`
## dropping rows without rowTree matches:
## k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Coriobacteriaceae|g__Collinsella|s__Collinsella_stercoris
## k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Coriobacteriaceae|g__Enorma|s__[Collinsella]_massiliensis
## k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Carnobacteriaceae|g__Granulicatella|s__Granulicatella_elegans
## k__Bacteria|p__Proteobacteria|c__Betaproteobacteria|o__Burkholderiales|f__Sutterellaceae|g__Sutterella|s__Sutterella_parvirubra
## k__Bacteria|p__Synergistetes|c__Synergistia|o__Synergistales|f__Synergistaceae|g__Cloacibacillus|s__Cloacibacillus_evryensis
allranks <- mia::splitByRanks(secount)
genus_relab_count <- t(assay(allranks[["genus"]]))
df <- dplyr::select(data.frame(genus_relab_count), c("Streptococcus", "Lactobacillus"))
df$age <- secount$age
df$BMI <- secount$BMI
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
fit.ZInegbin <- pscl::zeroinfl(Streptococcus ~ Lactobacillus + age + BMI | 1,
data = df,
dist = "negbin")
library(curatedMetagenomicData)
library(dplyr)
library(DT)
library(purrr)
taxon_strep <- c("1884" = "Streptococcus")
taxon_lact <- c("1578" = "Lactobacillus")
target_sample <- "vagina"
sample_metadata <- sampleMetadata %>%
filter(
grepl(target_sample, body_site) | grepl(target_sample, body_subsite)
) %>%
discard(~all(is.na(.x))) %>%
as_tibble()
dim(sample_metadata)
## [1] 96 26
unique(sample_metadata$study_name)
## [1] "FerrettiP_2018" "HMP_2012" "WampachL_2018"
tse <- returnSamples(
sampleMetadata = sample_metadata,
dataType = "relative_abundance",
counts = FALSE,
rownames = "short"
)
## snapshotDate(): 2021-10-19
##
## $`2021-03-31.FerrettiP_2018.relative_abundance`
## dropping rows without rowTree matches:
## k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Coriobacteriaceae|g__Collinsella|s__Collinsella_stercoris
## k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Coriobacteriaceae|g__Enorma|s__[Collinsella]_massiliensis
## k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Bacillales_unclassified|g__Gemella|s__Gemella_bergeri
## k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Carnobacteriaceae|g__Granulicatella|s__Granulicatella_elegans
## k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Ruminococcaceae|g__Ruminococcus|s__Ruminococcus_champanellensis
## k__Bacteria|p__Firmicutes|c__Erysipelotrichia|o__Erysipelotrichales|f__Erysipelotrichaceae|g__Bulleidia|s__Bulleidia_extructa
## k__Bacteria|p__Proteobacteria|c__Betaproteobacteria|o__Burkholderiales|f__Sutterellaceae|g__Sutterella|s__Sutterella_parvirubra
## k__Bacteria|p__Spirochaetes|c__Spirochaetia|o__Leptospirales|f__Leptospiraceae|g__Leptospira|s__Leptospira_hartskeerlii
## k__Bacteria|p__Synergistetes|c__Synergistia|o__Synergistales|f__Synergistaceae|g__Cloacibacillus|s__Cloacibacillus_evryensis
## $`2021-03-31.HMP_2012.relative_abundance`
## dropping rows without rowTree matches:
## k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Atopobiaceae|g__Olsenella|s__Olsenella_profusa
## k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Coriobacteriaceae|g__Collinsella|s__Collinsella_stercoris
## k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Coriobacteriaceae|g__Enorma|s__[Collinsella]_massiliensis
## k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Bacillales_unclassified|g__Gemella|s__Gemella_bergeri
## k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Carnobacteriaceae|g__Granulicatella|s__Granulicatella_elegans
## k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Ruminococcaceae|g__Ruminococcus|s__Ruminococcus_champanellensis
## k__Bacteria|p__Firmicutes|c__Erysipelotrichia|o__Erysipelotrichales|f__Erysipelotrichaceae|g__Bulleidia|s__Bulleidia_extructa
## k__Bacteria|p__Proteobacteria|c__Betaproteobacteria|o__Burkholderiales|f__Sutterellaceae|g__Sutterella|s__Sutterella_parvirubra
## k__Bacteria|p__Synergistetes|c__Synergistia|o__Synergistales|f__Synergistaceae|g__Cloacibacillus|s__Cloacibacillus_evryensis
## $`2021-03-31.WampachL_2018.relative_abundance`
## dropping rows without rowTree matches:
## k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Coriobacteriaceae|g__Collinsella|s__Collinsella_stercoris
## k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Coriobacteriaceae|g__Enorma|s__[Collinsella]_massiliensis
## k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Carnobacteriaceae|g__Granulicatella|s__Granulicatella_elegans
## k__Bacteria|p__Proteobacteria|c__Betaproteobacteria|o__Burkholderiales|f__Sutterellaceae|g__Sutterella|s__Sutterella_parvirubra
## k__Bacteria|p__Synergistetes|c__Synergistia|o__Synergistales|f__Synergistaceae|g__Cloacibacillus|s__Cloacibacillus_evryensis
tse
## class: TreeSummarizedExperiment
## dim: 902 96
## metadata(0):
## assays(1): relative_abundance
## rownames(902): Escherichia coli Klebsiella michiganensis ...
## Pediococcus pentosaceus Rahnella aquatilis
## rowData names(7): superkingdom phylum ... genus species
## colnames(96): CA_C10001MS2003VA_t0M15 CA_C10003MS2031VA_t0M15 ...
## MV_C118 MV_C120
## colData names(25): study_name subject_id ... family_role body_subsite
## reducedDimNames(0):
## mainExpName: NULL
## altExpNames(0):
## rowLinks: a LinkDataFrame (902 rows)
## rowTree: 1 phylo tree(s) (10430 leaves)
## colLinks: NULL
## colTree: NULL
## [1] 4049.558
var(df$Streptococcus)
## [1] 509068831
## [1] 0.8736842
#Fit log-linear models
fit.pois <- glm(Streptococcus ~ Lactobacillus + age + BMI,
data = df,
family = poisson(link = "log"))
summary (fit.pois)
##
## Call:
## glm(formula = Streptococcus ~ Lactobacillus + age + BMI, family = poisson(link = "log"),
## data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -206.14 -36.50 -19.90 -10.99 541.19
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.313e-01 1.873e-02 -28.36 <2e-16 ***
## Lactobacillus 5.141e-08 9.828e-10 52.32 <2e-16 ***
## age 3.599e-01 4.136e-04 870.25 <2e-16 ***
## BMI -1.573e-01 6.453e-04 -243.77 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 1929049 on 80 degrees of freedom
## Residual deviance: 632598 on 77 degrees of freedom
## (14 observations deleted due to missingness)
## AIC: 632704
##
## Number of Fisher Scoring iterations: 20
library(pscl)
fit.ZIpois <- zeroinfl(Streptococcus ~ Lactobacillus + age + BMI | 1,
data = df,
dist = "poisson")
summary (fit.ZIpois)
##
## Call:
## zeroinfl(formula = Streptococcus ~ Lactobacillus + age + BMI | 1, data = df,
## dist = "poisson")
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -0.3753 -0.3753 -0.3752 -0.3750 27.8168
##
## Count model coefficients (poisson with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 6.169e+00 NaN NaN NaN
## Lactobacillus 1.321e-07 NaN NaN NaN
## age 2.420e-01 NaN NaN NaN
## BMI -2.116e-01 NaN NaN NaN
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.96 NaN NaN NaN
##
## Number of iterations in BFGS optimization: 1
## Log-likelihood: -1.02e+05 on 5 Df
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
fit.negbin <-
glm.nb(
Streptococcus ~ Lactobacillus + age + BMI,
data = df,
control = glm.control(
epsilon = 1e-8,
maxit = 25,
trace = FALSE
)
)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning: glm.fit: algorithm did not converge
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning: glm.fit: algorithm did not converge
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning: glm.fit: algorithm did not converge
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning: glm.fit: algorithm did not converge
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning: glm.fit: algorithm did not converge
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in glm.nb(Streptococcus ~ Lactobacillus + age + BMI, data = df, :
## alternation limit reached
summary(fit.negbin)
##
## Call:
## glm.nb(formula = Streptococcus ~ Lactobacillus + age + BMI, data = df,
## control = glm.control(epsilon = 1e-08, maxit = 25, trace = FALSE),
## init.theta = 0.01199558153, link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -206.1 -36.5 -19.9 -11.0 541.2
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.313e-01 1.873e-02 -28.36 <2e-16 ***
## Lactobacillus 5.141e-08 9.828e-10 52.32 <2e-16 ***
## age 3.599e-01 4.136e-04 870.25 <2e-16 ***
## BMI -1.573e-01 6.453e-04 -243.77 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(5.380153e+14) family taken to be 1)
##
## Null deviance: 1929048 on 80 degrees of freedom
## Residual deviance: 632598 on 77 degrees of freedom
## (14 observations deleted due to missingness)
## AIC: 280.39
##
## Number of Fisher Scoring iterations: 5
##
##
## Theta: 0.01200
## Std. Err.: 0.00395
## Warning while fitting theta: alternation limit reached
##
## 2 x log-likelihood: -270.38700
fit.ZInegbin <- zeroinfl(Streptococcus ~ Lactobacillus + age + BMI | 1,
data = df,
dist = "negbin")
summary(fit.ZInegbin)
##
## Call:
## zeroinfl(formula = Streptococcus ~ Lactobacillus + age + BMI | 1, data = df,
## dist = "negbin")
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -0.1898 -0.1898 -0.1898 -0.1898 7.0566
##
## Count model coefficients (negbin with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.945e+01 NaN NaN NaN
## Lactobacillus -2.787e-07 NaN NaN NaN
## age 2.837e-02 NaN NaN NaN
## BMI -4.085e-01 NaN NaN NaN
## Log(theta) -9.837e-01 NaN NaN NaN
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.921 NaN NaN NaN
##
## Theta = 0.3739
## Number of iterations in BFGS optimization: 14
## Log-likelihood: -131.8 on 6 Df