Analysis objectives

  1. Import, recode, and subset data from bugsigdb.org
  2. Create a table of studies
  3. Calculate the frequency of appearance of each taxa in independent signatures and identify the most frequently reported taxa
  4. Estimate the probability of the most frequently identified taxa occuring by chance

Making sure packages are installed

install.packages(c("devtools", "tidyverse", "kableExtra"))
devtools::install_github("waldronlab/bugSigSimple")
devtools::install_github("waldronlab/BugSigDBStats")
devtools::install_github("waldronlab/bugsigdbr")

Load and subset data

## 
## Attaching package: 'BugSigDBStats'
## The following object is masked from 'package:bugSigSimple':
## 
##     getMostFrequentTaxa
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
##  dplyr     1.1.4      readr     2.1.5
##  forcats   1.0.0      stringr   1.5.1
##  ggplot2   3.5.1      tibble    3.2.1
##  lubridate 1.9.4      tidyr     1.3.1
##  purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
##  dplyr::filter() masks stats::filter()
##  dplyr::lag()    masks stats::lag()
##  Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(bugSigSimple)
dat <- bugsigdbr::importBugSigDB(cache= TRUE)
## Using cached version from 2025-01-30 22:02:34
dim(dat)
## [1] 5520   50
names(dat)
##  [1] "BSDB ID"                    "Study"                     
##  [3] "Study design"               "PMID"                      
##  [5] "DOI"                        "URL"                       
##  [7] "Authors list"               "Title"                     
##  [9] "Journal"                    "Year"                      
## [11] "Keywords"                   "Experiment"                
## [13] "Location of subjects"       "Host species"              
## [15] "Body site"                  "UBERON ID"                 
## [17] "Condition"                  "EFO ID"                    
## [19] "Group 0 name"               "Group 1 name"              
## [21] "Group 1 definition"         "Group 0 sample size"       
## [23] "Group 1 sample size"        "Antibiotics exclusion"     
## [25] "Sequencing type"            "16S variable region"       
## [27] "Sequencing platform"        "Statistical test"          
## [29] "Significance threshold"     "MHT correction"            
## [31] "LDA Score above"            "Matched on"                
## [33] "Confounders controlled for" "Pielou"                    
## [35] "Shannon"                    "Chao1"                     
## [37] "Simpson"                    "Inverse Simpson"           
## [39] "Richness"                   "Signature page name"       
## [41] "Source"                     "Curated date"              
## [43] "Curator"                    "Revision editor"           
## [45] "Description"                "Abundance in Group 1"      
## [47] "MetaPhlAn taxon names"      "NCBI Taxonomy IDs"         
## [49] "State"                      "Reviewer"

Subsetting

Subsetting only studies done on humans - 6 of 17 studies were excluded because they were either done on mice or not statistically reliable (Cregger et. al)

subset.dat <-
  dat[which(
    dat$PMID == "30778155" |
      dat$PMID == "32192080" |
      dat$PMID == "31087436" |
      dat$PMID == "26901400" |
      dat$PMID == "33839907" |
      dat$PMID == "32046455" |
      dat$PMID == "33925708" |
      dat$PMID == "32299442" |
      dat$PMID == "33313185" |
      dat$PMID == "34268384" | dat$PMID == "33660232"
  ), ]

All studies

Summary of studies and most frequent taxa increased and decreased in endometriosis patients for all studies

bugSigSimple::createStudyTable(subset.dat)
## # A tibble: 11 × 5
##    Study          Condition                 Cases Controls `Study Design`       
##    <chr>          <chr>                     <dbl>    <dbl> <chr>                
##  1 Akiyama 2019   Endometriosis                30       39 cross-sectional obse…
##  2 Ata 2019       Endometriosis                14       14 prospective cohort   
##  3 Chao 2021      Endometriosis                37       66 case-control         
##  4 Chen 2020      Adenomyosis,Endometriosis    14      120 case-control         
##  5 Hernandes 2020 Endometriosis                21       11 case-control         
##  6 Khan 2016      Endometriosis                32       32 case-control         
##  7 Lee 2021       Endometriosis                45       45 case-control         
##  8 Perrotta 2020  Endometriosis                 8       12 cross-sectional obse…
##  9 Shan 2021      Endometriosis                12       12 case-control         
## 10 Svensson 2021  Endometriosis                66      198 case-control         
## 11 Wei 2020       Endometriosis                36       14 cross-sectional obse…
getMostFrequentTaxa(subset.dat, n=30)
##    1301    1380    1578     194     543     838   84107  119852    1300    1350 
##       7       5       4       4       4       4       4       3       3       3 
##  135621    1375  171549 1940338    2050   28050     286   39948     469     906 
##       3       3       3       3       3       3       3       3       3       3 
##   13687 1508657 1582879    1654    1663  168808    1743  216851    2701   33038 
##       2       2       2       2       2       2       2       2       2       2
getMostFrequentTaxa(subset.dat, direction="UP")
##    1301     543  119852    1300    1350  135621    1578 1940338     286     469 
##       6       4       3       3       3       3       3       3       3       3
getMostFrequentTaxa(subset.dat, direction="DOWN")
##   1380 168808 171549    194   2050  28050  33958    838    906   1033 
##      3      2      2      2      2      2      2      2      2      1

Excluding feces samples

Summary of studies and most frequent taxa in only samples from female reproductive tract, excluding feces samples

subset.dat2 <-
  dat[which(
    dat$PMID == "30778155" |
      dat$PMID == "32192080" |
      dat$PMID == "31087436" |
      dat$PMID == "26901400" |
      dat$PMID == "32046455" |
      dat$PMID == "33925708" |
      dat$PMID == "32299442" |
      dat$PMID == "33313185" | dat$PMID == "34268384"
  ), ]

reproductive_sigs <-
  subset.dat2[which(subset.dat2$`Body site` != "feces" |
                      is.na(subset.dat2$`Body site`)), ]

bugSigSimple::createStudyTable(reproductive_sigs)
## # A tibble: 9 × 5
##   Study          Condition                 Cases Controls `Study Design`        
##   <chr>          <chr>                     <dbl>    <dbl> <chr>                 
## 1 Akiyama 2019   Endometriosis                30       39 cross-sectional obser…
## 2 Ata 2019       Endometriosis                14       14 prospective cohort    
## 3 Chao 2021      Endometriosis                37       66 case-control          
## 4 Chen 2020      Adenomyosis,Endometriosis    14      120 case-control          
## 5 Hernandes 2020 Endometriosis                21       11 case-control          
## 6 Khan 2016      Endometriosis                32       32 case-control          
## 7 Lee 2021       Endometriosis                45       45 case-control          
## 8 Perrotta 2020  Endometriosis                 8       12 cross-sectional obser…
## 9 Wei 2020       Endometriosis                36       14 cross-sectional obser…
allfreqs <- bugSigSimple::createTaxonTable(reproductive_sigs, n = 20) %>% #could change number
  arrange(I(decreased_signatures - increased_signatures))
incfreqs <- filter(allfreqs, I(increased_signatures - decreased_signatures) > 0)
decfreqs <- filter(allfreqs, I(increased_signatures - decreased_signatures) < 0)
kableExtra::kbl(allfreqs) %>%
  kable_paper("hover", full_width = FALSE)
Taxon Name Taxonomic Level total_signatures increased_signatures decreased_signatures Binomial Test pval kingdom phylum class order family genus species n_signatures metaphlan_name
Streptococcus genus 6 5 1 0.22 Bacteria Bacillota Bacilli Lactobacillales Streptococcaceae Streptococcus NA 6 k__Bacteria|p__Bacillota|c__Bacilli|o__Lactobacillales|f__Streptococcaceae|g__Streptococcus
Coriobacteriaceae family 3 3 0 0.25 Bacteria Actinomycetota Coriobacteriia Coriobacteriales Coriobacteriaceae NA NA 3 k__Bacteria|p__Actinomycetota|c__Coriobacteriia|o__Coriobacteriales|f__Coriobacteriaceae
Streptococcaceae family 3 3 0 0.25 Bacteria Bacillota Bacilli Lactobacillales Streptococcaceae NA NA 9 k__Bacteria|p__Bacillota|c__Bacilli|o__Lactobacillales|f__Streptococcaceae
Enterobacteriaceae family 3 3 0 0.25 Bacteria Pseudomonadota Gammaproteobacteria Enterobacterales Enterobacteriaceae NA NA 6 k__Bacteria|p__Pseudomonadota|c__Gammaproteobacteria|o__Enterobacterales|f__Enterobacteriaceae
Escherichia/Shigella sp. species 3 3 0 0.25 Bacteria Pseudomonadota Gammaproteobacteria Enterobacterales Enterobacteriaceae Escherichia/Shigella sp. NA 3 k__Bacteria|p__Pseudomonadota|c__Gammaproteobacteria|o__Enterobacterales|f__Enterobacteriaceae|s__Escherichia/Shigella sp.
Acinetobacter genus 3 3 0 0.25 Bacteria Pseudomonadota Gammaproteobacteria Moraxellales Moraxellaceae Acinetobacter NA 3 k__Bacteria|p__Pseudomonadota|c__Gammaproteobacteria|o__Moraxellales|f__Moraxellaceae|g__Acinetobacter
Pseudomonadaceae family 3 3 0 0.25 Bacteria Pseudomonadota Gammaproteobacteria Pseudomonadales Pseudomonadaceae NA NA 4 k__Bacteria|p__Pseudomonadota|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae
Pseudomonas genus 3 3 0 0.25 Bacteria Pseudomonadota Gammaproteobacteria Pseudomonadales Pseudomonadaceae Pseudomonas NA 3 k__Bacteria|p__Pseudomonadota|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas
Alloscardovia genus 2 2 0 0.50 Bacteria Actinomycetota Actinomycetes Bifidobacteriales Bifidobacteriaceae Alloscardovia NA 2 k__Bacteria|p__Actinomycetota|c__Actinomycetes|o__Bifidobacteriales|f__Bifidobacteriaceae|g__Alloscardovia
Arthrobacter genus 2 2 0 0.50 Bacteria Actinomycetota Actinomycetes Micrococcales Micrococcaceae Arthrobacter NA 2 k__Bacteria|p__Actinomycetota|c__Actinomycetes|o__Micrococcales|f__Micrococcaceae|g__Arthrobacter
Aerococcus genus 3 2 1 1.00 Bacteria Bacillota Bacilli Lactobacillales Aerococcaceae Aerococcus NA 3 k__Bacteria|p__Bacillota|c__Bacilli|o__Lactobacillales|f__Aerococcaceae|g__Aerococcus
Lactobacillus genus 3 2 1 1.00 Bacteria Bacillota Bacilli Lactobacillales Lactobacillaceae Lactobacillus NA 4 k__Bacteria|p__Bacillota|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus
Dialister genus 3 2 1 1.00 Bacteria Bacillota Negativicutes Veillonellales Veillonellaceae Dialister NA 3 k__Bacteria|p__Bacillota|c__Negativicutes|o__Veillonellales|f__Veillonellaceae|g__Dialister
Prevotella genus 4 2 2 1.00 Bacteria Bacteroidota Bacteroidia Bacteroidales Prevotellaceae Prevotella NA 4 k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Bacteroidales|f__Prevotellaceae|g__Prevotella
Campylobacter genus 4 2 2 1.00 Bacteria Campylobacterota Epsilonproteobacteria Campylobacterales Campylobacteraceae Campylobacter NA 4 k__Bacteria|p__Campylobacterota|c__Epsilonproteobacteria|o__Campylobacterales|f__Campylobacteraceae|g__Campylobacter
Gardnerella genus 2 1 1 1.00 Bacteria Actinomycetota Actinomycetes Bifidobacteriales Bifidobacteriaceae Gardnerella NA 2 k__Bacteria|p__Actinomycetota|c__Actinomycetes|o__Bifidobacteriales|f__Bifidobacteriaceae|g__Gardnerella
Propionibacterium genus 2 1 1 1.00 Bacteria Actinomycetota Actinomycetes Propionibacteriales Propionibacteriaceae Propionibacterium NA 2 k__Bacteria|p__Actinomycetota|c__Actinomycetes|o__Propionibacteriales|f__Propionibacteriaceae|g__Propionibacterium
Atopobium genus 5 2 3 1.00 Bacteria Actinomycetota Coriobacteriia Coriobacteriales Atopobiaceae Atopobium NA 5 k__Bacteria|p__Actinomycetota|c__Coriobacteriia|o__Coriobacteriales|f__Atopobiaceae|g__Atopobium
Mobiluncus genus 3 1 2 1.00 Bacteria Actinomycetota Actinomycetes Actinomycetales Actinomycetaceae Mobiluncus NA 3 k__Bacteria|p__Actinomycetota|c__Actinomycetes|o__Actinomycetales|f__Actinomycetaceae|g__Mobiluncus
Megasphaera genus 3 1 2 1.00 Bacteria Bacillota Negativicutes Veillonellales Veillonellaceae Megasphaera NA 3 k__Bacteria|p__Bacillota|c__Negativicutes|o__Veillonellales|f__Veillonellaceae|g__Megasphaera

Load HMP data from curatedMetagenomicData

suppressPackageStartupMessages(library(curatedMetagenomicData))
# hmp_vagina_metadata <- filter(sampleMetadata, study_name == "HMP_2012" & body_site == "vagina")
all_healthy_adult_vagina_metadata <- filter(sampleMetadata, body_site == "vagina" & disease == "healthy" & age_category == "adult")
se <-
  curatedMetagenomicData::returnSamples(all_healthy_adult_vagina_metadata, dataType = "relative_abundance", counts = FALSE)
## Warning: There was 1 warning in `mutate()`.
##  In argument: `across(.fns = ~replace_na(.x, 0))`.
## Caused by warning:
## ! Using `across()` without supplying `.cols` was deprecated in dplyr 1.1.0.
##  Please supply `.cols` instead.

Get matrices of species and genus relative abundance

allranks <- mia::splitByRanks(se)
#species_relab <- t(assay(allranks[["species"]]))
genus_relab <- t(assay(allranks[["genus"]]))
#family_relab <- t(assay(allranks[["family"]]))

Centered log-ratio transformation on genus level relative abundance:

genus_relab_clr <- Hotelling::clr(genus_relab + 1)
df <- dplyr::select(data.frame(genus_relab_clr), c("Streptococcus", "Lactobacillus"))
df$age <- se$age
df$BMI <- se$BMI
fit <- lm(Streptococcus ~ Lactobacillus + age + BMI, data = df)  
summary(fit)
## 
## Call:
## lm(formula = Streptococcus ~ Lactobacillus + age + BMI, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.06004 -0.02525 -0.01591 -0.00853  0.71736 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)
## (Intercept)   -0.112785   0.117024  -0.964    0.339
## Lactobacillus  0.008483   0.015007   0.565    0.574
## age            0.001061   0.003520   0.301    0.764
## BMI            0.001921   0.004362   0.440    0.661
## 
## Residual standard error: 0.1149 on 61 degrees of freedom
##   (30 observations deleted due to missingness)
## Multiple R-squared:  0.01484,    Adjusted R-squared:  -0.03361 
## F-statistic: 0.3064 on 3 and 61 DF,  p-value: 0.8207
summary(lm(Streptococcus ~ Lactobacillus, data = df))
## 
## Call:
## lm(formula = Streptococcus ~ Lactobacillus, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.6006 -0.0609 -0.0462 -0.0440  4.1909 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept)    0.55246    0.20193   2.736  0.00745 **
## Lactobacillus -0.11409    0.04751  -2.401  0.01832 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5642 on 93 degrees of freedom
## Multiple R-squared:  0.05839,    Adjusted R-squared:  0.04826 
## F-statistic: 5.767 on 1 and 93 DF,  p-value: 0.01832
wilcox.test(df$Streptococcus, df$Lactobacillus)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  df$Streptococcus and df$Lactobacillus
## W = 213.5, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
t.test(df$Streptococcus, df$Lactobacillus)
## 
##  Welch Two Sample t-test
## 
## data:  df$Streptococcus and df$Lactobacillus
## t = -28.666, df = 133.93, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -4.258800 -3.709054
## sample estimates:
## mean of x mean of y 
## 0.0879008 4.0718280
secount <-
  curatedMetagenomicData::returnSamples(all_healthy_adult_vagina_metadata, dataType = "relative_abundance", counts = TRUE)
## 
## $`2021-03-31.FerrettiP_2018.relative_abundance`
## dropping rows without rowTree matches:
##   k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Coriobacteriaceae|g__Collinsella|s__Collinsella_stercoris
##   k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Coriobacteriaceae|g__Enorma|s__[Collinsella]_massiliensis
##   k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Bacillales_unclassified|g__Gemella|s__Gemella_bergeri
##   k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Carnobacteriaceae|g__Granulicatella|s__Granulicatella_elegans
##   k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Ruminococcaceae|g__Ruminococcus|s__Ruminococcus_champanellensis
##   k__Bacteria|p__Firmicutes|c__Erysipelotrichia|o__Erysipelotrichales|f__Erysipelotrichaceae|g__Bulleidia|s__Bulleidia_extructa
##   k__Bacteria|p__Proteobacteria|c__Betaproteobacteria|o__Burkholderiales|f__Sutterellaceae|g__Sutterella|s__Sutterella_parvirubra
##   k__Bacteria|p__Spirochaetes|c__Spirochaetia|o__Leptospirales|f__Leptospiraceae|g__Leptospira|s__Leptospira_hartskeerlii
##   k__Bacteria|p__Synergistetes|c__Synergistia|o__Synergistales|f__Synergistaceae|g__Cloacibacillus|s__Cloacibacillus_evryensis
## $`2021-03-31.HMP_2012.relative_abundance`
## dropping rows without rowTree matches:
##   k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Atopobiaceae|g__Olsenella|s__Olsenella_profusa
##   k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Coriobacteriaceae|g__Collinsella|s__Collinsella_stercoris
##   k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Coriobacteriaceae|g__Enorma|s__[Collinsella]_massiliensis
##   k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Bacillales_unclassified|g__Gemella|s__Gemella_bergeri
##   k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Carnobacteriaceae|g__Granulicatella|s__Granulicatella_elegans
##   k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Ruminococcaceae|g__Ruminococcus|s__Ruminococcus_champanellensis
##   k__Bacteria|p__Firmicutes|c__Erysipelotrichia|o__Erysipelotrichales|f__Erysipelotrichaceae|g__Bulleidia|s__Bulleidia_extructa
##   k__Bacteria|p__Proteobacteria|c__Betaproteobacteria|o__Burkholderiales|f__Sutterellaceae|g__Sutterella|s__Sutterella_parvirubra
##   k__Bacteria|p__Synergistetes|c__Synergistia|o__Synergistales|f__Synergistaceae|g__Cloacibacillus|s__Cloacibacillus_evryensis
## $`2021-03-31.WampachL_2018.relative_abundance`
## dropping rows without rowTree matches:
##   k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Coriobacteriaceae|g__Collinsella|s__Collinsella_stercoris
##   k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Coriobacteriaceae|g__Enorma|s__[Collinsella]_massiliensis
##   k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Carnobacteriaceae|g__Granulicatella|s__Granulicatella_elegans
##   k__Bacteria|p__Proteobacteria|c__Betaproteobacteria|o__Burkholderiales|f__Sutterellaceae|g__Sutterella|s__Sutterella_parvirubra
##   k__Bacteria|p__Synergistetes|c__Synergistia|o__Synergistales|f__Synergistaceae|g__Cloacibacillus|s__Cloacibacillus_evryensis
allranks <- mia::splitByRanks(secount)
genus_relab_count <- t(assay(allranks[["genus"]]))
df <- dplyr::select(data.frame(genus_relab_count), c("Streptococcus", "Lactobacillus"))
df$age <- secount$age
df$BMI <- secount$BMI
## Classes and Methods for R originally developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University (2002-2015),
## by and under the direction of Simon Jackman.
## hurdle and zeroinfl functions by Achim Zeileis.
fit.ZInegbin <- pscl::zeroinfl(Streptococcus ~ Lactobacillus + age + BMI | 1,
                data = df,
                dist = "negbin")
## Warning: glm.fit: algorithm did not converge

Create a data frame for regression analysis from healthy controls for log-linear model

library(curatedMetagenomicData)
library(dplyr)
library(DT)
library(purrr)

taxon_strep <- c("1884" = "Streptococcus")
taxon_lact <- c("1578" = "Lactobacillus")

target_sample <- "vagina"
sample_metadata <- sampleMetadata %>% 
    filter(
        grepl(target_sample, body_site) | grepl(target_sample, body_subsite)
    ) %>% 
    discard(~all(is.na(.x))) %>% 
    as_tibble()
dim(sample_metadata)
## [1] 96 26
unique(sample_metadata$study_name)
## [1] "FerrettiP_2018" "HMP_2012"       "WampachL_2018"
tse <- returnSamples(
    sampleMetadata = sample_metadata,
    dataType = "relative_abundance",
    counts = FALSE,
    rownames = "short"
)
## 
## $`2021-03-31.FerrettiP_2018.relative_abundance`
## dropping rows without rowTree matches:
##   k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Coriobacteriaceae|g__Collinsella|s__Collinsella_stercoris
##   k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Coriobacteriaceae|g__Enorma|s__[Collinsella]_massiliensis
##   k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Bacillales_unclassified|g__Gemella|s__Gemella_bergeri
##   k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Carnobacteriaceae|g__Granulicatella|s__Granulicatella_elegans
##   k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Ruminococcaceae|g__Ruminococcus|s__Ruminococcus_champanellensis
##   k__Bacteria|p__Firmicutes|c__Erysipelotrichia|o__Erysipelotrichales|f__Erysipelotrichaceae|g__Bulleidia|s__Bulleidia_extructa
##   k__Bacteria|p__Proteobacteria|c__Betaproteobacteria|o__Burkholderiales|f__Sutterellaceae|g__Sutterella|s__Sutterella_parvirubra
##   k__Bacteria|p__Spirochaetes|c__Spirochaetia|o__Leptospirales|f__Leptospiraceae|g__Leptospira|s__Leptospira_hartskeerlii
##   k__Bacteria|p__Synergistetes|c__Synergistia|o__Synergistales|f__Synergistaceae|g__Cloacibacillus|s__Cloacibacillus_evryensis
## $`2021-03-31.HMP_2012.relative_abundance`
## dropping rows without rowTree matches:
##   k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Atopobiaceae|g__Olsenella|s__Olsenella_profusa
##   k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Coriobacteriaceae|g__Collinsella|s__Collinsella_stercoris
##   k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Coriobacteriaceae|g__Enorma|s__[Collinsella]_massiliensis
##   k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Bacillales_unclassified|g__Gemella|s__Gemella_bergeri
##   k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Carnobacteriaceae|g__Granulicatella|s__Granulicatella_elegans
##   k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Ruminococcaceae|g__Ruminococcus|s__Ruminococcus_champanellensis
##   k__Bacteria|p__Firmicutes|c__Erysipelotrichia|o__Erysipelotrichales|f__Erysipelotrichaceae|g__Bulleidia|s__Bulleidia_extructa
##   k__Bacteria|p__Proteobacteria|c__Betaproteobacteria|o__Burkholderiales|f__Sutterellaceae|g__Sutterella|s__Sutterella_parvirubra
##   k__Bacteria|p__Synergistetes|c__Synergistia|o__Synergistales|f__Synergistaceae|g__Cloacibacillus|s__Cloacibacillus_evryensis
## $`2021-03-31.WampachL_2018.relative_abundance`
## dropping rows without rowTree matches:
##   k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Coriobacteriaceae|g__Collinsella|s__Collinsella_stercoris
##   k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Coriobacteriaceae|g__Enorma|s__[Collinsella]_massiliensis
##   k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Carnobacteriaceae|g__Granulicatella|s__Granulicatella_elegans
##   k__Bacteria|p__Proteobacteria|c__Betaproteobacteria|o__Burkholderiales|f__Sutterellaceae|g__Sutterella|s__Sutterella_parvirubra
##   k__Bacteria|p__Synergistetes|c__Synergistia|o__Synergistales|f__Synergistaceae|g__Cloacibacillus|s__Cloacibacillus_evryensis
tse
## class: TreeSummarizedExperiment 
## dim: 902 96 
## metadata(0):
## assays(1): relative_abundance
## rownames(902): [Clostridium] innocuum [Clostridium] leptum ... Rahnella
##   aquatilis Rothia kristinae
## rowData names(7): superkingdom phylum ... genus species
## colnames(96): CA_C10001MS2003VA_t0M15 CA_C10003MS2031VA_t0M15 ...
##   MV_C118 MV_C120
## colData names(25): study_name subject_id ... family_role body_subsite
## reducedDimNames(0):
## mainExpName: NULL
## altExpNames(0):
## rowLinks: a LinkDataFrame (902 rows)
## rowTree: 1 phylo tree(s) (10430 leaves)
## colLinks: NULL
## colTree: NULL

Comparing mean to variance of the outcome variable

library(ggplot2)
mean(df$Streptococcus)
## [1] 4049.558
var(df$Streptococcus)
## [1] 509068831
fraczeroes <- nrow(df[df$Streptococcus == 0,])/nrow(df)
fraczeroes
## [1] 0.8736842

#Fit log-linear models

fit.pois <- glm(Streptococcus ~ Lactobacillus + age + BMI,
                data = df,
                family = poisson(link = "log"))
## Warning: glm.fit: algorithm did not converge
summary (fit.pois)
## 
## Call:
## glm(formula = Streptococcus ~ Lactobacillus + age + BMI, family = poisson(link = "log"), 
##     data = df)
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    5.211e+00  3.056e-02 170.485  < 2e-16 ***
## Lactobacillus  2.307e-08  6.221e-10  37.085  < 2e-16 ***
## age            5.232e-02  1.049e-03  49.868  < 2e-16 ***
## BMI           -4.464e-03  1.356e-03  -3.291 0.000999 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 339596  on 64  degrees of freedom
## Residual deviance: 335673  on 61  degrees of freedom
##   (30 observations deleted due to missingness)
## AIC: 335733
## 
## Number of Fisher Scoring iterations: 25
library(pscl)
fit.ZIpois <- zeroinfl(Streptococcus ~ Lactobacillus + age + BMI | 1,
                data = df,
                dist = "poisson")
## Warning: glm.fit: algorithm did not converge
summary (fit.ZIpois)
## 
## Call:
## zeroinfl(formula = Streptococcus ~ Lactobacillus + age + BMI | 1, data = df, 
##     dist = "poisson")
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -0.3189 -0.3189 -0.3189 -0.3189  9.6361 
## 
## Count model coefficients (poisson with log link):
##                 Estimate Std. Error z value Pr(>|z|)
## (Intercept)    2.122e+01        NaN     NaN      NaN
## Lactobacillus -1.116e-07        NaN     NaN      NaN
## age           -7.023e-01        NaN     NaN      NaN
## BMI            3.252e-01        NaN     NaN      NaN
## 
## Zero-inflation model coefficients (binomial with logit link):
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)    2.286        NaN     NaN      NaN
## 
## Number of iterations in BFGS optimization: 33 
## Log-likelihood: -3.134e+04 on 5 Df
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
fit.negbin <-
  glm.nb(
    Streptococcus ~ Lactobacillus + age + BMI,
    data = df,
    control = glm.control(
      epsilon = 1e-8,
      maxit = 50,
      trace = FALSE
    )
  )
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in glm.nb(Streptococcus ~ Lactobacillus + age + BMI, data = df, :
## alternation limit reached
library(broom)
tidy(fit.negbin)
## # A tibble: 4 × 5
##   term               estimate std.error statistic   p.value
##   <chr>                 <dbl>     <dbl>     <dbl>     <dbl>
## 1 (Intercept)    5.21          3.06e- 2    170.   0        
## 2 Lactobacillus  0.0000000229  6.52e-10     35.1  1.77e-269
## 3 age            0.0524        1.05e- 3     49.9  0        
## 4 BMI           -0.00453       1.36e- 3     -3.32 8.92e-  4
fit.ZInegbin <- zeroinfl(Streptococcus ~ Lactobacillus + age + BMI | 1,
                data = df,
                dist = "negbin")
## Warning: glm.fit: algorithm did not converge
summary(fit.ZInegbin)
## 
## Call:
## zeroinfl(formula = Streptococcus ~ Lactobacillus + age + BMI | 1, data = df, 
##     dist = "negbin")
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -0.2128 -0.2128 -0.2128 -0.2127  6.3373 
## 
## Count model coefficients (negbin with log link):
##                 Estimate Std. Error z value Pr(>|z|)
## (Intercept)    2.478e+01        NaN     NaN      NaN
## Lactobacillus -5.468e-07        NaN     NaN      NaN
## age           -1.129e+00        NaN     NaN      NaN
## BMI            7.501e-01        NaN     NaN      NaN
## Log(theta)    -2.122e-01        NaN     NaN      NaN
## 
## Zero-inflation model coefficients (binomial with logit link):
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)    2.233        NaN     NaN      NaN
## 
## Theta = 0.8088 
## Number of iterations in BFGS optimization: 39 
## Log-likelihood: -70.77 on 6 Df