Analysis objectives

  1. Import, recode, and subset data from bugsigdb.org
  2. Create a table of studies
  3. Calculate the frequency of appearance of each taxa in independent signatures and identify the most frequently reported taxa
  4. Estimate the probability of the most frequently identified taxa occuring by chance

Making sure packages are installed

install.packages(c("devtools", "tidyverse", "kableExtra"))
devtools::install_github("waldronlab/bugSigSimple")
devtools::install_github("waldronlab/BugSigDBStats")
devtools::install_github("waldronlab/bugsigdbr")

Load and subset data

## 
## Attaching package: 'BugSigDBStats'
## The following object is masked from 'package:bugSigSimple':
## 
##     getMostFrequentTaxa
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
##  dplyr     1.1.4      readr     2.1.5
##  forcats   1.0.0      stringr   1.5.1
##  ggplot2   3.5.2      tibble    3.2.1
##  lubridate 1.9.4      tidyr     1.3.1
##  purrr     1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
##  dplyr::filter() masks stats::filter()
##  dplyr::lag()    masks stats::lag()
##  Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(bugSigSimple)
dat <- bugsigdbr::importBugSigDB(cache= TRUE)
## Using cached version from 2025-05-05 19:25:55
dim(dat)
## [1] 8163   50
names(dat)
##  [1] "BSDB ID"                    "Study"                     
##  [3] "Study design"               "PMID"                      
##  [5] "DOI"                        "URL"                       
##  [7] "Authors list"               "Title"                     
##  [9] "Journal"                    "Year"                      
## [11] "Keywords"                   "Experiment"                
## [13] "Location of subjects"       "Host species"              
## [15] "Body site"                  "UBERON ID"                 
## [17] "Condition"                  "EFO ID"                    
## [19] "Group 0 name"               "Group 1 name"              
## [21] "Group 1 definition"         "Group 0 sample size"       
## [23] "Group 1 sample size"        "Antibiotics exclusion"     
## [25] "Sequencing type"            "16S variable region"       
## [27] "Sequencing platform"        "Statistical test"          
## [29] "Significance threshold"     "MHT correction"            
## [31] "LDA Score above"            "Matched on"                
## [33] "Confounders controlled for" "Pielou"                    
## [35] "Shannon"                    "Chao1"                     
## [37] "Simpson"                    "Inverse Simpson"           
## [39] "Richness"                   "Signature page name"       
## [41] "Source"                     "Curated date"              
## [43] "Curator"                    "Revision editor"           
## [45] "Description"                "Abundance in Group 1"      
## [47] "MetaPhlAn taxon names"      "NCBI Taxonomy IDs"         
## [49] "State"                      "Reviewer"

Subsetting

Subsetting only studies done on humans - 6 of 17 studies were excluded because they were either done on mice or not statistically reliable (Cregger et. al)

subset.dat <-
  dat[which(
    dat$PMID == "30778155" |
      dat$PMID == "32192080" |
      dat$PMID == "31087436" |
      dat$PMID == "26901400" |
      dat$PMID == "33839907" |
      dat$PMID == "32046455" |
      dat$PMID == "33925708" |
      dat$PMID == "32299442" |
      dat$PMID == "33313185" |
      dat$PMID == "34268384" | dat$PMID == "33660232"
  ), ]

All studies

Summary of studies and most frequent taxa increased and decreased in endometriosis patients for all studies

bugSigSimple::createStudyTable(subset.dat)
## # A tibble: 11 × 9
##    `Study code` MaxCases MaxControls `Study design` Condition N_signatures PMID 
##    <chr>           <dbl>       <dbl> <chr>          <chr>            <int> <chr>
##  1 AkiyamaK_20…       30          39 cross-section… Endometr…            1 3108…
##  2 AtaB_2019          14          14 prospective c… Endometr…            2 3077…
##  3 ChaoX_2021         37          66 case-control   Endometr…            4 3426…
##  4 ChenS_2020         14         120 case-control   Adenomyo…            3 3331…
##  5 HernandesC_…       21          11 case-control   Endometr…            6 3219…
##  6 KhanKN_2016        32          32 case-control   Endometr…            5 2690…
##  7 LeeSR_2021         45          45 case-control   Endometr…            2 3392…
##  8 PerrottaAR_…        8          12 cross-section… Endometr…            2 3204…
##  9 ShanJ_2021         12          12 case-control   Endometr…            2 3383…
## 10 SvenssonA_2…       66         198 case-control   Endometr…            4 3366…
## 11 WeiW_2020          36          14 cross-section… Endometr…            2 3229…
## # ℹ 2 more variables: DOI <chr>, URL <chr>
getMostFrequentTaxa(subset.dat, n=30)
##    1301    1380    1578     194     543     838   84107  119852    1300    1350 
##       7       5       4       4       4       4       4       3       3       3 
##  135621    1375  171549 1940338    2050   28050     286   39948     469     906 
##       3       3       3       3       3       3       3       3       3       3 
##   13687 1508657 1582879    1654    1663  168808    1743  216851    2701   33038 
##       2       2       2       2       2       2       2       2       2       2
getMostFrequentTaxa(subset.dat, direction="UP")
##    1301     543  119852    1300    1350  135621    1578 1940338     286     469 
##       6       4       3       3       3       3       3       3       3       3
getMostFrequentTaxa(subset.dat, direction="DOWN")
##   1380 168808 171549    194   2050  28050  33958    838    906   1033 
##      3      2      2      2      2      2      2      2      2      1

Excluding feces samples

Summary of studies and most frequent taxa in only samples from female reproductive tract, excluding feces samples

subset.dat2 <-
  dat[which(
    dat$PMID == "30778155" |
      dat$PMID == "32192080" |
      dat$PMID == "31087436" |
      dat$PMID == "26901400" |
      dat$PMID == "32046455" |
      dat$PMID == "33925708" |
      dat$PMID == "32299442" |
      dat$PMID == "33313185" | dat$PMID == "34268384"
  ), ]

reproductive_sigs <-
  subset.dat2[which(subset.dat2$`Body site` != "feces" |
                      is.na(subset.dat2$`Body site`)), ]

bugSigSimple::createStudyTable(reproductive_sigs)
## # A tibble: 9 × 9
##   `Study code`  MaxCases MaxControls `Study design` Condition N_signatures PMID 
##   <chr>            <dbl>       <dbl> <chr>          <chr>            <int> <chr>
## 1 AkiyamaK_2019       30          39 cross-section… Endometr…            1 3108…
## 2 AtaB_2019           14          14 prospective c… Endometr…            2 3077…
## 3 ChaoX_2021          37          66 case-control   Endometr…            4 3426…
## 4 ChenS_2020          14         120 case-control   Adenomyo…            3 3331…
## 5 HernandesC_2…       21          11 case-control   Endometr…            6 3219…
## 6 KhanKN_2016         32          32 case-control   Endometr…            5 2690…
## 7 LeeSR_2021          45          45 case-control   Endometr…            2 3392…
## 8 PerrottaAR_2…        8          12 cross-section… Endometr…            2 3204…
## 9 WeiW_2020           36          14 cross-section… Endometr…            2 3229…
## # ℹ 2 more variables: DOI <chr>, URL <chr>
allfreqs <- bugSigSimple::createTaxonTable(reproductive_sigs, n = 20) %>% #could change number
  arrange(I(decreased_signatures - increased_signatures))
incfreqs <- filter(allfreqs, I(increased_signatures - decreased_signatures) > 0)
decfreqs <- filter(allfreqs, I(increased_signatures - decreased_signatures) < 0)
kableExtra::kbl(allfreqs) %>%
  kable_paper("hover", full_width = FALSE)
Taxon Name Taxonomic Level total_signatures increased_signatures decreased_signatures Binomial Test pval kingdom phylum class order family genus species n_signatures metaphlan_name
Streptococcus genus 6 5 1 0.22 Bacteria Bacillati Bacillota Bacilli Lactobacillales Streptococcaceae Streptococcus 0 d__Bacteria|k__Bacillati|p__Bacillota|c__Bacilli|o__Lactobacillales|f__Streptococcaceae|g__Streptococcus
Coriobacteriaceae family 3 3 0 0.25 Bacteria Bacillati Actinomycetota Coriobacteriia Coriobacteriales Coriobacteriaceae NA 0 d__Bacteria|k__Bacillati|p__Actinomycetota|c__Coriobacteriia|o__Coriobacteriales|f__Coriobacteriaceae
Streptococcaceae family 3 3 0 0.25 Bacteria Bacillati Bacillota Bacilli Lactobacillales Streptococcaceae NA 0 d__Bacteria|k__Bacillati|p__Bacillota|c__Bacilli|o__Lactobacillales|f__Streptococcaceae
Enterobacteriaceae family 3 3 0 0.25 Bacteria Pseudomonadati Pseudomonadota Gammaproteobacteria Enterobacterales Enterobacteriaceae NA 0 d__Bacteria|k__Pseudomonadati|p__Pseudomonadota|c__Gammaproteobacteria|o__Enterobacterales|f__Enterobacteriaceae
Escherichia/Shigella sp. species 3 3 0 0.25 Bacteria Pseudomonadati Pseudomonadota Gammaproteobacteria Enterobacterales Enterobacteriaceae Escherichia/Shigella sp. 0 d__Bacteria|k__Pseudomonadati|p__Pseudomonadota|c__Gammaproteobacteria|o__Enterobacterales|f__Enterobacteriaceae|s__Escherichia/Shigella sp.
Acinetobacter genus 3 3 0 0.25 Bacteria Pseudomonadati Pseudomonadota Gammaproteobacteria Moraxellales Moraxellaceae Acinetobacter 0 d__Bacteria|k__Pseudomonadati|p__Pseudomonadota|c__Gammaproteobacteria|o__Moraxellales|f__Moraxellaceae|g__Acinetobacter
Pseudomonadaceae family 3 3 0 0.25 Bacteria Pseudomonadati Pseudomonadota Gammaproteobacteria Pseudomonadales Pseudomonadaceae NA 0 d__Bacteria|k__Pseudomonadati|p__Pseudomonadota|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae
Pseudomonas genus 3 3 0 0.25 Bacteria Pseudomonadati Pseudomonadota Gammaproteobacteria Pseudomonadales Pseudomonadaceae Pseudomonas 0 d__Bacteria|k__Pseudomonadati|p__Pseudomonadota|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas
Alloscardovia genus 2 2 0 0.50 Bacteria Bacillati Actinomycetota Actinomycetes Bifidobacteriales Bifidobacteriaceae Alloscardovia 0 d__Bacteria|k__Bacillati|p__Actinomycetota|c__Actinomycetes|o__Bifidobacteriales|f__Bifidobacteriaceae|g__Alloscardovia
Arthrobacter genus 2 2 0 0.50 Bacteria Bacillati Actinomycetota Actinomycetes Micrococcales Micrococcaceae Arthrobacter 0 d__Bacteria|k__Bacillati|p__Actinomycetota|c__Actinomycetes|o__Micrococcales|f__Micrococcaceae|g__Arthrobacter
Aerococcus genus 3 2 1 1.00 Bacteria Bacillati Bacillota Bacilli Lactobacillales Aerococcaceae Aerococcus 0 d__Bacteria|k__Bacillati|p__Bacillota|c__Bacilli|o__Lactobacillales|f__Aerococcaceae|g__Aerococcus
Lactobacillus genus 3 2 1 1.00 Bacteria Bacillati Bacillota Bacilli Lactobacillales Lactobacillaceae Lactobacillus 0 d__Bacteria|k__Bacillati|p__Bacillota|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus
Dialister genus 3 2 1 1.00 Bacteria Bacillati Bacillota Negativicutes Veillonellales Veillonellaceae Dialister 0 d__Bacteria|k__Bacillati|p__Bacillota|c__Negativicutes|o__Veillonellales|f__Veillonellaceae|g__Dialister
Prevotella genus 4 2 2 1.00 Bacteria Pseudomonadati Bacteroidota Bacteroidia Bacteroidales Prevotellaceae Prevotella 0 d__Bacteria|k__Pseudomonadati|p__Bacteroidota|c__Bacteroidia|o__Bacteroidales|f__Prevotellaceae|g__Prevotella
Campylobacter genus 4 2 2 1.00 Bacteria Pseudomonadati Campylobacterota Epsilonproteobacteria Campylobacterales Campylobacteraceae Campylobacter 0 d__Bacteria|k__Pseudomonadati|p__Campylobacterota|c__Epsilonproteobacteria|o__Campylobacterales|f__Campylobacteraceae|g__Campylobacter
Gardnerella genus 2 1 1 1.00 Bacteria Bacillati Actinomycetota Actinomycetes Bifidobacteriales Bifidobacteriaceae Gardnerella 0 d__Bacteria|k__Bacillati|p__Actinomycetota|c__Actinomycetes|o__Bifidobacteriales|f__Bifidobacteriaceae|g__Gardnerella
Propionibacterium genus 2 1 1 1.00 Bacteria Bacillati Actinomycetota Actinomycetes Propionibacteriales Propionibacteriaceae Propionibacterium 0 d__Bacteria|k__Bacillati|p__Actinomycetota|c__Actinomycetes|o__Propionibacteriales|f__Propionibacteriaceae|g__Propionibacterium
Atopobium genus 5 2 3 1.00 Bacteria Bacillati Actinomycetota Coriobacteriia Coriobacteriales Atopobiaceae Atopobium 0 d__Bacteria|k__Bacillati|p__Actinomycetota|c__Coriobacteriia|o__Coriobacteriales|f__Atopobiaceae|g__Atopobium
Mobiluncus genus 3 1 2 1.00 Bacteria Bacillati Actinomycetota Actinomycetes Actinomycetales Actinomycetaceae Mobiluncus 0 d__Bacteria|k__Bacillati|p__Actinomycetota|c__Actinomycetes|o__Actinomycetales|f__Actinomycetaceae|g__Mobiluncus
Megasphaera genus 3 1 2 1.00 Bacteria Bacillati Bacillota Negativicutes Veillonellales Veillonellaceae Megasphaera 0 d__Bacteria|k__Bacillati|p__Bacillota|c__Negativicutes|o__Veillonellales|f__Veillonellaceae|g__Megasphaera

Load HMP data from curatedMetagenomicData

suppressPackageStartupMessages(library(curatedMetagenomicData))
# hmp_vagina_metadata <- filter(sampleMetadata, study_name == "HMP_2012" & body_site == "vagina")
all_healthy_adult_vagina_metadata <- filter(sampleMetadata, body_site == "vagina" & disease == "healthy" & age_category == "adult")
se <-
  curatedMetagenomicData::returnSamples(all_healthy_adult_vagina_metadata, dataType = "relative_abundance", counts = FALSE)

Get matrices of species and genus relative abundance

allranks <- mia::splitByRanks(se)
#species_relab <- t(assay(allranks[["species"]]))
genus_relab <- t(assay(allranks[["genus"]]))
#family_relab <- t(assay(allranks[["family"]]))

Centered log-ratio transformation on genus level relative abundance:

genus_relab_clr <- Hotelling::clr(genus_relab + 1)
df <- dplyr::select(data.frame(genus_relab_clr), c("Streptococcus", "Lactobacillus"))
df$age <- se$age
df$BMI <- se$BMI
fit <- lm(Streptococcus ~ Lactobacillus + age + BMI, data = df)  
summary(fit)
## 
## Call:
## lm(formula = Streptococcus ~ Lactobacillus + age + BMI, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.6728 -0.1875 -0.0264  0.0893  3.6729 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -0.386413   0.436694  -0.885 0.378987    
## Lactobacillus -0.078095   0.047933  -1.629 0.107344    
## age            0.035738   0.010073   3.548 0.000665 ***
## BMI           -0.009094   0.014295  -0.636 0.526581    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4857 on 77 degrees of freedom
##   (14 observations deleted due to missingness)
## Multiple R-squared:  0.1799, Adjusted R-squared:  0.1479 
## F-statistic: 5.629 on 3 and 77 DF,  p-value: 0.00153
summary(lm(Streptococcus ~ Lactobacillus, data = df))
## 
## Call:
## lm(formula = Streptococcus ~ Lactobacillus, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.6006 -0.0609 -0.0462 -0.0440  4.1909 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept)    0.55246    0.20193   2.736  0.00745 **
## Lactobacillus -0.11409    0.04751  -2.401  0.01832 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5642 on 93 degrees of freedom
## Multiple R-squared:  0.05839,    Adjusted R-squared:  0.04826 
## F-statistic: 5.767 on 1 and 93 DF,  p-value: 0.01832
wilcox.test(df$Streptococcus, df$Lactobacillus)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  df$Streptococcus and df$Lactobacillus
## W = 213.5, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
t.test(df$Streptococcus, df$Lactobacillus)
## 
##  Welch Two Sample t-test
## 
## data:  df$Streptococcus and df$Lactobacillus
## t = -28.666, df = 133.93, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -4.258800 -3.709054
## sample estimates:
## mean of x mean of y 
## 0.0879008 4.0718280
secount <-
  curatedMetagenomicData::returnSamples(all_healthy_adult_vagina_metadata, dataType = "relative_abundance", counts = TRUE)
## 
## $`2021-03-31.FerrettiP_2018.relative_abundance`
## dropping rows without rowTree matches:
##   k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Coriobacteriaceae|g__Collinsella|s__Collinsella_stercoris
##   k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Coriobacteriaceae|g__Enorma|s__[Collinsella]_massiliensis
##   k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Bacillales_unclassified|g__Gemella|s__Gemella_bergeri
##   k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Carnobacteriaceae|g__Granulicatella|s__Granulicatella_elegans
##   k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Ruminococcaceae|g__Ruminococcus|s__Ruminococcus_champanellensis
##   k__Bacteria|p__Firmicutes|c__Erysipelotrichia|o__Erysipelotrichales|f__Erysipelotrichaceae|g__Bulleidia|s__Bulleidia_extructa
##   k__Bacteria|p__Proteobacteria|c__Betaproteobacteria|o__Burkholderiales|f__Sutterellaceae|g__Sutterella|s__Sutterella_parvirubra
##   k__Bacteria|p__Spirochaetes|c__Spirochaetia|o__Leptospirales|f__Leptospiraceae|g__Leptospira|s__Leptospira_hartskeerlii
##   k__Bacteria|p__Synergistetes|c__Synergistia|o__Synergistales|f__Synergistaceae|g__Cloacibacillus|s__Cloacibacillus_evryensis
## $`2021-03-31.HMP_2012.relative_abundance`
## dropping rows without rowTree matches:
##   k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Atopobiaceae|g__Olsenella|s__Olsenella_profusa
##   k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Coriobacteriaceae|g__Collinsella|s__Collinsella_stercoris
##   k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Coriobacteriaceae|g__Enorma|s__[Collinsella]_massiliensis
##   k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Bacillales_unclassified|g__Gemella|s__Gemella_bergeri
##   k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Carnobacteriaceae|g__Granulicatella|s__Granulicatella_elegans
##   k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Ruminococcaceae|g__Ruminococcus|s__Ruminococcus_champanellensis
##   k__Bacteria|p__Firmicutes|c__Erysipelotrichia|o__Erysipelotrichales|f__Erysipelotrichaceae|g__Bulleidia|s__Bulleidia_extructa
##   k__Bacteria|p__Proteobacteria|c__Betaproteobacteria|o__Burkholderiales|f__Sutterellaceae|g__Sutterella|s__Sutterella_parvirubra
##   k__Bacteria|p__Synergistetes|c__Synergistia|o__Synergistales|f__Synergistaceae|g__Cloacibacillus|s__Cloacibacillus_evryensis
## $`2021-03-31.WampachL_2018.relative_abundance`
## dropping rows without rowTree matches:
##   k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Coriobacteriaceae|g__Collinsella|s__Collinsella_stercoris
##   k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Coriobacteriaceae|g__Enorma|s__[Collinsella]_massiliensis
##   k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Carnobacteriaceae|g__Granulicatella|s__Granulicatella_elegans
##   k__Bacteria|p__Proteobacteria|c__Betaproteobacteria|o__Burkholderiales|f__Sutterellaceae|g__Sutterella|s__Sutterella_parvirubra
##   k__Bacteria|p__Synergistetes|c__Synergistia|o__Synergistales|f__Synergistaceae|g__Cloacibacillus|s__Cloacibacillus_evryensis
allranks <- mia::splitByRanks(secount)
genus_relab_count <- t(assay(allranks[["genus"]]))
df <- dplyr::select(data.frame(genus_relab_count), c("Streptococcus", "Lactobacillus"))
df$age <- secount$age
df$BMI <- secount$BMI
## Classes and Methods for R originally developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University (2002-2015),
## by and under the direction of Simon Jackman.
## hurdle and zeroinfl functions by Achim Zeileis.
fit.ZInegbin <- pscl::zeroinfl(Streptococcus ~ Lactobacillus + age + BMI | 1,
                data = df,
                dist = "negbin")

Create a data frame for regression analysis from healthy controls for log-linear model

library(curatedMetagenomicData)
library(dplyr)
library(DT)
library(purrr)

taxon_strep <- c("1884" = "Streptococcus")
taxon_lact <- c("1578" = "Lactobacillus")

target_sample <- "vagina"
sample_metadata <- sampleMetadata %>% 
    filter(
        grepl(target_sample, body_site) | grepl(target_sample, body_subsite)
    ) %>% 
    discard(~all(is.na(.x))) %>% 
    as_tibble()
dim(sample_metadata)
## [1] 96 26
unique(sample_metadata$study_name)
## [1] "FerrettiP_2018" "HMP_2012"       "WampachL_2018"
tse <- returnSamples(
    sampleMetadata = sample_metadata,
    dataType = "relative_abundance",
    counts = FALSE,
    rownames = "short"
)
## 
## $`2021-03-31.FerrettiP_2018.relative_abundance`
## dropping rows without rowTree matches:
##   k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Coriobacteriaceae|g__Collinsella|s__Collinsella_stercoris
##   k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Coriobacteriaceae|g__Enorma|s__[Collinsella]_massiliensis
##   k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Bacillales_unclassified|g__Gemella|s__Gemella_bergeri
##   k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Carnobacteriaceae|g__Granulicatella|s__Granulicatella_elegans
##   k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Ruminococcaceae|g__Ruminococcus|s__Ruminococcus_champanellensis
##   k__Bacteria|p__Firmicutes|c__Erysipelotrichia|o__Erysipelotrichales|f__Erysipelotrichaceae|g__Bulleidia|s__Bulleidia_extructa
##   k__Bacteria|p__Proteobacteria|c__Betaproteobacteria|o__Burkholderiales|f__Sutterellaceae|g__Sutterella|s__Sutterella_parvirubra
##   k__Bacteria|p__Spirochaetes|c__Spirochaetia|o__Leptospirales|f__Leptospiraceae|g__Leptospira|s__Leptospira_hartskeerlii
##   k__Bacteria|p__Synergistetes|c__Synergistia|o__Synergistales|f__Synergistaceae|g__Cloacibacillus|s__Cloacibacillus_evryensis
## $`2021-03-31.HMP_2012.relative_abundance`
## dropping rows without rowTree matches:
##   k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Atopobiaceae|g__Olsenella|s__Olsenella_profusa
##   k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Coriobacteriaceae|g__Collinsella|s__Collinsella_stercoris
##   k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Coriobacteriaceae|g__Enorma|s__[Collinsella]_massiliensis
##   k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Bacillales_unclassified|g__Gemella|s__Gemella_bergeri
##   k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Carnobacteriaceae|g__Granulicatella|s__Granulicatella_elegans
##   k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Ruminococcaceae|g__Ruminococcus|s__Ruminococcus_champanellensis
##   k__Bacteria|p__Firmicutes|c__Erysipelotrichia|o__Erysipelotrichales|f__Erysipelotrichaceae|g__Bulleidia|s__Bulleidia_extructa
##   k__Bacteria|p__Proteobacteria|c__Betaproteobacteria|o__Burkholderiales|f__Sutterellaceae|g__Sutterella|s__Sutterella_parvirubra
##   k__Bacteria|p__Synergistetes|c__Synergistia|o__Synergistales|f__Synergistaceae|g__Cloacibacillus|s__Cloacibacillus_evryensis
## $`2021-03-31.WampachL_2018.relative_abundance`
## dropping rows without rowTree matches:
##   k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Coriobacteriaceae|g__Collinsella|s__Collinsella_stercoris
##   k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Coriobacteriales|f__Coriobacteriaceae|g__Enorma|s__[Collinsella]_massiliensis
##   k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Carnobacteriaceae|g__Granulicatella|s__Granulicatella_elegans
##   k__Bacteria|p__Proteobacteria|c__Betaproteobacteria|o__Burkholderiales|f__Sutterellaceae|g__Sutterella|s__Sutterella_parvirubra
##   k__Bacteria|p__Synergistetes|c__Synergistia|o__Synergistales|f__Synergistaceae|g__Cloacibacillus|s__Cloacibacillus_evryensis
## Warning in convertNode(tree = tree, node = labN, use.alias = FALSE, message =
## FALSE): Multiple nodes are found to have the same label.
tse
## class: TreeSummarizedExperiment 
## dim: 908 96 
## metadata(0):
## assays(1): relative_abundance
## rownames(908): species:Escherichia coli species:Klebsiella
##   michiganensis ... species:Pediococcus pentosaceus species:Rahnella
##   aquatilis
## rowData names(7): superkingdom phylum ... genus species
## colnames(96): CA_C10001MS2003VA_t0M15 CA_C10003MS2031VA_t0M15 ...
##   MV_C118 MV_C120
## colData names(25): study_name subject_id ... family_role body_subsite
## reducedDimNames(0):
## mainExpName: NULL
## altExpNames(0):
## rowLinks: a LinkDataFrame (908 rows)
## rowTree: 1 phylo tree(s) (10430 leaves)
## colLinks: NULL
## colTree: NULL

Comparing mean to variance of the outcome variable

library(ggplot2)
mean(df$Streptococcus)
## [1] 4049.558
var(df$Streptococcus)
## [1] 509068831
fraczeroes <- nrow(df[df$Streptococcus == 0,])/nrow(df)
fraczeroes
## [1] 0.8736842

#Fit log-linear models

fit.pois <- glm(Streptococcus ~ Lactobacillus + age + BMI,
                data = df,
                family = poisson(link = "log"))

summary (fit.pois)
## 
## Call:
## glm(formula = Streptococcus ~ Lactobacillus + age + BMI, family = poisson(link = "log"), 
##     data = df)
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -5.313e-01  1.873e-02  -28.36   <2e-16 ***
## Lactobacillus  5.141e-08  9.828e-10   52.32   <2e-16 ***
## age            3.599e-01  4.136e-04  870.25   <2e-16 ***
## BMI           -1.573e-01  6.453e-04 -243.77   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 1929049  on 80  degrees of freedom
## Residual deviance:  632598  on 77  degrees of freedom
##   (14 observations deleted due to missingness)
## AIC: 632704
## 
## Number of Fisher Scoring iterations: 20
library(pscl)
fit.ZIpois <- zeroinfl(Streptococcus ~ Lactobacillus + age + BMI | 1,
                data = df,
                dist = "poisson")

summary (fit.ZIpois)
## 
## Call:
## zeroinfl(formula = Streptococcus ~ Lactobacillus + age + BMI | 1, data = df, 
##     dist = "poisson")
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -0.3753 -0.3753 -0.3752 -0.3750 27.8168 
## 
## Count model coefficients (poisson with log link):
##                 Estimate Std. Error z value Pr(>|z|)
## (Intercept)    6.169e+00        NaN     NaN      NaN
## Lactobacillus  1.321e-07        NaN     NaN      NaN
## age            2.420e-01        NaN     NaN      NaN
## BMI           -2.116e-01        NaN     NaN      NaN
## 
## Zero-inflation model coefficients (binomial with logit link):
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)     1.96        NaN     NaN      NaN
## 
## Number of iterations in BFGS optimization: 20 
## Log-likelihood: -1.02e+05 on 5 Df
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
fit.negbin <-
  glm.nb(
    Streptococcus ~ Lactobacillus + age + BMI,
    data = df,
    control = glm.control(
      epsilon = 1e-8,
      maxit = 50,
      trace = FALSE
    )
  )
## Warning: glm.fit: algorithm did not converge
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning: glm.fit: algorithm did not converge
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning: glm.fit: algorithm did not converge
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning: glm.fit: algorithm did not converge
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning: glm.fit: algorithm did not converge
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning: glm.fit: algorithm did not converge
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning: glm.fit: algorithm did not converge
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning: glm.fit: algorithm did not converge
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning: glm.fit: algorithm did not converge
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning: glm.fit: algorithm did not converge
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning: glm.fit: algorithm did not converge
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning: glm.fit: algorithm did not converge
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning: glm.fit: algorithm did not converge
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in sqrt(1/i): NaNs produced
## Warning in glm.nb(Streptococcus ~ Lactobacillus + age + BMI, data = df, :
## alternation limit reached
library(broom)
tidy(fit.negbin)
## # A tibble: 4 × 5
##   term               estimate   std.error statistic     p.value
##   <chr>                 <dbl>       <dbl>     <dbl>       <dbl>
## 1 (Intercept)   -34.4         8.11            -4.24 0.0000224  
## 2 Lactobacillus   0.000000889 0.000000196      4.54 0.00000570 
## 3 age             1.09        0.207            5.30 0.000000119
## 4 BMI             0.297       0.277            1.07 0.282
fit.ZInegbin <- zeroinfl(Streptococcus ~ Lactobacillus + age + BMI | 1,
                data = df,
                dist = "negbin")

summary(fit.ZInegbin)
## 
## Call:
## zeroinfl(formula = Streptococcus ~ Lactobacillus + age + BMI | 1, data = df, 
##     dist = "negbin")
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -0.1898 -0.1898 -0.1898 -0.1898  7.0569 
## 
## Count model coefficients (negbin with log link):
##                 Estimate Std. Error z value Pr(>|z|)
## (Intercept)    1.945e+01        NaN     NaN      NaN
## Lactobacillus -2.787e-07        NaN     NaN      NaN
## age            2.839e-02        NaN     NaN      NaN
## BMI           -4.084e-01        NaN     NaN      NaN
## Log(theta)    -9.837e-01        NaN     NaN      NaN
## 
## Zero-inflation model coefficients (binomial with logit link):
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)    1.921        NaN     NaN      NaN
## 
## Theta = 0.3739 
## Number of iterations in BFGS optimization: 40 
## Log-likelihood: -131.8 on 6 Df