Analysis objectives

  1. Import, recode, and subset data from bugsigdb.org
  2. Create a table of studies
  3. Calculate the frequency of appearance of each taxa in independent signatures and identify the most frequently reported taxa

Making sure packages are installed

Not evaluated in vignette:

if (!require("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
BiocManager::install(c("devtools", "tidyverse", "kableExtra"))
BiocManager::install(c("waldronlab/bugSigSimple", "waldronlab/BugSigDBStats", "waldronlab/bugsigdbr"))

Load and subset data

# use version="devel" and cache = FALSE to take the latest version from bugsigdb.org
dat <- bugsigdbr::importBugSigDB(version = "devel", cache = FALSE) 
dim(dat)
## [1] 7115   50
names(dat)
##  [1] "BSDB ID"                    "Study"                     
##  [3] "Study design"               "PMID"                      
##  [5] "DOI"                        "URL"                       
##  [7] "Authors list"               "Title"                     
##  [9] "Journal"                    "Year"                      
## [11] "Keywords"                   "Experiment"                
## [13] "Location of subjects"       "Host species"              
## [15] "Body site"                  "UBERON ID"                 
## [17] "Condition"                  "EFO ID"                    
## [19] "Group 0 name"               "Group 1 name"              
## [21] "Group 1 definition"         "Group 0 sample size"       
## [23] "Group 1 sample size"        "Antibiotics exclusion"     
## [25] "Sequencing type"            "16S variable region"       
## [27] "Sequencing platform"        "Statistical test"          
## [29] "Significance threshold"     "MHT correction"            
## [31] "LDA Score above"            "Matched on"                
## [33] "Confounders controlled for" "Pielou"                    
## [35] "Shannon"                    "Chao1"                     
## [37] "Simpson"                    "Inverse Simpson"           
## [39] "Richness"                   "Signature page name"       
## [41] "Source"                     "Curated date"              
## [43] "Curator"                    "Revision editor"           
## [45] "Description"                "Abundance in Group 1"      
## [47] "MetaPhlAn taxon names"      "NCBI Taxonomy IDs"         
## [49] "State"                      "Reviewer"

Subsetting

included.pmid <-
  c(
    28018325,
    24614698,
    29207565,
    29459704,
    29538354,
    32012716,
    20566857,
    28512451,
    28112736,
    27362264
  )
subset.dat <-
  filter(dat, PMID %in% included.pmid) 
unique(subset.dat$`Group 0 name`)
##  [1] "vaginal delivery after 7 days of delivery"    
##  [2] "vaginal delivery after 3 months of delivery"  
##  [3] "vaginal delivery after 6 months of delivery"  
##  [4] "vaginal delivery"                             
##  [5] ">/=33 weeks"                                  
##  [6] "vaginal delivery at day 3"                    
##  [7] "vaginal delivery at day 5"                    
##  [8] "vaginal delivery at day 28"                   
##  [9] "vaginal delivery at day 150"                  
## [10] "vaginal delivery at day 365"                  
## [11] "full-term delivery >39 weeks at day 3"        
## [12] "mecomium in vaginal delivery"                 
## [13] "mecomium in C-section delivery"               
## [14] "Antimicrobials use during delivery (no)"      
## [15] "Maternal consumption of probiotics (NO)"      
## [16] "Furry pets at home (NO)"                      
## [17] "Maternal samples taken at delivery."          
## [18] "Maternal samples taken at 6  weeks postpartum"
## [19] "vaginal delivery (Va)"
included.group0 <- "vaginal delivery"
unique(subset.dat$`Group 1 name`)
##  [1] "C-section delivery"                             
##  [2] "C-section"                                      
##  [3] "infants <33 weeks gestational age"              
##  [4] "c-section"                                      
##  [5] "late preterm 34-36 weeks"                       
##  [6] "overall time point infants samples of c-section"
##  [7] "transitional stool"                             
##  [8] "Antimicrobials use during delivery (yes)"       
##  [9] "Maternal consumption of probiotics (YES)"       
## [10] "Furry pets at home (YES)"                       
## [11] "Neonatal samples taken at delivery"             
## [12] "Infant samples taken at 6 weeks postpartum"     
## [13] "C-section (Cesarean section)"
included.group1 <- "C-section"
subset.final <-
  filter(subset.dat, `Group 0 name` %in% included.group0 & `Group 1 name` %in% included.group1) %>%
  filter(`Body site` == "Meconium") %>%
  arrange(PMID)

Show key characteristics of the included signatures:

detach("package:dplyr", unload = TRUE)
## Warning: 'dplyr' namespace cannot be unloaded:
##   namespace 'dplyr' is imported by 'BiocFileCache', 'bugSigSimple', 'tidyr', 'dbplyr' so cannot be unloaded
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:kableExtra':
## 
##     group_rows
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
select(subset.final, "PMID", "Source", "Group 0 name", "Group 1 name", "Abundance in Group 1")
##       PMID              Source     Group 0 name Group 1 name
## 1 24614698 Table S6 in File S1 vaginal delivery    C-section
## 2 27362264            Figure 1 vaginal delivery    C-section
## 3 28512451                <NA> vaginal delivery    C-section
## 4 29459704            Figure 1 vaginal delivery    C-section
## 5 29459704            Figure 1 vaginal delivery    C-section
## 6 29538354             Table 3 vaginal delivery    C-section
## 7 32012716     Figure 4 & text vaginal delivery    C-section
## 8 32012716     Figure 4 & text vaginal delivery    C-section
##   Abundance in Group 1
## 1            increased
## 2            decreased
## 3                 <NA>
## 4            increased
## 5            decreased
## 6            decreased
## 7            increased
## 8            decreased

Are any studies missing?

sort(setdiff(included.pmid, subset.dat$PMID))
## numeric(0)
sort(setdiff(included.pmid, subset.final$PMID))
## [1] 20566857 28018325 28112736 29207565
sort(setdiff(subset.dat$PMID, subset.final$PMID))
## [1] 20566857 28018325 28112736 29207565

Table of studies

These are the studies included in the review:

bugSigSimple::createStudyTable(subset.final)
## # A tibble: 6 × 5
##   Study          Condition        Cases Controls `Study Design`                 
##   <chr>          <chr>            <dbl>    <dbl> <chr>                          
## 1 Ardissone 2014 Cesarean section    33       19 cross-sectional observational,…
## 2 Martin 2016    Cesarean section    28       80 cross-sectional observational,…
## 3 Shi 2018       Cesarean section    10        8 cross-sectional observational,…
## 4 Tapiainen 2018 Cesarean section    40      172 cross-sectional observational,…
## 5 Wampach 2017   Cesarean section     6        4 time series / longitudinal obs…
## 6 Wong 2020      Cesarean section    43       62 prospective cohort

Summary of taxa reported

This table summarizes the results for the top n most frequently identified taxa.

kable_styling(kbl(bugSigSimple::createTaxonTable(subset.final, n = 20)))
## Warning: Expected 7 pieces. Additional pieces discarded in 1 rows [16].
Taxon Name Taxonomic Level total_signatures increased_signatures decreased_signatures Binomial Test pval kingdom phylum class order family genus species n_signatures metaphlan_name
Staphylococcus genus 3 1 2 1.0 Bacteria Bacillota Bacilli Bacillales Staphylococcaceae Staphylococcus NA 3 k__Bacteria|p__Bacillota|c__Bacilli|o__Bacillales|f__Staphylococcaceae|g__Staphylococcus
Corynebacterium genus 2 1 1 1.0 Bacteria Actinomycetota Actinomycetes Mycobacteriales Corynebacteriaceae Corynebacterium NA 2 k__Bacteria|p__Actinomycetota|c__Actinomycetes|o__Mycobacteriales|f__Corynebacteriaceae|g__Corynebacterium
Propionibacterium genus 2 1 1 1.0 Bacteria Actinomycetota Actinomycetes Propionibacteriales Propionibacteriaceae Propionibacterium NA 2 k__Bacteria|p__Actinomycetota|c__Actinomycetes|o__Propionibacteriales|f__Propionibacteriaceae|g__Propionibacterium
Enterococcus genus 2 0 2 0.5 Bacteria Bacillota Bacilli Lactobacillales Enterococcaceae Enterococcus NA 2 k__Bacteria|p__Bacillota|c__Bacilli|o__Lactobacillales|f__Enterococcaceae|g__Enterococcus
Streptococcus genus 2 1 1 1.0 Bacteria Bacillota Bacilli Lactobacillales Streptococcaceae Streptococcus NA 2 k__Bacteria|p__Bacillota|c__Bacilli|o__Lactobacillales|f__Streptococcaceae|g__Streptococcus
Phocaeicola vulgatus species 2 1 1 1.0 Bacteria Bacteroidota Bacteroidia Bacteroidales Bacteroidaceae Phocaeicola Phocaeicola vulgatus 2 k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Bacteroidales|f__Bacteroidaceae|g__Phocaeicola|s__Phocaeicola vulgatus
Comamonas genus 2 1 1 1.0 Bacteria Pseudomonadota Betaproteobacteria Burkholderiales Comamonadaceae Comamonas NA 2 k__Bacteria|p__Pseudomonadota|c__Betaproteobacteria|o__Burkholderiales|f__Comamonadaceae|g__Comamonas
Citrobacter genus 2 1 1 1.0 Bacteria Pseudomonadota Gammaproteobacteria Enterobacterales Enterobacteriaceae Citrobacter NA 2 k__Bacteria|p__Pseudomonadota|c__Gammaproteobacteria|o__Enterobacterales|f__Enterobacteriaceae|g__Citrobacter
Stenotrophomonas genus 2 2 0 0.5 Bacteria Pseudomonadota Gammaproteobacteria Lysobacterales Lysobacteraceae Stenotrophomonas NA 2 k__Bacteria|p__Pseudomonadota|c__Gammaproteobacteria|o__Lysobacterales|f__Lysobacteraceae|g__Stenotrophomonas
Actinomycetota phylum 1 0 1 1.0 Bacteria Actinomycetota NA NA NA NA NA 4 k__Bacteria|p__Actinomycetota
Actinomycetes class 1 0 1 1.0 Bacteria Actinomycetota Actinomycetes NA NA NA NA 4 k__Bacteria|p__Actinomycetota|c__Actinomycetes
Actinomycetales order 1 0 1 1.0 Bacteria Actinomycetota Actinomycetes Actinomycetales NA NA NA 1 k__Bacteria|p__Actinomycetota|c__Actinomycetes|o__Actinomycetales
Bifidobacterium genus 1 0 1 1.0 Bacteria Actinomycetota Actinomycetes Bifidobacteriales Bifidobacteriaceae Bifidobacterium NA 2 k__Bacteria|p__Actinomycetota|c__Actinomycetes|o__Bifidobacteriales|f__Bifidobacteriaceae|g__Bifidobacterium
Bifidobacterium bifidum species 1 0 1 1.0 Bacteria Actinomycetota Actinomycetes Bifidobacteriales Bifidobacteriaceae Bifidobacterium Bifidobacterium bifidum 1 k__Bacteria|p__Actinomycetota|c__Actinomycetes|o__Bifidobacteriales|f__Bifidobacteriaceae|g__Bifidobacterium|s__Bifidobacterium bifidum
Bifidobacterium catenulatum species 1 0 1 1.0 Bacteria Actinomycetota Actinomycetes Bifidobacteriales Bifidobacteriaceae Bifidobacterium Bifidobacterium catenulatum 1 k__Bacteria|p__Actinomycetota|c__Actinomycetes|o__Bifidobacteriales|f__Bifidobacteriaceae|g__Bifidobacterium|s__Bifidobacterium catenulatum
Bifidobacterium longum subsp. longum species 1 0 1 1.0 Bacteria Actinomycetota Actinomycetes Bifidobacteriales Bifidobacteriaceae Bifidobacterium Bifidobacterium longum 1 k__Bacteria|p__Actinomycetota|c__Actinomycetes|o__Bifidobacteriales|f__Bifidobacteriaceae|g__Bifidobacterium|s__Bifidobacterium longum|s__Bifidobacterium longum subsp. longum
Pseudoscardovia genus 1 1 0 1.0 Bacteria Actinomycetota Actinomycetes Bifidobacteriales Bifidobacteriaceae Pseudoscardovia NA 1 k__Bacteria|p__Actinomycetota|c__Actinomycetes|o__Bifidobacteriales|f__Bifidobacteriaceae|g__Pseudoscardovia
Propionibacteriaceae family 1 0 1 1.0 Bacteria Actinomycetota Actinomycetes Propionibacteriales Propionibacteriaceae NA NA 2 k__Bacteria|p__Actinomycetota|c__Actinomycetes|o__Propionibacteriales|f__Propionibacteriaceae
Cutibacterium acnes species 1 0 1 1.0 Bacteria Actinomycetota Actinomycetes Propionibacteriales Propionibacteriaceae Cutibacterium Cutibacterium acnes 1 k__Bacteria|p__Actinomycetota|c__Actinomycetes|o__Propionibacteriales|f__Propionibacteriaceae|g__Cutibacterium|s__Cutibacterium acnes
Pseudonocardiaceae family 1 0 1 1.0 Bacteria Actinomycetota Actinomycetes Pseudonocardiales Pseudonocardiaceae NA NA 1 k__Bacteria|p__Actinomycetota|c__Actinomycetes|o__Pseudonocardiales|f__Pseudonocardiaceae

Long list of most frequently identified taxa

These are not needed because of the taxon table above, but they list a larger number of taxa.

getMostFrequentTaxa(subset.final, n = 50)
##    1279    1301    1350    1716    1743     283   40323     544     821  114248 
##       3       2       2       2       2       2       2       2       2       1 
##  117563  118964    1236    1239    1243   12916    1297    1298 1302778  135614 
##       1       1       1       1       1       1       1       1       1       1 
##    1357    1380    1385    1386    1390    1402    1485    1502  150247    1578 
##       1       1       1       1       1       1       1       1       1       1 
##  158851    1596    1598    1678    1679    1681    1686    1730    1747    1760 
##       1       1       1       1       1       1       1       1       1       1 
##  183710  186806  186817  188787  201174    2037    2070  216851     222    2737 
##       1       1       1       1       1       1       1       1       1       1
getMostFrequentTaxa(subset.final, direction="UP")
##  40323 114248 117563 118964   1239   1243   1279  12916   1297   1298 
##      2      1      1      1      1      1      1      1      1      1
getMostFrequentTaxa(subset.final, direction="DOWN")
##   1279   1350   1236   1301   1380   1485   1502 158851   1596   1598 
##      2      2      1      1      1      1      1      1      1      1