Analysis objectives

  1. Import, recode, and subset data from bugsigdb.org
  2. Create a table of studies
  3. Create a clustered heatmap showing similarity of signatures from independent studies
  4. Calculate the frequency of appearance of each taxa in independent signatures, and identify the most frequently reported taxa
  5. Estimate the probability of the most frequently identified taxa occuring so frequently by chance

Packages installation

Install packages (not evaluated in vignette)

install.packages(c("devtools", "tidyverse", "kableExtra", "gt", "glue"))
devtools::install_github("waldronlab/bugSigSimple")
devtools::install_github("waldronlab/BugSigDBStats")
devtools::install_github("waldronlab/bugsigdbr")

Data import, recoding, and subset

library(bugSigSimple)
dat <- bugsigdbr::importBugSigDB(cache = TRUE) 
## Using cached version from 2022-03-31 12:35:22
dim(dat)
## [1] 2098   48
names(dat)
##  [1] "Study"                      "Study design"              
##  [3] "PMID"                       "DOI"                       
##  [5] "URL"                        "Authors"                   
##  [7] "Title"                      "Journal"                   
##  [9] "Year"                       "Experiment"                
## [11] "Location of subjects"       "Host species"              
## [13] "Body site"                  "UBERON ID"                 
## [15] "Condition"                  "EFO ID"                    
## [17] "Group 0 name"               "Group 1 name"              
## [19] "Group 1 definition"         "Group 0 sample size"       
## [21] "Group 1 sample size"        "Antibiotics exclusion"     
## [23] "Sequencing type"            "16S variable region"       
## [25] "Sequencing platform"        "Statistical test"          
## [27] "Significance threshold"     "MHT correction"            
## [29] "LDA Score above"            "Matched on"                
## [31] "Confounders controlled for" "Pielou"                    
## [33] "Shannon"                    "Chao1"                     
## [35] "Simpson"                    "Inverse Simpson"           
## [37] "Richness"                   "Signature page name"       
## [39] "Source"                     "Curated date"              
## [41] "Curator"                    "Revision editor"           
## [43] "Description"                "Abundance in Group 1"      
## [45] "MetaPhlAn taxon names"      "NCBI Taxonomy IDs"         
## [47] "State"                      "Reviewer"
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
##  ggplot2 3.3.5      purrr   0.3.4
##  tibble  3.1.6      dplyr   1.0.8
##  tidyr   1.2.0      stringr 1.4.0
##  readr   2.1.2      forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
##  dplyr::filter() masks stats::filter()
##  dplyr::lag()    masks stats::lag()
 efo <- bugsigdbr::getOntology("efo")
## Loading required namespace: ontologyIndex
## Using cached version from 2022-03-31 12:34:11
covid_all <- bugsigdbr::subsetByOntology(dat, column = "Condition", "COVID-19", efo) %>%
  mutate(studyexp = paste(Study, Experiment, sep = "_")) %>%
  mutate(
    site = recode(`Body site`,
      "feces" = "Gut",
      "rectal" = "Gut",
      "nasopharynx" = "aURT",
      "oropharynx" = "aURT",
      "nasopharynx,oropharynx" = "aURT",
      "tongue" = "aURT"
    )
  ) %>%
  mutate(comparison1 = paste(`Group 0 name`, `Group 1 name`, sep = " vs "))

Table of studies

bugSigSimple::createStudyTable(covid_all)
## # A tibble: 31 × 5
##    Study        Condition Cases Controls `Study Design`                         
##    <chr>        <chr>     <dbl>    <dbl> <chr>                                  
##  1 Cao 2021     COVID-19     13        8 case-control                           
##  2 Engen 2021   COVID-19      9       10 cross-sectional observational, not cas…
##  3 Gaibani 2021 COVID-19     24       24 case-control                           
##  4 Gao 2021     COVID-19     94       48 prospective cohort                     
##  5 Gu 2020      COVID-19     30       30 cross-sectional observational, not cas…
##  6 Gupta 2021   COVID-19     63       26 cross-sectional observational, not cas…
##  7 Hoque 2021   COVID-19     11       10 case-control                           
##  8 Khan 2021    COVID-19     10       10 cross-sectional observational, not cas…
##  9 Kim 2021     COVID-19     12       36 time series / longitudinal observation…
## 10 Liu 2021     COVID-19      6        3 cross-sectional observational, not cas…
## # … with 21 more rows

Taxon frequency tables by body site

library(dplyr)
gut_sigs <- filter(covid_all, 
                           site == "Gut") %>%
    drop_na(Source)

naso_sigs <- filter(covid_all, 
                           site == "aURT") %>%
  drop_na(Source)

In this table, the Binomial Test p-value corresponds to the null hypothesis

H0: the proportion of signatures in which the taxon is reported increased or decreased, relative to the total number of signatures in which it is reported, is equal to 0.5

kableExtra::kbl(bugSigSimple::createTaxonTable(gut_sigs))
Taxon Name Taxonomic Level total_signatures increased_signatures decreased_signatures Binomial Test pval kingdom phylum class order family genus species n_signatures metaphlan_name
Anaerostipes genus 12 4 8 0.3900 Bacteria Firmicutes Clostridia Eubacteriales Lachnospiraceae Anaerostipes NA 14 k__Bacteria|p__Firmicutes|c__Clostridia|o__Eubacteriales|f__Lachnospiraceae|g__Anaerostipes
Streptococcus genus 11 10 1 0.0120 Bacteria Firmicutes Bacilli Lactobacillales Streptococcaceae Streptococcus NA 15 k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Streptococcaceae|g__Streptococcus
Faecalibacterium genus 11 1 10 0.0120 Bacteria Firmicutes Clostridia Eubacteriales Oscillospiraceae Faecalibacterium NA 17 k__Bacteria|p__Firmicutes|c__Clostridia|o__Eubacteriales|f__Oscillospiraceae|g__Faecalibacterium
Enterococcus genus 10 10 0 0.0020 Bacteria Firmicutes Bacilli Lactobacillales Enterococcaceae Enterococcus NA 15 k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Enterococcaceae|g__Enterococcus
Clostridium genus 10 4 6 0.7500 Bacteria Firmicutes Clostridia Eubacteriales Clostridiaceae Clostridium NA 13 k__Bacteria|p__Firmicutes|c__Clostridia|o__Eubacteriales|f__Clostridiaceae|g__Clostridium
Roseburia genus 10 0 10 0.0020 Bacteria Firmicutes Clostridia Eubacteriales Lachnospiraceae Roseburia NA 13 k__Bacteria|p__Firmicutes|c__Clostridia|o__Eubacteriales|f__Lachnospiraceae|g__Roseburia
Blautia genus 9 3 6 0.5100 Bacteria Firmicutes Clostridia Eubacteriales Lachnospiraceae Blautia NA 11 k__Bacteria|p__Firmicutes|c__Clostridia|o__Eubacteriales|f__Lachnospiraceae|g__Blautia
Coprococcus genus 9 0 9 0.0039 Bacteria Firmicutes Clostridia Eubacteriales Lachnospiraceae Coprococcus NA 15 k__Bacteria|p__Firmicutes|c__Clostridia|o__Eubacteriales|f__Lachnospiraceae|g__Coprococcus
Dorea genus 9 1 8 0.0390 Bacteria Firmicutes Clostridia Eubacteriales Lachnospiraceae Dorea NA 16 k__Bacteria|p__Firmicutes|c__Clostridia|o__Eubacteriales|f__Lachnospiraceae|g__Dorea
Lactobacillaceae family 8 5 3 0.7300 Bacteria Firmicutes Bacilli Lactobacillales Lactobacillaceae NA NA 15 k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae
kableExtra::kbl(bugSigSimple::createTaxonTable(naso_sigs))
Taxon Name Taxonomic Level total_signatures increased_signatures decreased_signatures Binomial Test pval kingdom phylum class order family genus species n_signatures metaphlan_name
Veillonella genus 11 8 3 0.230 Bacteria Firmicutes Negativicutes Veillonellales Veillonellaceae Veillonella NA 12 k__Bacteria|p__Firmicutes|c__Negativicutes|o__Veillonellales|f__Veillonellaceae|g__Veillonella
Rothia genus 8 2 6 0.290 Bacteria Actinobacteria Actinomycetia Micrococcales Micrococcaceae Rothia NA 10 k__Bacteria|p__Actinobacteria|c__Actinomycetia|o__Micrococcales|f__Micrococcaceae|g__Rothia
Streptococcus genus 8 2 6 0.290 Bacteria Firmicutes Bacilli Lactobacillales Streptococcaceae Streptococcus NA 12 k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Streptococcaceae|g__Streptococcus
Prevotella genus 7 3 4 1.000 Bacteria Bacteroidetes Bacteroidia Bacteroidales Prevotellaceae Prevotella NA 18 k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o__Bacteroidales|f__Prevotellaceae|g__Prevotella
Oribacterium genus 7 1 6 0.130 Bacteria Firmicutes Clostridia Eubacteriales Lachnospiraceae Oribacterium NA 7 k__Bacteria|p__Firmicutes|c__Clostridia|o__Eubacteriales|f__Lachnospiraceae|g__Oribacterium
Filifactor genus 7 1 6 0.130 Bacteria Firmicutes Clostridia Eubacteriales Peptostreptococcaceae Filifactor NA 7 k__Bacteria|p__Firmicutes|c__Clostridia|o__Eubacteriales|f__Peptostreptococcaceae|g__Filifactor
Acinetobacter genus 7 3 4 1.000 Bacteria Proteobacteria Gammaproteobacteria Pseudomonadales Moraxellaceae Acinetobacter NA 7 k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Acinetobacter
Haemophilus genus 6 0 6 0.031 Bacteria Proteobacteria Gammaproteobacteria Pasteurellales Pasteurellaceae Haemophilus NA 7 k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pasteurellales|f__Pasteurellaceae|g__Haemophilus
Arachnia genus 5 0 5 0.062 Bacteria Actinobacteria Actinomycetia Propionibacteriales Propionibacteriaceae Arachnia NA 5 k__Bacteria|p__Actinobacteria|c__Actinomycetia|o__Propionibacteriales|f__Propionibacteriaceae|g__Arachnia
Bacteroidetes phylum 5 0 5 0.062 Bacteria Bacteroidetes NA NA NA NA NA 27 k__Bacteria|p__Bacteroidetes

gut microbiota analysis

Look specifically at case-control comparisons

healthy <- grepl(".*(healthy.*COVID|COVID.*healthy).*", gut_sigs$comparison1, ignore.case = TRUE)

cc_gut_sigs1 <- gut_sigs %>%
      filter(healthy == TRUE) 

Overall frequencies of taxa increased in cases for case/control feces studies

Identifying a taxon reported consistently in 8 out of 8 studies is much more compelling that the FDR value here would suggest, since this taxon also passed a significance threshold in every one of those studies.

cc_gut_sigs1_taxontable <- bugSigSimple::createTaxonTable(cc_gut_sigs1) %>% 
  mutate(FDR =  p.adjust(p = `Binomial Test pval`, method="fdr")) %>%
  relocate(FDR, .after = `Binomial Test pval`)
kableExtra::kbl(cc_gut_sigs1_taxontable)
Taxon Name Taxonomic Level total_signatures increased_signatures decreased_signatures Binomial Test pval FDR kingdom phylum class order family genus species n_signatures metaphlan_name
Enterococcus genus 8 8 0 0.0078 0.0156000 Bacteria Firmicutes Bacilli Lactobacillales Enterococcaceae Enterococcus NA 11 k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Enterococcaceae|g__Enterococcus
Lactobacillaceae family 8 5 3 0.7300 0.7300000 Bacteria Firmicutes Bacilli Lactobacillales Lactobacillaceae NA NA 12 k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae
Streptococcus genus 8 8 0 0.0078 0.0156000 Bacteria Firmicutes Bacilli Lactobacillales Streptococcaceae Streptococcus NA 10 k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Streptococcaceae|g__Streptococcus
Anaerostipes genus 8 1 7 0.0700 0.1000000 Bacteria Firmicutes Clostridia Eubacteriales Lachnospiraceae Anaerostipes NA 9 k__Bacteria|p__Firmicutes|c__Clostridia|o__Eubacteriales|f__Lachnospiraceae|g__Anaerostipes
Coprococcus genus 8 0 8 0.0078 0.0156000 Bacteria Firmicutes Clostridia Eubacteriales Lachnospiraceae Coprococcus NA 13 k__Bacteria|p__Firmicutes|c__Clostridia|o__Eubacteriales|f__Lachnospiraceae|g__Coprococcus
Roseburia genus 8 0 8 0.0078 0.0156000 Bacteria Firmicutes Clostridia Eubacteriales Lachnospiraceae Roseburia NA 10 k__Bacteria|p__Firmicutes|c__Clostridia|o__Eubacteriales|f__Lachnospiraceae|g__Roseburia
Faecalibacterium genus 8 0 8 0.0078 0.0156000 Bacteria Firmicutes Clostridia Eubacteriales Oscillospiraceae Faecalibacterium NA 13 k__Bacteria|p__Firmicutes|c__Clostridia|o__Eubacteriales|f__Oscillospiraceae|g__Faecalibacterium
Clostridium genus 7 2 5 0.4500 0.5625000 Bacteria Firmicutes Clostridia Eubacteriales Clostridiaceae Clostridium NA 8 k__Bacteria|p__Firmicutes|c__Clostridia|o__Eubacteriales|f__Clostridiaceae|g__Clostridium
Anaerobutyricum hallii species 6 2 4 0.6900 0.7300000 Bacteria Firmicutes Clostridia Eubacteriales Lachnospiraceae Anaerobutyricum Anaerobutyricum hallii 6 k__Bacteria|p__Firmicutes|c__Clostridia|o__Eubacteriales|f__Lachnospiraceae|g__Anaerobutyricum|s__Anaerobutyricum hallii
Dorea genus 6 0 6 0.0310 0.0516667 Bacteria Firmicutes Clostridia Eubacteriales Lachnospiraceae Dorea NA 11 k__Bacteria|p__Firmicutes|c__Clostridia|o__Eubacteriales|f__Lachnospiraceae|g__Dorea

Monte-Carlo simulation for increased abundance taxa

Just for the increased cc_gut_sigs1 for now. I am inclined to skip this analysis in favor of the clustering and binomial test analysis.

library(bugSigSimple)
gut.sigs.increased <- filter(cc_gut_sigs1, `Abundance in Group 1` == "increased") %>% 
  bugsigdbr::getSignatures(tax.id.type = "taxname")
 my.siglengths.inc <- sapply(gut.sigs.increased, length)

getCriticalN(gut.sigs.increased, my.siglengths.inc)
## 95% 
##  10
# Compare to observed - enterococcus & streptococcus are the only taxa that equal the critical limit
frequencySigs(gut.sigs.increased)
##            Enterococcus           Streptococcus      Enterobacteriaceae 
##                       8                       8                       5 
## Enterocloster citroniae         Enterococcaceae        Lactobacillaceae 
##                       5                       5                       5 
##                  Rothia          Actinobacteria               Atopobium 
##                       5                       4                       4 
##           Lactobacillus 
##                       4

Overall frequencies of taxa decreased in cases for case/control feces studies

createTaxonTable(cc_gut_sigs1, n=40)

nasopharyngeal microbiota analysis

Look specifically at case-control comparisons

library(dplyr)
healthy <- grepl(".*(control.*COVID|COVID.*control).*", naso_sigs$comparison1, ignore.case = TRUE)

cc_naso_sigs1 <- naso_sigs %>%
      filter(healthy == TRUE) %>%
  subset(Study != "Study 458")

Cluster analysis

Note, this EDA should really be done before hypothesis testing.

First calculate pairwise overlaps for all signatures of length > 1:

allsigs <- bugsigdbr::getSignatures(covid_all, tax.id.type = "taxname")
allsigs <- allsigs[sapply(allsigs, length) > 1] #require length > 1
dim(allsigs)
## NULL
mydists <- BugSigDBStats::calcPairwiseOverlaps(allsigs)
dim(mydists)
## [1] 1811    8

Create a matrix of Jaccard similarities (0 for no overlap, 1 for 100% overlap)

jmat <- BugSigDBStats::calcJaccardSimilarity(allsigs)
## Loading required package: grid
## ========================================
## ComplexHeatmap version 2.10.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
## 
## If you use it in published research, please cite:
## Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
##   genomic data. Bioinformatics 2016.
## 
## The new InteractiveComplexHeatmap package can directly export static 
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
siglengths <- sapply(allsigs, length)
ha <- HeatmapAnnotation(`Signature Length` = anno_barplot(siglengths))
hr <- rowAnnotation(`Signature Length` = anno_barplot(siglengths))
hm <- Heatmap(
  jmat,
  top_annotation = ha, left_annotation = hr,
  row_names_max_width = unit(20, "cm"),
  column_names_max_height = unit(20, "cm"),
  row_labels = sub(".+:", "", rownames(jmat)),
  column_labels = sub(".+:", "", colnames(jmat))
)
## The automatically generated colors map from the 1^st and 99^th of the
## values in the matrix. There are outliers in the matrix whose patterns
## might be hidden by this color mapping. You can manually set the color
## to `col` argument.
## 
## Use `suppressMessages()` to turn off this message.
hm

Use this interactively to make an interactive heatmap (not evaluated in vignette). Some expanding of the default size is required to see anything. Creating a sub-heatmap, then exporting it as a table, allows in-depth identification of the subgroups.

hc <- hclust(as.dist(jmat))
plot(hc)

This tree can be cut to show the clusters, for example. The clusters of more than 1 signature but less than ~10 signatures are most likely to be something interesting.

clusts <- sort(cutree(hc, h = 0.05))
lapply(unique(clusts), function(i) names(clusts)[clusts == i])
## [[1]]
##  [1] "bsdb:428/1/1_COVID-19:COVID-19-cases_vs_Healthy-controls_UP"                                                                                       
##  [2] "bsdb:428/1/2_COVID-19:COVID-19-cases_vs_Healthy-controls_DOWN"                                                                                     
##  [3] "bsdb:428/3/1_COVID-19:Recovered-COVID-19-cases_vs_Healthy-controls_UP"                                                                             
##  [4] "bsdb:430/1/1_COVID-19:Patients-with-high-SARS-CoV-2-infectivity_vs_Patients-with-low-to-none-SARS-CoV-2-infectivity_UP"                            
##  [5] "bsdb:441/1/1_COVID-19:COVID-19-cases_vs_Healthy-controls_UP"                                                                                       
##  [6] "bsdb:441/3/2_COVID-19:COVID-19-patients_vs_H1N1-patients_DOWN"                                                                                     
##  [7] "bsdb:449/1/2_COVID-19:COVID-19-cases_vs_Healthy-controls_DOWN"                                                                                     
##  [8] "bsdb:453/3/1_COVID-19:Recovered-COVID-19-patients_vs_Non-infected-patients-with-unrelated-respiratory-medical-conditions_UP"                       
##  [9] "bsdb:453/3/2_COVID-19:Recovered-COVID-19-patients_vs_Non-infected-patients-with-unrelated-respiratory-medical-conditions_DOWN"                     
## [10] "bsdb:458/1/1_COVID-19:COVID-19-patients_vs_Non-intubated-patients-with-non-incubation-viral-pneumonia-or-non-incubation-non-infectious-diseases_UP"
## [11] "bsdb:459/1/2_COVID-19:COVID-19-positive-patients_vs_COVID-19-negative-controls_UP"                                                                 
## [12] "bsdb:464/1/2_COVID-19:Confirmed-COVID-19-patients_vs_Healthy-controls_UP"                                                                          
## [13] "bsdb:474/1/2_COVID-19:COVID-19-patients_vs_Non-COVID-patients-with-URTI-or-COPD_DOWN"                                                              
## [14] "bsdb:479/2/2_COVID-19:Mild-moderate-COVID-19-patients_vs_Healthy-controls_UP"                                                                      
## [15] "bsdb:482/1/2_COVID-19:Severe-COVID-19-patients_vs_Mild-COVID-19-patients_UP"                                                                       
## [16] "bsdb:483/1/1_COVID-19:Antibiotic-naive-COVID-19-patients_vs_Uninfected-controls,-pneumonia-patients,-antibiotics-treated-COVID-19-patients_UP"     
## [17] "bsdb:483/3/1_COVID-19:Pneumonia-controls_vs_Uninfected-controls,-COVID-19-patients_DOWN"                                                           
## [18] "bsdb:484/1/1_COVID-19:COVID-19-positive-patients_vs_COVID-19-negative-patients-with-fever-and-cough_UP"                                            
## [19] "bsdb:485/1/1_COVID-19:COVID-19-patients-with-fever_vs_COVID-19-patients-without-fever_UP"                                                          
## [20] "bsdb:511/1/2_COVID-19:COVID-19-patients_vs_Healthy-controls_DOWN"                                                                                  
## [21] "bsdb:512/3/1_COVID-19:COVID-19-positive-patients_vs_COVID-19-negative-healthcare-workers-(controls)_UP"                                            
## 
## [[2]]
##  [1] "bsdb:428/2/1_COVID-19:Recovered-COVID-19-cases_vs_Healthy-controls_UP"                                                         
##  [2] "bsdb:428/2/2_COVID-19:Recovered-COVID-19-cases_vs_Healthy-controls_DOWN"                                                       
##  [3] "bsdb:441/1/2_COVID-19:COVID-19-cases_vs_Healthy-controls_DOWN"                                                                 
##  [4] "bsdb:441/4/1_COVID-19:COVID-19-cases_vs_Healthy-controls_UP"                                                                   
##  [5] "bsdb:453/1/1_COVID-19:SARS-CoV-2-infected-patients_vs_Non-infected-patients-with-unrelated-respiratory-medical-conditions_UP"  
##  [6] "bsdb:464/2/2_COVID-19:Recovered-COVID-19-patients_vs_Healthy-controls_UP"                                                      
##  [7] "bsdb:469/1/1_COVID-19:COVID-19-patients_vs_Non-COVID-19-controls_DOWN"                                                         
##  [8] "bsdb:474/1/1_COVID-19:COVID-19-patients_vs_Non-COVID-patients-with-URTI-or-COPD_UP"                                            
##  [9] "bsdb:476/1/1_COVID-19:Recovered-COVID-19-patients_vs_Healthy-controls_UP"                                                      
## [10] "bsdb:478/1/2_COVID-19:COVID-19-patients_vs_Healthy-controls_UP"                                                                
## [11] "bsdb:479/2/1_COVID-19:Mild-moderate-COVID-19-patients_vs_Healthy-controls_DOWN"                                                
## [12] "bsdb:481/1/1_COVID-19:COVID-19-patients_vs_Healthy-controls_UP"                                                                
## [13] "bsdb:487/1/1_COVID-19:Recovered-COVID-19-samples-(respiratory-negative)_vs_Infected-COVID-19-samples-(respiratory-positive)_UP"
## [14] "bsdb:498/1/2_COVID-19:COVID-19-patients_vs_Asymptomatic-uninfected-controls_DOWN"                                              
## [15] "bsdb:511/3/2_COVID-19:COVID-19-patients-not-treated-with-antibiotics_vs_Healthy-controls_DOWN"                                 
## [16] "bsdb:511/4/2_COVID-19:COVID-19-patients_vs_Healthy-controls_DOWN"                                                              
## [17] "bsdb:512/2/1_COVID-19:COVID-19-positive-patients_vs_COVID-19-negative-controls_UP"                                             
## 
## [[3]]
##  [1] "bsdb:428/3/2_COVID-19:Recovered-COVID-19-cases_vs_Healthy-controls_DOWN"                                                                             
##  [2] "bsdb:453/2/1_COVID-19:SARS-CoV-2-recovered-individuals_vs_SARS-CoV-2-infected-patients_DOWN"                                                         
##  [3] "bsdb:458/1/2_COVID-19:COVID-19-patients_vs_Non-intubated-patients-with-non-incubation-viral-pneumonia-or-non-incubation-non-infectious-diseases_DOWN"
##  [4] "bsdb:478/2/1_COVID-19:COVID-19-patients_vs_Flu-patients_DOWN"                                                                                        
##  [5] "bsdb:479/1/1_COVID-19:ICU-COVID-19-patients_vs_Healthy-controls_DOWN"                                                                                
##  [6] "bsdb:479/4/2_COVID-19:ICU-COVID-19-patients_vs_Patients-infected-with-other-human-coronaviruses_UP"                                                  
##  [7] "bsdb:481/2/1_COVID-19:Antibiotic-treated-COVID-19-patients_vs_Antibiotic-naive-COVID-19-patients_UP"                                                 
##  [8] "bsdb:481/3/1_COVID-19:Severe-COVID-19-patients_vs_Mild-COVID-19-patients_UP"                                                                         
##  [9] "bsdb:485/2/2_COVID-19:COVID-19-patients-with-fever_vs_COVID-19-patients-without-fever_DOWN"                                                          
## [10] "bsdb:487/2/1_COVID-19:Recovered-COVID-19-samples-(respiratory-negative)_vs_Healthy-controls_DOWN"                                                    
## 
## [[4]]
##  [1] "bsdb:428/4/1_COVID-19:COVID-19-patients-treated-with-antibiotics_vs_Healthy-controls_DOWN"                                     
##  [2] "bsdb:428/4/2_COVID-19:COVID-19-patients-treated-with-antibiotics_vs_Healthy-controls_UP"                                       
##  [3] "bsdb:430/1/2_COVID-19:Patients-with-high-SARS-CoV-2-infectivity_vs_Patients-with-low-to-none-SARS-CoV-2-infectivity_DOWN"      
##  [4] "bsdb:453/1/2_COVID-19:SARS-CoV-2-infected-patients_vs_Non-infected-patients-with-unrelated-respiratory-medical-conditions_DOWN"
##  [5] "bsdb:453/4/2_COVID-19:Positive-for-COVID-19-viral-RNA-in-feces_vs_Negative-for-COVID-19-viral-RNA-in-feces_DOWN"               
##  [6] "bsdb:464/2/1_COVID-19:Recovered-COVID-19-patients_vs_Healthy-controls_DOWN"                                                    
##  [7] "bsdb:477/1/1_COVID-19:Severe-COVID-19-patients_vs_COVID-19-negative-controls_UP"                                               
##  [8] "bsdb:479/3/1_COVID-19:ICU-COVID-19-patients_vs_Mild-Moderate-COVID-19-patients_DOWN"                                           
##  [9] "bsdb:484/4/1_COVID-19:COVID-19-positive-patients_vs_COVID-19-negative-patients-with-fever-and-cough_UP"                        
## [10] "bsdb:498/1/1_COVID-19:COVID-19-patients_vs_Asymptomatic-uninfected-controls_UP"                                                
## 
## [[5]]
## [1] "bsdb:441/3/1_COVID-19:COVID-19-patients_vs_H1N1-patients_UP"                                          
## [2] "bsdb:473/1/1_COVID-19:COVID-19-positive-patients_vs_COVID-19-negative-controls_DOWN"                  
## [3] "bsdb:478/3/1_COVID-19:Flu-patients_vs_Healthy-controls_UP"                                            
## [4] "bsdb:479/1/2_COVID-19:ICU-COVID-19-patients_vs_Healthy-controls_UP"                                   
## [5] "bsdb:481/2/2_COVID-19:Antibiotic-treated-COVID-19-patients_vs_Antibiotic-naive-COVID-19-patients_DOWN"
## [6] "bsdb:500/1/1_COVID-19:COVID-19-patients_vs_Non-COVID-19-pneumonia-patients_UP"                        
## [7] "bsdb:511/2/1_COVID-19:COVID-19-patients-(severe/critical)_vs_COVID-19-patients-(mild/moderate)_DOWN"  
## 
## [[6]]
## [1] "bsdb:449/1/1_COVID-19:COVID-19-cases_vs_Healthy-controls_UP"                                                                 
## [2] "bsdb:453/2/2_COVID-19:SARS-CoV-2-recovered-individuals_vs_SARS-CoV-2-infected-patients_UP"                                   
## [3] "bsdb:459/1/1_COVID-19:COVID-19-positive-patients_vs_COVID-19-negative-controls_DOWN"                                         
## [4] "bsdb:478/5/1_COVID-19:Flu-patients_vs_Healthy-controls_DOWN"                                                                 
## [5] "bsdb:481/1/2_COVID-19:COVID-19-patients_vs_Healthy-controls_DOWN"                                                            
## [6] "bsdb:486/1/2_COVID-19:Asymptomatic-COVID-19-patients_vs_Healthy-controls_DOWN"                                               
## [7] "bsdb:486/2/1_COVID-19:Asymptomatic-COVID-19-patients_vs_Healthy-controls_UP"                                                 
## [8] "bsdb:489/1/1_COVID-19:COVID-positive-patients-admitted-to-ICU-(i-COVID-19)_vs_Pneumonia-control-patients-(COVID-negative)_UP"
## [9] "bsdb:513/1/1_COVID-19:COVID-19-positive-patients_vs_COVID-19-negative-controls_UP"                                           
## 
## [[7]]
## [1] "bsdb:453/4/1_COVID-19:Positive-for-COVID-19-viral-RNA-in-feces_vs_Negative-for-COVID-19-viral-RNA-in-feces_UP"                                            
## [2] "bsdb:464/3/2_COVID-19:COVID-19-patients_vs_Healthy-controls_UP"                                                                                           
## [3] "bsdb:476/1/2_COVID-19:Recovered-COVID-19-patients_vs_Healthy-controls_DOWN"                                                                               
## [4] "bsdb:489/2/1_COVID-19:COVID-positive-patients-admitted-to-ICU-(i-COVID-19)_vs_COVID-positive-patients-admitted-to-infectious-disease-ward-(w-COVID-19)_UP"
## [5] "bsdb:498/2/1_COVID-19:COVID-19-patients-with-high-viral-load_vs_COVID-19-patients-with-low-viral-load_UP"                                                 
## 
## [[8]]
## [1] "bsdb:464/1/1_COVID-19:Confirmed-COVID-19-patients_vs_Healthy-controls_DOWN"                                                                    
## [2] "bsdb:478/4/2_COVID-19:COVID-19-patients_vs_Healthy-controls_UP"                                                                                
## [3] "bsdb:479/3/2_COVID-19:ICU-COVID-19-patients_vs_Mild-Moderate-COVID-19-patients_UP"                                                             
## [4] "bsdb:483/2/1_COVID-19:Antibiotic-treated-COVID-19-patients_vs_Antibiotic-naive-COVID-19-patients,-pneumonia-patients,-uninfected-controls_DOWN"
## [5] "bsdb:485/1/2_COVID-19:COVID-19-patients-with-fever_vs_COVID-19-patients-without-fever_DOWN"                                                    
## 
## [[9]]
## [1] "bsdb:464/3/1_COVID-19:COVID-19-patients_vs_Healthy-controls_DOWN"                                                                          
## [2] "bsdb:478/1/1_COVID-19:COVID-19-patients_vs_Healthy-controls_DOWN"                                                                          
## [3] "bsdb:478/5/2_COVID-19:Flu-patients_vs_Healthy-controls_UP"                                                                                 
## [4] "bsdb:489/3/2_COVID-19:COVID-positive-patients-admitted-to-infectious-disease-ward-(w-COVID-19)_vs_Pneumonia-controls-(COVID-negative)_DOWN"
## [5] "bsdb:511/5/1_COVID-19:COVID-19-patients-not-treated-with-antibiotics_vs_Healthy-controls_UP"                                               
## 
## [[10]]
## [1] "bsdb:478/3/2_COVID-19:Flu-patients_vs_Healthy-controls_DOWN"                                         
## [2] "bsdb:479/4/1_COVID-19:ICU-COVID-19-patients_vs_Patients-infected-with-other-human-coronaviruses_DOWN"
## [3] "bsdb:482/1/1_COVID-19:Severe-COVID-19-patients_vs_Mild-COVID-19-patients_DOWN"                       
## [4] "bsdb:485/2/1_COVID-19:COVID-19-patients-with-fever_vs_COVID-19-patients-without-fever_UP"            
## 
## [[11]]
## [1] "bsdb:478/4/1_COVID-19:COVID-19-patients_vs_Healthy-controls_DOWN"                                          
## [2] "bsdb:486/1/1_COVID-19:Asymptomatic-COVID-19-patients_vs_Healthy-controls_UP"                               
## [3] "bsdb:498/2/2_COVID-19:COVID-19-patients-with-high-viral-load_vs_COVID-19-patients-with-low-viral-load_DOWN"
## [4] "bsdb:511/3/1_COVID-19:COVID-19-patients-not-treated-with-antibiotics_vs_Healthy-controls_UP"               
## [5] "bsdb:511/5/2_COVID-19:COVID-19-patients-not-treated-with-antibiotics_vs_Healthy-controls_DOWN"             
## [6] "bsdb:547/1/1_COVID-19:Recovered-COVID-19-patients_vs_Healthy-controls_UP"                                  
## 
## [[12]]
## [1] "bsdb:486/2/2_COVID-19:Asymptomatic-COVID-19-patients_vs_Healthy-controls_DOWN"                           
## [2] "bsdb:511/1/1_COVID-19:COVID-19-patients_vs_Healthy-controls_UP"                                          
## [3] "bsdb:512/1/1_COVID-19:COVID-19-positive-patients_vs_COVID-19-negative-controls-and-healthcare-workers_UP"
## 
## [[13]]
## [1] "bsdb:486/3/1_COVID-19:Severe-COVID-19-patients_vs_Healthy-controls_UP"                                                         
## [2] "bsdb:489/1/2_COVID-19:COVID-positive-patients-admitted-to-ICU-(i-COVID-19)_vs_Pneumonia-control-patients-(COVID-negative)_DOWN"
## [3] "bsdb:496/2/1_COVID-19:COVID-19-patients_vs_Seasonal-flu-patients_UP"                                                           
## 
## [[14]]
## [1] "bsdb:486/3/2_COVID-19:Severe-COVID-19-patients_vs_Healthy-controls_DOWN"
## [2] "bsdb:511/4/1_COVID-19:COVID-19-patients_vs_Healthy-controls_UP"         
## 
## [[15]]
## [1] "bsdb:487/2/2_COVID-19:Recovered-COVID-19-samples-(respiratory-negative)_vs_Healthy-controls_UP"                                          
## [2] "bsdb:487/3/1_COVID-19:Infected-COVID-19-samples-(respiratory-positive)_vs_Healthy-controls_DOWN"                                         
## [3] "bsdb:496/2/2_COVID-19:COVID-19-patients_vs_Seasonal-flu-patients_DOWN"                                                                   
## [4] "bsdb:513/2/1_COVID-19:COVID-19-positive-patients-with-breathing-assistance_vs_COVID-19-positive-patients-without-breathing-assistance_UP"
## 
## [[16]]
## [1] "bsdb:487/3/2_COVID-19:Infected-COVID-19-samples-(respiratory-positive)_vs_Healthy-controls_UP"
## [2] "bsdb:500/1/2_COVID-19:COVID-19-patients_vs_Non-COVID-19-pneumonia-patients_DOWN"              
## 
## [[17]]
## [1] "bsdb:489/2/2_COVID-19:COVID-positive-patients-admitted-to-ICU-(i-COVID-19)_vs_COVID-positive-patients-admitted-to-infectious-disease-ward-(w-COVID-19)_DOWN"
## [2] "bsdb:546/1/2_COVID-19:COVID-19-cases_vs_Non-infected-controls_DOWN"                                                                                         
## 
## [[18]]
## [1] "bsdb:489/3/1_COVID-19:COVID-positive-patients-admitted-to-infectious-disease-ward-(w-COVID-19)_vs_Pneumonia-controls-(COVID-negative)_UP"
## [2] "bsdb:496/1/2_COVID-19:COVID-19-patients_vs_Healthy-controls_DOWN"                                                                        
## [3] "bsdb:548/1/1_COVID-19:COVID-19-cases_vs_Healthy-controls_UP"                                                                             
## 
## [[19]]
## [1] "bsdb:496/1/1_COVID-19:COVID-19-patients_vs_Healthy-controls_UP"            
## [2] "bsdb:547/1/2_COVID-19:Recovered-COVID-19-patients_vs_Healthy-controls_DOWN"
## [3] "bsdb:548/1/2_COVID-19:COVID-19-cases_vs_Healthy-controls_DOWN"

Create a wide-format dataframe

This would be suitable for regression analysis.

covid_withsigs <- filter(covid_all, !is.na(covid_all$`NCBI Taxonomy IDs`))
sigs <- bugsigdbr::getSignatures(covid_withsigs, tax.id.type = "taxname")
cmat <- t(safe::getCmatrix(sigs, as.matrix = TRUE, min.size = 0, prune = FALSE))
## WARNING: rows are sorted elements of keyword.list
## 132 categories formed
cdf <- data.frame(cmat, stringsAsFactors = FALSE, check.names = FALSE)
cdf <- cbind(covid_withsigs, cdf)
colnames(cdf)[1:54]
##  [1] "Study"                            "Study design"                    
##  [3] "PMID"                             "DOI"                             
##  [5] "URL"                              "Authors"                         
##  [7] "Title"                            "Journal"                         
##  [9] "Year"                             "Experiment"                      
## [11] "Location of subjects"             "Host species"                    
## [13] "Body site"                        "UBERON ID"                       
## [15] "Condition"                        "EFO ID"                          
## [17] "Group 0 name"                     "Group 1 name"                    
## [19] "Group 1 definition"               "Group 0 sample size"             
## [21] "Group 1 sample size"              "Antibiotics exclusion"           
## [23] "Sequencing type"                  "16S variable region"             
## [25] "Sequencing platform"              "Statistical test"                
## [27] "Significance threshold"           "MHT correction"                  
## [29] "LDA Score above"                  "Matched on"                      
## [31] "Confounders controlled for"       "Pielou"                          
## [33] "Shannon"                          "Chao1"                           
## [35] "Simpson"                          "Inverse Simpson"                 
## [37] "Richness"                         "Signature page name"             
## [39] "Source"                           "Curated date"                    
## [41] "Curator"                          "Revision editor"                 
## [43] "Description"                      "Abundance in Group 1"            
## [45] "MetaPhlAn taxon names"            "NCBI Taxonomy IDs"               
## [47] "State"                            "Reviewer"                        
## [49] "studyexp"                         "site"                            
## [51] "comparison1"                      "[Brevibacterium] frigoritolerans"
## [53] "[Clostridium] colinum"            "[Clostridium] scindens"

Note this has a number of columns that are mostly zeros, it could be filtered significantly for any regression or machine learning analysis:

table(cdf[["[Brevibacterium] frigoritolerans"]])
## 
##   0   1 
## 131   1

Create another heatmap on correlations of presence/absence of taxa. This is not necessary because the previous Jaccard Index heatmap is probably better, it is just a demonstration of doing something with the taxa presence/absence directly.

sigcors <- cor(t(cmat))
siglengths <- sapply(sigs, length)
ha <- HeatmapAnnotation(`Signature Length` = anno_barplot(siglengths))
hr <- rowAnnotation(`Signature Length` = anno_barplot(siglengths))
hm <- Heatmap(
  sigcors,
  top_annotation = ha, left_annotation = hr,
  row_names_max_width = unit(20, "cm"),
  column_names_max_height = unit(20, "cm"),
  row_labels = sub(".+:", "", rownames(sigcors)), ##removing study just to make signature names legible
  column_labels = sub(".+:", "", colnames(sigcors))
)
hm

Use this interactively to make an interactive heatmap: