Reading data

Get bulk export from bugsigdb.org:

full.dat <- bugsigdbr::importBugSigDB(version = "devel", cache = FALSE)
dim(full.dat)
## [1] 2838   49
colnames(full.dat)
##  [1] "BSDB ID"                    "Study"                     
##  [3] "Study design"               "PMID"                      
##  [5] "DOI"                        "URL"                       
##  [7] "Authors list"               "Title"                     
##  [9] "Journal"                    "Year"                      
## [11] "Experiment"                 "Location of subjects"      
## [13] "Host species"               "Body site"                 
## [15] "UBERON ID"                  "Condition"                 
## [17] "EFO ID"                     "Group 0 name"              
## [19] "Group 1 name"               "Group 1 definition"        
## [21] "Group 0 sample size"        "Group 1 sample size"       
## [23] "Antibiotics exclusion"      "Sequencing type"           
## [25] "16S variable region"        "Sequencing platform"       
## [27] "Statistical test"           "Significance threshold"    
## [29] "MHT correction"             "LDA Score above"           
## [31] "Matched on"                 "Confounders controlled for"
## [33] "Pielou"                     "Shannon"                   
## [35] "Chao1"                      "Simpson"                   
## [37] "Inverse Simpson"            "Richness"                  
## [39] "Signature page name"        "Source"                    
## [41] "Curated date"               "Curator"                   
## [43] "Revision editor"            "Description"               
## [45] "Abundance in Group 1"       "MetaPhlAn taxon names"     
## [47] "NCBI Taxonomy IDs"          "State"                     
## [49] "Reviewer"

Stripping illformed entries:

is.study <- grepl("^Study [0-9]+$", full.dat[["Study"]])
is.exp <- grepl("^Experiment [0-9]+$", full.dat[["Experiment"]])
full.dat <- full.dat[is.study & is.exp, ]

Curation output

Number of papers and signatures curated:

pmids <- unique(full.dat[,"PMID"])
length(pmids)
## [1] 705
nrow(full.dat)
## [1] 2838

Publication date of the curated papers:

pmids <- pmids[!is.na(pmids)]
pubyear1 <- pmid2pubyear(pmids[1:361])
pubyear2 <- pmid2pubyear(pmids[362:length(pmids)])
pubyear <- c(pubyear1, pubyear2)
head(cbind(pmids, pubyear))
##      pmids      pubyear
## [1,] "28038683" "2016" 
## [2,] "28173873" "2017" 
## [3,] "27015276" "2016" 
## [4,] "27625705" "2016" 
## [5,] "23071781" "2012" 
## [6,] "28467925" "2017"
tab <- table(pubyear)
tab <- tab[-length(tab)]
tab <- tab[order(as.integer(names(tab)))]
df <- data.frame(year = names(tab), papers = as.integer(tab))
ggbarplot(df, x = "year", y = "papers", 
          label = TRUE, fill = "steelblue",
          ggtheme = theme_bw())

Stripping empty signatures:

ind1 <- lengths(full.dat[["MetaPhlAn taxon names"]]) > 0
ind2 <- lengths(full.dat[["NCBI Taxonomy IDs"]]) > 0
dat <- full.dat[ind1 & ind2,]
nrow(dat)
## [1] 2838

Papers containing only empty UP and DOWN signatures (under curation?):

setdiff(pmids, unique(dat[,"PMID"]))
## numeric(0)

Progress over time:

dat[,"Curated date"] <- as.character(lubridate::dmy(dat[,"Curated date"]))
plotProgressOverTime(dat)

plotProgressOverTime(dat, diff = TRUE)

Stratified by curator:

npc <- stratifyByCurator(dat)
plotCuratorStats(dat, npc)

Number of complete and revised signatures:

table(df[["State"]])
## < table of extent 0 >
table(dat[,"Revision editor"])
## 
##                                   Adi13                               Aiyshaaaa 
##                                       8                                      19 
##                     Aiyshaaaa,WikiWorks               Aiyshaaaa,WikiWorks,Merit 
##                                       1                                       4 
##                             Annabelcute                                Atrayees 
##                                       3                                      11 
##         Atrayees,Aiyshaaaa,Claregrieve1                   Atrayees,Claregrieve1 
##                                       1                                       2 
##          Atrayees,Lwaldron,Claregrieve1                      Atrayees,WikiWorks 
##                                       1                                      20 
##                Atrayees,WikiWorks,Merit                     Barakat Dindi,Chloe 
##                                       2                                       2 
##                             BLESSING123                       BLESSING123,Chloe 
##                                      15                                       2 
##                                   Brian                           Brian,Suwaiba 
##                                       2                                       2 
##                                  Busayo                           Busayo,Fatima 
##                                       1                                       1 
##                         Busayo,Mcarlson                                  Chioma 
##                                       2                                       4 
##                           Chioma,Fatima                                   Chloe 
##                                       2                                       7 
##                         Chloe,Kwekuamoo                             Chloe,Merit 
##                                       1                                       1 
##                         Chloe,WikiWorks                        Chondatondaponda 
##                                       4                                      10 
##                            Claregrieve1                  Claregrieve1,Aiyshaaaa 
##                                     152                                       2 
##                   Claregrieve1,Atrayees         Claregrieve1,Atrayees,WikiWorks 
##                                       1                                       2 
##   Claregrieve1,Atrayees,WikiWorks,Merit                      Claregrieve1,Chloe 
##                                       1                                       1 
##      Claregrieve1,Chloe,WikiWorks,Merit                     Claregrieve1,Fatima 
##                                       1                                      10 
##        Claregrieve1,Fatima,LGeistlinger             Claregrieve1,Fatima,Yu Wang 
##                                       1                                       1 
##                   Claregrieve1,Lwaldron           Claregrieve1,Lwaldron,Suwaiba 
##                                       2                                       1 
##                      Claregrieve1,Merit            Claregrieve1,Merit,WikiWorks 
##                                       8                                       2 
##            Claregrieve1,Rukky,WikiWorks                    Claregrieve1,Suwaiba 
##                                       1                                       1 
##              Claregrieve1,Suwaiba,Merit                  Claregrieve1,WikiWorks 
##                                       1                                     264 
##            Claregrieve1,WikiWorks,Merit                                Cyberian 
##                                       6                                       4 
##                          Cyberian,Chloe                        Cynthia Anderson 
##                                       2                                      28 
##           Cynthia Anderson,Claregrieve1                 Cynthia Anderson,Fatima 
##                                       2                                       2 
## Cynthia Anderson,LGeistlinger,WikiWorks     Cynthia Anderson,Lwaldron,WikiWorks 
##                                       2                                       1 
##         Danyab56,Aiyshaaaa,Claregrieve1                                  Deacme 
##                                       5                                       4 
##                        Deacme,Aiyshaaaa                         Deacme,Atrayees 
##                                       8                                       2 
##                                    Dupe                          Dupe,Aiyshaaaa 
##                                       4                                       1 
##                           Dupe,Atrayees                           Dupe,Mcarlson 
##                                       2                                       2 
##                             Ellajessica                   Ellajessica,Aiyshaaaa 
##                                       2                                       2 
##                                  Fatima                        Fatima,Aiyshaaaa 
##                                      21                                       1 
##                     Fatima,Claregrieve1           Fatima,Claregrieve1,WikiWorks 
##                                       2                                      11 
##              Fatima,Kwekuamoo,WikiWorks           Fatima,LGeistlinger,WikiWorks 
##                                       2                                       1 
##               Fatima,Lwaldron,WikiWorks                  Fatima,Merit,WikiWorks 
##                                       3                                       3 
##                        Fatima,WikiWorks                  Fatima,WikiWorks,Merit 
##                                      40                                       3 
##                                Fcuevas3                      Fcuevas3,Aiyshaaaa 
##                                      37                                       2 
##                   Fcuevas3,Claregrieve1                         Fcuevas3,Fatima 
##                                       4                                       1 
##             Fcuevas3,Lwaldron,Aiyshaaaa                         Fcuevas3,Rimsha 
##                                       2                                       8 
##                                    Gina                                Haoyanzh 
##                                      14                                      20 
##                       Haoyanzh,Lwaldron                             Itslanapark 
##                                       2                                      27 
##                   Itslanapark,Aiyshaaaa                    Itslanapark,Atrayees 
##                                       5                                       1 
##                       Itslanapark,Chloe       Itslanapark,Chloe,Aiyshaaaa,Merit 
##                                       3                                       1 
##                Itslanapark,Claregrieve1      Itslanapark,Claregrieve1,Aiyshaaaa 
##                                       4                                       1 
##       Itslanapark,Claregrieve1,Atrayees                      Itslanapark,Fatima 
##                                       1                                       2 
##          Itslanapark,Fatima,Chloe,Merit                      Itslanapark,Rimsha 
##                                       1                                       1 
##                         Jacquelynshevin                   Jacquelynshevin,Chloe 
##                                      29                                       1 
##         Jacquelynshevin,Chloe,WikiWorks                  Jacquelynshevin,Fatima 
##                                       1                                       7 
##                                 Jeshudy                       Jeshudy,Aiyshaaaa 
##                                      55                                       3 
##                    Jeshudy,Claregrieve1                          Jeshudy,Fatima 
##                                      38                                       4 
##                         Jeshudy,Suwaiba                                 Joyessa 
##                                       2                                       1 
##                       Joyessa,Aiyshaaaa                    Joyessa,Claregrieve1 
##                                       3                                      18 
##              Joyessa,Claregrieve1,Merit             Joyessa,Fatima,Claregrieve1 
##                                       1                                       2 
##       Kahvecirem,Aiyshaaaa,Claregrieve1 Kahvecirem,Aiyshaaaa,Merit,Claregrieve1 
##                                       2                                       3 
##                     Kahvecirem,Atrayees  Kahvecirem,Atrayees,Merit,Claregrieve1 
##                                       2                                       1 
##           Kahvecirem,Merit,Claregrieve1                          Kaluifeanyi101 
##                                       2                                      54 
##                Kaluifeanyi101,Aiyshaaaa             Kaluifeanyi101,Claregrieve1 
##                                       2                                      16 
##                   Kaluifeanyi101,Fatima                  KathyWaldron,WikiWorks 
##                                       2                                       3 
##            KathyWaldron,WikiWorks,Merit                           Kelvin Joseph 
##                                       1                                      11 
##              Kelvin Joseph,Claregrieve1                    Khadeeejah,Aiyshaaaa 
##                                       2                                       4 
##              Khadeeejah,Aiyshaaaa,Chloe                     Khadeeejah,Atrayees 
##                                       1                                       2 
##               Khadeeejah,Atrayees,Chloe                               Kwekuamoo 
##                                       1                                      25 
##                  Kwekuamoo,Claregrieve1               Kwekuamoo,Merit,WikiWorks 
##                                       4                                       1 
##                Levitest,WikiWorks,Merit                     Lorakasselman,Chloe 
##                                       1                                       2 
##              Lorakasselman,Claregrieve1                                Lwaldron 
##                                       2                                      11 
##   Lwaldron,Atrayees,WikiWorks,Aiyshaaaa         Lwaldron,Claregrieve1,WikiWorks 
##                                       1                                       5 
##   Lwaldron,Claregrieve1,WikiWorks,Merit               Lwaldron,Fatima,WikiWorks 
##                                       2                                       1 
##                      Lwaldron,WikiWorks         Lwaldron,WikiWorks,LGeistlinger 
##                                      30                                       1 
##                Lwaldron,WikiWorks,Merit                           Madhubani Dey 
##                                       4                                      13 
##                 Madhubani Dey,Aiyshaaaa               Madhubani Dey,Chloe,Merit 
##                                       1                                       1 
##              Madhubani Dey,Claregrieve1       Madhubani Dey,Fatima,Claregrieve1 
##                                      26                                       2 
##                  Madhubani Dey,Lwaldron                     Madhubani Dey,Merit 
##                                       1                                       2 
##                                 Manuela                          Mary Bearkland 
##                                      11                                      24 
##                Mary Bearkland,Aiyshaaaa   Mary Bearkland,Aiyshaaaa,Claregrieve1 
##                                       1                                       1 
##             Mary Bearkland,Claregrieve1                   Mary Bearkland,Fatima 
##                                      20                                       3 
##             Mary Bearkland,Fatima,Merit                    Mary Bearkland,Merit 
##                                       1                                       6 
##                     Maryemzaki,Lwaldron                                   Merit 
##                                       2                                       2 
##                         Merit,Aiyshaaaa                      Merit,Claregrieve1 
##                                       1                                       1 
##                         Merit,WikiWorks                                  Mmarin 
##                                      19                                      21 
##                         Mmarin,Atrayees                     Mmarin,Claregrieve1 
##                                       1                                      13 
##                                  Nice25                   Nnadichioma,Aiyshaaaa 
##                                       1                                       1 
##          Nnadichioma,Aiyshaaaa,Atrayees             Nnadichioma,Aiyshaaaa,Merit 
##                                       2                                       2 
##    Nnadichioma,Atrayees,Aiyshaaaa,Merit                                  Ombati 
##                                       1                                       3 
##                         Ombati,Atrayees                            Ombati,Chloe 
##                                       2                                       1 
##                                  Rimsha    Rimsha,Fatima,LGeistlinger,WikiWorks 
##                                       4                                       1 
##                 Rimsha,Fatima,WikiWorks                         Rimsha,Lwaldron 
##                                       1                                       1 
##                             Samara.Khan                Samara.Khan,Claregrieve1 
##                                      38                                       9 
##                      Samara.Khan,Fatima                               Sharmilac 
##                                       1                                       5 
##                     Sharmilac,Aiyshaaaa                  Sharmilac,Claregrieve1 
##                                       1                                       2 
##                        Sharmilac,Fatima              Sharmilac,Fatima,Aiyshaaaa 
##                                       6                                       1 
##                         Sharmilac,Merit                                   Sophy 
##                                       2                                       9 
##            Sophy,Aiyshaaaa,Claregrieve1                             Sophy,Chloe 
##                                       4                                       3 
##                      Sophy,Claregrieve1                 Sophy,Mcarlson,Atrayees 
##                                       4                                       2 
##                                 Suwaiba                                  Tislam 
##                                      23                                      29 
##                        Tislam,Aiyshaaaa           Tislam,Aiyshaaaa,Claregrieve1 
##                                       2                                       1 
##                         Tislam,Atrayees                     Tislam,Claregrieve1 
##                                       2                                       6 
##                           Tislam,Fatima              Tislam,Fatima,Claregrieve1 
##                                       8                                       1 
##        Tislam,Rimsha,Claregrieve1,Merit                                   Titas 
##                                       2                                      11 
##                            Titas,Fatima                          Titas,Lwaldron 
##                                       1                                       1 
##                            Ufuoma Ejite            Ufuoma Ejite,Chloe,Aiyshaaaa 
##                                       9                                       1 
##   Ufuoma Ejite,Chloe,Aiyshaaaa,Atrayees         Ufuoma Ejite,Lwaldron,WikiWorks 
##                                       1                                       7 
##                               Valentina                               WikiWorks 
##                                       2                                    1092 
##                     WikiWorks,Aiyshaaaa                      WikiWorks,Atrayees 
##                                       3                                       5 
##                WikiWorks,Atrayees,Merit                  WikiWorks,Claregrieve1 
##                                       4                                      12 
##                        WikiWorks,Fatima                       WikiWorks,Jeshudy 
##                                       2                                       1 
##                         WikiWorks,Merit                         WikiWorks,Rukky 
##                                      29                                       3 
##                       WikiWorks,Suwaiba                          Yu Wang,Fatima 
##                                       1                                       1 
##             Yu Wang,Fatima,Claregrieve1 
##                                       4

Study stats

Study design

spl <- split(dat[["Study"]], dat[["Study design"]])
sds <- lapply(spl, unique)
sort(lengths(sds), decreasing = FALSE)
##                                case-control,prospective cohort 
##                                                              1 
## laboratory experiment,time series / longitudinal observational 
##                                                              1 
##                                     case-control,meta-analysis 
##                                                              5 
##                                                  meta-analysis 
##                                                              5 
##                                    randomized controlled trial 
##                                                             30 
##                                          laboratory experiment 
##                                                             32 
##                       time series / longitudinal observational 
##                                                             59 
##                                             prospective cohort 
##                                                             68 
##                cross-sectional observational, not case-control 
##                                                            205 
##                                                   case-control 
##                                                            314

Experiment stats

Columns of the full dataset that describe experiments:

# Experiment ID
exp.cols <- c("Study", "Experiment")

# Subjects
sub.cols <- c("Host species",    
              "Location of subjects", 
              "Body site",
              "Condition", 
              "Antibiotics exclusion",
              "Group 0 sample size",
              "Group 1 sample size")

# Lab analysis              
lab.cols <-  c("Sequencing type",
              "16S variable region",
              "Sequencing platform")

# Statistical analysis
stat.cols <-  c("Statistical test",
              "MHT correction",
              "Significance threshold")

# Alpha diversity
div.cols <- c("Pielou",
              "Shannon",
              "Chao1",
              "Simpson", 
              "Inverse Simpson",
              "Richness")

Restrict dataset to experiment information:

exps <- dat[,c(exp.cols, sub.cols, lab.cols, stat.cols, div.cols)]
exps <- unique(exps)

Subjects

Number of experiments for the top 10 categories for each subjects column:

sub.tab <- lapply(sub.cols[1:5], tabCol, df = exps, n = 10)
names(sub.tab) <- sub.cols[1:5]
sub.tab
## $`Host species`
## 
##      Homo sapiens      Mus musculus Rattus norvegicus 
##              1542                70                 6 
## 
## $`Location of subjects`
## 
##                    China United States of America                    Italy 
##                      492                      417                       67 
##                    Spain                    Japan              South Korea 
##                       53                       48                       44 
##                  Finland                   Canada              Netherlands 
##                       36                       29                       29 
##                   Brazil 
##                       25 
## 
## $`Body site`
## 
##                     Feces                    Saliva                    Vagina 
##                       957                        96                        50 
##            Uterine cervix                     Mouth              Skin of body 
##                        40                        30                        29 
##               Nasopharynx                  Meconium                 Intestine 
##                        28                        27                        26 
## Subgingival dental plaque 
##                        24 
## 
## $Condition
## 
##                                      obesity 
##                                          107 
##                                     COVID-19 
##                                           89 
##                            colorectal cancer 
##                                           81 
##                          antimicrobial agent 
##                                           77 
##                                         diet 
##                                           56 
##                          Parkinson's disease 
##                                           45 
##              human papilloma virus infection 
##                                           41 
## cervical glandular intraepithelial neoplasia 
##                                           38 
##                               gastric cancer 
##                                           36 
##                                endometriosis 
##                                           35 
## 
## $`Antibiotics exclusion`
## 
##       3 months        1 month       6 months        4 weeks       2 months 
##            199            117             77             58             57 
##        2 weeks           None        8 weeks None specified         5 days 
##             31             15             12             12             10

Proportions instead:

sub.tab <- lapply(sub.cols[1:5], tabCol, df = exps, n = 10, perc = TRUE)
names(sub.tab) <- sub.cols[1:5]
sub.tab
## $`Host species`
## 
##      Homo sapiens      Mus musculus Rattus norvegicus 
##           0.95300           0.04330           0.00371 
## 
## $`Location of subjects`
## 
##                    China United States of America                    Italy 
##                   0.3060                   0.2590                   0.0416 
##                    Spain                    Japan              South Korea 
##                   0.0329                   0.0298                   0.0273 
##                  Finland                   Canada              Netherlands 
##                   0.0224                   0.0180                   0.0180 
##                   Brazil 
##                   0.0155 
## 
## $`Body site`
## 
##                     Feces                    Saliva                    Vagina 
##                    0.5910                    0.0593                    0.0309 
##            Uterine cervix                     Mouth              Skin of body 
##                    0.0247                    0.0185                    0.0179 
##               Nasopharynx                  Meconium                 Intestine 
##                    0.0173                    0.0167                    0.0161 
## Subgingival dental plaque 
##                    0.0148 
## 
## $Condition
## 
##                                      obesity 
##                                       0.0665 
##                                     COVID-19 
##                                       0.0553 
##                            colorectal cancer 
##                                       0.0503 
##                          antimicrobial agent 
##                                       0.0479 
##                                         diet 
##                                       0.0348 
##                          Parkinson's disease 
##                                       0.0280 
##              human papilloma virus infection 
##                                       0.0255 
## cervical glandular intraepithelial neoplasia 
##                                       0.0236 
##                               gastric cancer 
##                                       0.0224 
##                                endometriosis 
##                                       0.0218 
## 
## $`Antibiotics exclusion`
## 
##       3 months        1 month       6 months        4 weeks       2 months 
##        0.19800        0.11600        0.07660        0.05770        0.05670 
##        2 weeks           None        8 weeks None specified         5 days 
##        0.03080        0.01490        0.01190        0.01190        0.00995

Sample size:

ssize <- apply(exps[,sub.cols[6:7]], 2, summary)
ssize
##         Group 0 sample size Group 1 sample size
## Min.                0.00000             1.00000
## 1st Qu.            13.00000            13.00000
## Median             27.00000            24.00000
## Mean               68.47422            81.77897
## 3rd Qu.            53.50000            46.00000
## Max.             1883.00000          9623.00000
## NA's               50.00000            42.00000

Lab analysis

Number of experiments for the top 10 categories for each lab analysis column:

lab.tab <- lapply(lab.cols, tabCol, df = exps, n = 10)
names(lab.tab) <- lab.cols
lab.tab
## $`Sequencing type`
## 
##        16S        WMS        PCR ITS / ITS2 
##       1464        136          4          1 
## 
## $`16S variable region`
## 
##   34    4  123   12  345   45    3   56 1234   23 
##  487  377  125   76   57   56   38   18   13   11 
## 
## $`Sequencing platform`
## 
##                    Illumina                    Roche454 
##                        1118                         232 
##                 Ion Torrent                     RT-qPCR 
##                          94                          84 
## Human Intestinal Tract Chip                 MGISEQ-2000 
##                          12                          11 
##           Mass spectrometry                      Sanger 
##                           7                           6 
##           HTF-Microbi.Array        Non-quantitative PCR 
##                           4                           4

Proportions instead:

lab.tab <- lapply(lab.cols, tabCol, df = exps, n = 10, perc = TRUE)
names(lab.tab) <- lab.cols
lab.tab
## $`Sequencing type`
## 
##        16S        WMS        PCR ITS / ITS2 
##   0.912000   0.084700   0.002490   0.000623 
## 
## $`16S variable region`
## 
##      34       4     123      12     345      45       3      56    1234      23 
## 0.37500 0.29100 0.09640 0.05860 0.04390 0.04320 0.02930 0.01390 0.01000 0.00848 
## 
## $`Sequencing platform`
## 
##                    Illumina                    Roche454 
##                     0.70400                     0.14600 
##                 Ion Torrent                     RT-qPCR 
##                     0.05920                     0.05290 
## Human Intestinal Tract Chip                 MGISEQ-2000 
##                     0.00755                     0.00692 
##           Mass spectrometry                      Sanger 
##                     0.00441                     0.00378 
##           HTF-Microbi.Array        Non-quantitative PCR 
##                     0.00252                     0.00252

Statistical analysis

Number of experiments for the top 10 categories for each statistical analysis column:

stat.tab <- lapply(stat.cols, tabCol, df = exps, n = 10)
names(stat.tab) <- stat.cols
stat.tab
## $`Statistical test`
## 
##                        LEfSe      Mann-Whitney (Wilcoxon) 
##                          456                          439 
##                       DESeq2              Kruskall-Wallis 
##                          108                          103 
##                       T-Test                        ANOVA 
##                           86                           68 
##            Linear Regression                    PERMANOVA 
##                           47                           34 
## Negative Binomial Regression                    Metastats 
##                           31                           28 
## 
## $`MHT correction`
## 
##  TRUE FALSE 
##   753   716 
## 
## $`Significance threshold`
## 
##  0.05   0.1  0.01 0.001  0.15   0.2 0.005     0  0.25 1e-11 
##  1401    62    26    21    14    13     4     3     3     1

Proportions instead:

stat.tab <- lapply(stat.cols, tabCol, df = exps, n = 10, perc = TRUE)
names(stat.tab) <- stat.cols
stat.tab
## $`Statistical test`
## 
##                        LEfSe      Mann-Whitney (Wilcoxon) 
##                       0.2870                       0.2770 
##                       DESeq2              Kruskall-Wallis 
##                       0.0681                       0.0649 
##                       T-Test                        ANOVA 
##                       0.0542                       0.0428 
##            Linear Regression                    PERMANOVA 
##                       0.0296                       0.0214 
## Negative Binomial Regression                    Metastats 
##                       0.0195                       0.0176 
## 
## $`MHT correction`
## 
##  TRUE FALSE 
## 0.513 0.487 
## 
## $`Significance threshold`
## 
##     0.05      0.1     0.01    0.001     0.15      0.2    0.005        0 
## 0.901000 0.039900 0.016700 0.013500 0.009000 0.008360 0.002570 0.001930 
##     0.25    1e-11 
## 0.001930 0.000643

Alpha diversity

Overall distribution:

apply(exps[,div.cols], 2, table)
##           Pielou Shannon Chao1 Simpson Inverse Simpson Richness
## decreased     18     220   127      69              15      112
## increased     12     158    74      40              14      118
## unchanged     41     569   326     218              40      296

Correspondence of Shannon diversity and Richness:

table(exps$Shannon, exps$Richness)
##            
##             decreased increased unchanged
##   decreased        53         6        23
##   increased         1        45        17
##   unchanged        21        30       220

Conditions with consistently increased or decreased alpha diversity:

tabDiv(exps, "Shannon", "Condition")
##                                              increased decreased unchanged
## COVID-19                                             5        15        33
## obesity                                              3        13        36
## gastric cancer                                       3        12        14
## HIV infection                                        1        10        10
## human papilloma virus infection                      6         0        24
## lung cancer                                          2         8         5
## cesarean section                                     5         0        14
## Crohn's disease                                      0         5         2
## acute lymphoblastic leukemia                         0         4         4
## antimicrobial agent                                  5         9        22
## cervical cancer                                      4         0         4
## hypertension                                         4         0         2
## age                                                  2         5         2
## alcohol drinking                                     3         0         2
## Alzheimer's disease                                  0         3         2
## atopic asthma                                        4         1         7
## Parkinson's disease                                  6         3        12
## periodontitis                                        3         0         3
## squamous cell carcinoma                              3         0         3
## cervical glandular intraepithelial neoplasia         2         0         9
## colorectal cancer                                    6         8        24
## diet                                                 6         4        14
## Eczema                                               0         2        10
## ethnic group                                         4         6         7
## food allergy                                         0         2        10
## schizophrenia                                        1         3         7
## smoking behavior                                     3         1        14
## socioeconomic status                                 4         2         4
## acute myeloid leukemia                               2         3         2
## air pollution                                        7         6         3
## asthma                                               1         0        11
## atopic eczema                                        2         3        17
## chronic fatigue syndrome                             0         1         4
## chronic hepatitis B virus infection                  0         1         5
## endometriosis                                        2         3        17
## esophageal cancer                                    1         2         2
## milk allergic reaction                               1         0         5
## multiple sclerosis                                   0         1         7
## pancreatic carcinoma                                 0         1         4
## Response to immunochemotherapy                       2         1         3
## arthritis                                            1         1         5
## colorectal adenoma                                   2         2         5
## gestational diabetes                                 0         0         5
## hepatocellular carcinoma                             0         0         6
## HIV mother to child transmission                     0         0         5
## irritable bowel syndrome                             0         0         6
## obsessive-compulsive disorder                        0         0         5
## rheumatoid arthritis                                 3         3         4
## type II diabetes mellitus                            2         2         9
tabDiv(exps, "Shannon", "Condition", perc = TRUE)
##                                              increased decreased unchanged
## COVID-19                                         0.094     0.280      0.62
## obesity                                          0.058     0.250      0.69
## gastric cancer                                   0.100     0.410      0.48
## HIV infection                                    0.048     0.480      0.48
## human papilloma virus infection                  0.200     0.000      0.80
## lung cancer                                      0.130     0.530      0.33
## cesarean section                                 0.260     0.000      0.74
## Crohn's disease                                  0.000     0.710      0.29
## acute lymphoblastic leukemia                     0.000     0.500      0.50
## antimicrobial agent                              0.140     0.250      0.61
## cervical cancer                                  0.500     0.000      0.50
## hypertension                                     0.670     0.000      0.33
## age                                              0.220     0.560      0.22
## alcohol drinking                                 0.600     0.000      0.40
## Alzheimer's disease                              0.000     0.600      0.40
## atopic asthma                                    0.330     0.083      0.58
## Parkinson's disease                              0.290     0.140      0.57
## periodontitis                                    0.500     0.000      0.50
## squamous cell carcinoma                          0.500     0.000      0.50
## cervical glandular intraepithelial neoplasia     0.180     0.000      0.82
## colorectal cancer                                0.160     0.210      0.63
## diet                                             0.250     0.170      0.58
## Eczema                                           0.000     0.170      0.83
## ethnic group                                     0.240     0.350      0.41
## food allergy                                     0.000     0.170      0.83
## schizophrenia                                    0.091     0.270      0.64
## smoking behavior                                 0.170     0.056      0.78
## socioeconomic status                             0.400     0.200      0.40
## acute myeloid leukemia                           0.290     0.430      0.29
## air pollution                                    0.440     0.380      0.19
## asthma                                           0.083     0.000      0.92
## atopic eczema                                    0.091     0.140      0.77
## chronic fatigue syndrome                         0.000     0.200      0.80
## chronic hepatitis B virus infection              0.000     0.170      0.83
## endometriosis                                    0.091     0.140      0.77
## esophageal cancer                                0.200     0.400      0.40
## milk allergic reaction                           0.170     0.000      0.83
## multiple sclerosis                               0.000     0.120      0.88
## pancreatic carcinoma                             0.000     0.200      0.80
## Response to immunochemotherapy                   0.330     0.170      0.50
## arthritis                                        0.140     0.140      0.71
## colorectal adenoma                               0.220     0.220      0.56
## gestational diabetes                             0.000     0.000      1.00
## hepatocellular carcinoma                         0.000     0.000      1.00
## HIV mother to child transmission                 0.000     0.000      1.00
## irritable bowel syndrome                         0.000     0.000      1.00
## obsessive-compulsive disorder                    0.000     0.000      1.00
## rheumatoid arthritis                             0.300     0.300      0.40
## type II diabetes mellitus                        0.150     0.150      0.69
tabDiv(exps, "Richness", "Condition")
##                                              increased decreased unchanged
## COVID-19                                             4        15        17
## air pollution                                       15         5         3
## Parkinson's disease                                  9         0         2
## antimicrobial agent                                  1         8         5
## alcohol drinking                                     5         0         0
## HIV infection                                        0         5         8
## acute lymphoblastic leukemia                         5         1         0
## cervical glandular intraepithelial neoplasia         4         0         2
## atopic asthma                                        4         1         7
## food allergy                                         0         3         9
## gastric cancer                                       3         6        13
## colorectal adenoma                                   0         2         8
## diet                                                 3         1         6
## endometriosis                                        3         1        15
## asthma                                               1         0        10
## atopic eczema                                        2         1         7
## colorectal cancer                                    6         5         9
## ethnic group                                         3         2         1
## human papilloma virus infection                      2         1        13
## irritable bowel syndrome                             0         1         7
## lung cancer                                          0         1         6
## obesity                                              6         7        18
## obsessive-compulsive disorder                        0         1         4
## schizophrenia                                        1         2         2
## cesarean section                                     2         2         9
## HIV mother to child transmission                     0         0         5
## multiple sclerosis                                   0         0         9
## smoking behavior                                     1         1         8
## socioeconomic status                                 2         2         1
tabDiv(exps, "Richness", "Condition", perc = TRUE)
##                                              increased decreased unchanged
## COVID-19                                         0.110     0.420      0.47
## air pollution                                    0.650     0.220      0.13
## Parkinson's disease                              0.820     0.000      0.18
## antimicrobial agent                              0.071     0.570      0.36
## alcohol drinking                                 1.000     0.000      0.00
## HIV infection                                    0.000     0.380      0.62
## acute lymphoblastic leukemia                     0.830     0.170      0.00
## cervical glandular intraepithelial neoplasia     0.670     0.000      0.33
## atopic asthma                                    0.330     0.083      0.58
## food allergy                                     0.000     0.250      0.75
## gastric cancer                                   0.140     0.270      0.59
## colorectal adenoma                               0.000     0.200      0.80
## diet                                             0.300     0.100      0.60
## endometriosis                                    0.160     0.053      0.79
## asthma                                           0.091     0.000      0.91
## atopic eczema                                    0.200     0.100      0.70
## colorectal cancer                                0.300     0.250      0.45
## ethnic group                                     0.500     0.330      0.17
## human papilloma virus infection                  0.120     0.062      0.81
## irritable bowel syndrome                         0.000     0.120      0.88
## lung cancer                                      0.000     0.140      0.86
## obesity                                          0.190     0.230      0.58
## obsessive-compulsive disorder                    0.000     0.200      0.80
## schizophrenia                                    0.200     0.400      0.40
## cesarean section                                 0.150     0.150      0.69
## HIV mother to child transmission                 0.000     0.000      1.00
## multiple sclerosis                               0.000     0.000      1.00
## smoking behavior                                 0.100     0.100      0.80
## socioeconomic status                             0.400     0.400      0.20

Body sites with consistently increased or decreased alpha diversity:

tabDiv(exps, "Shannon", "Body site")
##                            increased decreased unchanged
## Feces                             69       140       340
## Mouth                              9         2        11
## Stomach                            3         9         5
## Vagina                             9         3        10
## Meconium                           5         0         7
## Subgingival dental plaque          7         2         7
## Tongue                             0         5         8
## Posterior fornix of vagina         4         0         3
## Caecum                             1         4         1
## Dental plaque                      0         3         3
## Skin of body                       4         7         6
## Uterine cervix                     3         0        23
## Nasopharynx                        1         3        18
## Oral cavity                        3         1         2
## Saliva                            15        13        38
## Breast                             3         2         0
## Lung                               1         2         6
## Oropharynx                         1         2         3
## Rectum                             0         1        12
## Uterus                             0         1         7
## Blood                              0         0         5
## Bronchus                           0         0         6
## Intestine                          1         1         6
tabDiv(exps, "Shannon", "Body site", perc = TRUE)
##                            increased decreased unchanged
## Feces                          0.130     0.260      0.62
## Mouth                          0.410     0.091      0.50
## Stomach                        0.180     0.530      0.29
## Vagina                         0.410     0.140      0.45
## Meconium                       0.420     0.000      0.58
## Subgingival dental plaque      0.440     0.120      0.44
## Tongue                         0.000     0.380      0.62
## Posterior fornix of vagina     0.570     0.000      0.43
## Caecum                         0.170     0.670      0.17
## Dental plaque                  0.000     0.500      0.50
## Skin of body                   0.240     0.410      0.35
## Uterine cervix                 0.120     0.000      0.88
## Nasopharynx                    0.045     0.140      0.82
## Oral cavity                    0.500     0.170      0.33
## Saliva                         0.230     0.200      0.58
## Breast                         0.600     0.400      0.00
## Lung                           0.110     0.220      0.67
## Oropharynx                     0.170     0.330      0.50
## Rectum                         0.000     0.077      0.92
## Uterus                         0.000     0.120      0.88
## Blood                          0.000     0.000      1.00
## Bronchus                       0.000     0.000      1.00
## Intestine                      0.120     0.120      0.75
tabDiv(exps, "Richness", "Body site")
##                            increased decreased unchanged
## Colon                              9         0         1
## Feces                             55        61       157
## Mouth                              7         1         5
## Oropharynx                         0         6         3
## Uterine cervix                     6         0        16
## Stomach                            2         7         3
## Posterior fornix of vagina         4         0         2
## Subgingival dental plaque          4         0         4
## Rectum                             0         3         7
## Nasopharynx                        3         5        10
## Saliva                             5         7        18
## Tongue                             0         2         5
## Vagina                             3         1        11
## Caecum                             2         3         0
## Meconium                           1         2         6
## Bronchus                           0         0         6
## Intestine                          0         0         7
tabDiv(exps, "Richness", "Body site", perc = TRUE)
##                            increased decreased unchanged
## Colon                           0.90     0.000      0.10
## Feces                           0.20     0.220      0.58
## Mouth                           0.54     0.077      0.38
## Oropharynx                      0.00     0.670      0.33
## Uterine cervix                  0.27     0.000      0.73
## Stomach                         0.17     0.580      0.25
## Posterior fornix of vagina      0.67     0.000      0.33
## Subgingival dental plaque       0.50     0.000      0.50
## Rectum                          0.00     0.300      0.70
## Nasopharynx                     0.17     0.280      0.56
## Saliva                          0.17     0.230      0.60
## Tongue                          0.00     0.290      0.71
## Vagina                          0.20     0.067      0.73
## Caecum                          0.40     0.600      0.00
## Meconium                        0.11     0.220      0.67
## Bronchus                        0.00     0.000      1.00
## Intestine                       0.00     0.000      1.00

Signature stats

sigs <- bugsigdbr::getSignatures(dat, tax.id.type = "metaphlan")

Unique microbes

Number unique microbes contained in the signatures:

(nuniq <- length(unique(unlist(sigs))))
## [1] 2484

Development of unique microbes captured over time:

Microbe set size distribution

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   4.000   7.449   9.000 471.000
gghistogram(lengths(sigs), bins = 30, ylab = "number of signatures",
    xlab = "signature size", fill = "#00AFBB", ggtheme = theme_bw())

sum(lengths(sigs) > 4)
## [1] 1371

Microbe co-occurrence

dat.feces <- subset(dat, `Body site` == "Feces")
cooc.mat <- microbeHeatmap(dat.feces, tax.level = "genus")
## Loading required namespace: safe

antag.mat <- microbeHeatmap(dat.feces, tax.level = "genus", antagonistic = TRUE)

Get the top 20 genera most frequently reported as differentially abundant:

sigs.feces <- getSignatures(dat.feces, tax.id.type = "taxname", 
                            tax.level = "genus", exact.tax.level = FALSE) 
top20 <- sort(table(unlist(sigs.feces)), decreasing = TRUE)[1:20]
top20
## 
##      Bacteroides  Bifidobacterium Faecalibacterium       Prevotella 
##              299              204              199              192 
##    Streptococcus      Clostridium     Ruminococcus        Roseburia 
##              181              163              162              157 
##          Blautia    Lactobacillus  Parabacteroides        Alistipes 
##              153              141              129              122 
##      Coprococcus            Dorea      Eubacterium      Veillonella 
##              115              114              111              110 
##     Enterococcus      Lachnospira      Akkermansia     Anaerostipes 
##              105              102              100               98

Subset heatmaps to the top 20 genera most frequently reported as differentially abundant:

all(names(top20) %in% rownames(cooc.mat))
## [1] TRUE
cooc.mat <- cooc.mat[names(top20), names(top20)]
all(names(top20) %in% rownames(antag.mat))
## [1] TRUE
antag.mat <- antag.mat[names(top20), names(top20)]

Distinguish by direction of abundance change (increased / decreased):

# increased
sub.dat.feces <- subset(dat.feces, `Abundance in Group 1` == "increased")
sigs.feces.up <- getSignatures(sub.dat.feces, tax.id.type = "taxname", 
                               tax.level = "genus", exact.tax.level = FALSE) 
top20.up <- table(unlist(sigs.feces.up))[names(top20)]
top20.up
## 
##      Bacteroides  Bifidobacterium Faecalibacterium       Prevotella 
##              116               84               66              100 
##    Streptococcus      Clostridium     Ruminococcus        Roseburia 
##              111               84               64               48 
##          Blautia    Lactobacillus  Parabacteroides        Alistipes 
##               64               95               63               41 
##      Coprococcus            Dorea      Eubacterium      Veillonella 
##               37               47               47               73 
##     Enterococcus      Lachnospira      Akkermansia     Anaerostipes 
##               86               22               61               34
# decreased
sub.dat.feces <- subset(dat.feces, `Abundance in Group 1` == "decreased")
sigs.feces.down <- getSignatures(sub.dat.feces, tax.id.type = "taxname", 
                                 tax.level = "genus", exact.tax.level = FALSE) 
top20.down <- table(unlist(sigs.feces.down))[names(top20)]
top20.down
## 
##      Bacteroides  Bifidobacterium Faecalibacterium       Prevotella 
##              174              110              127               88 
##    Streptococcus      Clostridium     Ruminococcus        Roseburia 
##               60               74               93              103 
##          Blautia    Lactobacillus  Parabacteroides        Alistipes 
##               83               44               60               75 
##      Coprococcus            Dorea      Eubacterium      Veillonella 
##               72               61               58               32 
##     Enterococcus      Lachnospira      Akkermansia     Anaerostipes 
##               15               74               33               58

Plot the heatmap

# annotation
mat <- matrix(nc = 2, cbind(top20.up, top20.down))
bp <- ComplexHeatmap::anno_barplot(mat, gp = gpar(fill = c("#D55E00", "#0072B2"),
                                                  col = c("#D55E00", "#0072B2")),
                                   height = unit(2, "cm"))
banno <- ComplexHeatmap::HeatmapAnnotation(`Abundance in Group 1` = bp)

lgd_list <- list(
    Legend(labels = c("increased", "decreased"), 
           title = "Abundance in Group 1", 
           type = "grid",
           legend_gp = gpar(col = c("#D55E00", "#0072B2"), fill = c("#D55E00", "#0072B2"))))
                                            
# same direction
# lcm <- sweep(cooc.mat, 2, matrixStats::colMaxs(cooc.mat), FUN = "/")
# we need to dampen the maximum here a bit down,
# otherwise 100% self co-occurrence takes up a large fraction of the colorscale,
sec <- apply(cooc.mat, 2, function(x) sort(x, decreasing = TRUE)[2])
cooc.mat2 <- cooc.mat
for(i in 1:ncol(cooc.mat2)) cooc.mat2[i,i] <- min(cooc.mat2[i,i], 1.4 * sec[i])
lcm <- sweep(cooc.mat2, 2, matrixStats::colMaxs(cooc.mat2), FUN = "/")
col <- circlize::colorRamp2(c(0,1), c("#EEEEEE", "red"))
ht1 <- ComplexHeatmap::Heatmap(lcm,
                               col = col, 
                               name = "Relative frequency (top)",
                               cluster_columns = FALSE, 
                               row_km = 3, 
                               row_title = "same direction", 
                               column_names_rot = 45,
                               row_names_gp = gpar(fontsize = 8),
                               column_names_gp = gpar(fontsize = 8))

# opposite direction
acm <- sweep(antag.mat, 2, matrixStats::colMaxs(antag.mat), FUN = "/")
col <- circlize::colorRamp2(c(0,1), c("#EEEEEE", "blue"))
ht2 <- ComplexHeatmap::Heatmap(acm,
                               col = col, 
                               name = "Relative frequency (bottom)",
                               cluster_columns = FALSE, 
                               row_title = "opposite direction", 
                               row_km = 3, 
                               column_names_rot = 45,
                               row_names_gp = gpar(fontsize = 8),
                               column_names_gp = gpar(fontsize = 8))

# phylum
sfp <- bugsigdbr::getSignatures(dat.feces, tax.id.type = "metaphlan", 
                                tax.level = "genus", exact.tax.level = FALSE) 
sfp20 <- sort(table(unlist(sfp)), decreasing = TRUE)[1:20]
uanno <- bugsigdbr::extractTaxLevel(names(sfp20),
                                    tax.id.type = "taxname",
                                    tax.level = "phylum",
                                    exact.tax.level = FALSE) 
phyla.grid <- seq_along(unique(uanno))
panno <- ComplexHeatmap::HeatmapAnnotation(phylum = uanno)

uanno <- matrix(uanno, nrow = 1)
colnames(uanno) <- names(top20)
pcols <- c("#CC79A7", "#F0E442", "#009E73", "#56B4E9", "#E69F00")
uanno <- ComplexHeatmap::Heatmap(uanno, name = "Phylum",
                                 col = pcols[phyla.grid],
                                 cluster_columns = FALSE,
                                 column_names_rot = 45,
                                 column_names_gp = gpar(fontsize = 8))

# put everything together
ht_list <- ht1 %v% banno %v% ht2 %v% uanno
ComplexHeatmap::draw(ht_list, annotation_legend_list = lgd_list, merge_legend = TRUE)

decorate_annotation("Abundance in Group 1", {
    grid.text("# signatures", x = unit(-1, "cm"), rot = 90, just = "bottom", gp = gpar(fontsize = 8))
    grid.text("*", x = unit(2.45, "cm"), y = unit(1.2, "cm"))
    grid.text("*", x = unit(5.18, "cm"), y = unit(1, "cm"))
    grid.text("*", x = unit(6.55, "cm"), y = unit(0.95, "cm"))
    grid.text("*", x = unit(8.6, "cm"), y = unit(0.85, "cm"))
    grid.text("*", x = unit(10, "cm"), y = unit(0.7, "cm"))
    grid.text("*", x = unit(10.7, "cm"), y = unit(0.7, "cm"))
})

Signature similarity

Jaccard index

Inspect signature similarity for signatures from stomach samples based on Jaccard index:

stomachsub <- subset(dat, `Body site` == "Stomach")
sigsub <- bugsigdbr::getSignatures(stomachsub)
pair.jsim <- calcJaccardSimilarity(sigsub)

Create a dendrogram of Jaccard dissimilarities (1.0 has no overlap, 0.0 are identical signatures).

jdist <- as.dist(1 - pair.jsim)
plot(hclust(jdist))