Reading data

Get bulk export from bugsigdb.org:

full.dat <- bugsigdbr::importBugSigDB(version = "devel", cache = FALSE)
dim(full.dat)
## [1] 2555   48
colnames(full.dat)
##  [1] "Study"                      "Study design"              
##  [3] "PMID"                       "DOI"                       
##  [5] "URL"                        "Authors list"              
##  [7] "Title"                      "Journal"                   
##  [9] "Year"                       "Experiment"                
## [11] "Location of subjects"       "Host species"              
## [13] "Body site"                  "UBERON ID"                 
## [15] "Condition"                  "EFO ID"                    
## [17] "Group 0 name"               "Group 1 name"              
## [19] "Group 1 definition"         "Group 0 sample size"       
## [21] "Group 1 sample size"        "Antibiotics exclusion"     
## [23] "Sequencing type"            "16S variable region"       
## [25] "Sequencing platform"        "Statistical test"          
## [27] "Significance threshold"     "MHT correction"            
## [29] "LDA Score above"            "Matched on"                
## [31] "Confounders controlled for" "Pielou"                    
## [33] "Shannon"                    "Chao1"                     
## [35] "Simpson"                    "Inverse Simpson"           
## [37] "Richness"                   "Signature page name"       
## [39] "Source"                     "Curated date"              
## [41] "Curator"                    "Revision editor"           
## [43] "Description"                "Abundance in Group 1"      
## [45] "MetaPhlAn taxon names"      "NCBI Taxonomy IDs"         
## [47] "State"                      "Reviewer"

Stripping illformed entries:

is.study <- grepl("^Study [0-9]+$", full.dat[["Study"]])
is.exp <- grepl("^Experiment [0-9]+$", full.dat[["Experiment"]])
full.dat <- full.dat[is.study & is.exp, ]

Curation output

Number of papers and signatures curated:

pmids <- unique(full.dat[,"PMID"])
length(pmids)
## [1] 640
nrow(full.dat)
## [1] 2555

Publication date of the curated papers:

pmids <- pmids[!is.na(pmids)]
pubyear1 <- pmid2pubyear(pmids[1:361])
pubyear2 <- pmid2pubyear(pmids[362:length(pmids)])
pubyear <- c(pubyear1, pubyear2)
head(cbind(pmids, pubyear))
##      pmids      pubyear
## [1,] "28038683" "2016" 
## [2,] "28173873" "2017" 
## [3,] "27015276" "2016" 
## [4,] "27625705" "2016" 
## [5,] "23071781" "2012" 
## [6,] "28467925" "2017"
tab <- table(pubyear)
tab <- tab[-length(tab)]
tab <- tab[order(as.integer(names(tab)))]
df <- data.frame(year = names(tab), papers = as.integer(tab))
ggbarplot(df, x = "year", y = "papers", 
          label = TRUE, fill = "steelblue",
          ggtheme = theme_bw())

Stripping empty signatures:

ind1 <- lengths(full.dat[["MetaPhlAn taxon names"]]) > 0
ind2 <- lengths(full.dat[["NCBI Taxonomy IDs"]]) > 0
dat <- full.dat[ind1 & ind2,]
nrow(dat)
## [1] 2555

Papers containing only empty UP and DOWN signatures (under curation?):

setdiff(pmids, unique(dat[,"PMID"]))
## numeric(0)

Progress over time:

dat[,"Curated date"] <- as.character(lubridate::dmy(dat[,"Curated date"]))
plotProgressOverTime(dat)

plotProgressOverTime(dat, diff = TRUE)

Stratified by curator:

npc <- stratifyByCurator(dat)
plotCuratorStats(dat, npc)

Number of complete and revised signatures:

table(df[["State"]])
## < table of extent 0 >
table(dat[,"Revision editor"])
## 
##                                            Chloe 
##                                                4 
##                                     Claregrieve1 
##                                              140 
##                               Claregrieve1,Chloe 
##                                                1 
##                  Claregrieve1,Chloe,WikiWorks743 
##                                                1 
##                              Claregrieve1,Fatima 
##                                               11 
##                            Claregrieve1,Lwaldron 
##                                                1 
##                                 Cynthia Anderson 
##                                               28 
##                    Cynthia Anderson,Claregrieve1 
##                                                2 
##                          Cynthia Anderson,Fatima 
##                                                2 
##                                           Fatima 
##                                               16 
##                              Fatima,Claregrieve1 
##                                                2 
##                                         Fcuevas3 
##                                               37 
##                            Fcuevas3,Claregrieve1 
##                                                3 
##                                  Fcuevas3,Fatima 
##                                                1 
##                                  Fcuevas3,Rimsha 
##                                                8 
##                                             Gina 
##                                               14 
##                                         Haoyanzh 
##                                               20 
##                                Haoyanzh,Lwaldron 
##                                                2 
##                                      Itslanapark 
##                                               35 
##                                Itslanapark,Chloe 
##                                                2 
##                         Itslanapark,Claregrieve1 
##                                                4 
##                         Itslanapark,Fatima,Chloe 
##                                                1 
##                               Itslanapark,Rimsha 
##                                                1 
##                                          Jeshudy 
##                                               77 
##                             Jeshudy,Claregrieve1 
##                                                3 
##                                   Jeshudy,Fatima 
##                                                4 
##                                          Joyessa 
##                                                4 
##                             Joyessa,Claregrieve1 
##                                               19 
##                      Joyessa,Fatima,Claregrieve1 
##                                                2 
##                                   Kaluifeanyi101 
##                                               63 
##                      Kaluifeanyi101,Claregrieve1 
##                                                9 
##                            Kaluifeanyi101,Fatima 
##                                                2 
##                                        Kwekuamoo 
##                                               29 
##                                  Kwekuamoo,Chloe 
##                                                1 
##                                    Lorakasselman 
##                                                2 
##                              Lorakasselman,Chloe 
##                                                2 
##                                         Lwaldron 
##                                               10 
##                                    Madhubani Dey 
##                                               28 
##                              Madhubani Dey,Chloe 
##                                                1 
##                       Madhubani Dey,Claregrieve1 
##                                               14 
##                Madhubani Dey,Fatima,Claregrieve1 
##                                                2 
##                           Madhubani Dey,Lwaldron 
##                                                1 
##                                          Manuela 
##                                               11 
##                                   Mary Bearkland 
##                                               23 
##                      Mary Bearkland,Claregrieve1 
##                                               10 
##                            Mary Bearkland,Fatima 
##                                                3 
##                              Maryemzaki,Lwaldron 
##                                                2 
##                                           Mmarin 
##                                               21 
##                              Mmarin,Claregrieve1 
##                                               13 
##                                           Rimsha 
##                                                4 
##                                  Rimsha,Lwaldron 
##                                                1 
##                                      Samara.Khan 
##                                               41 
##                         Samara.Khan,Claregrieve1 
##                                                7 
##                               Samara.Khan,Fatima 
##                                                1 
##                                        Sharmilac 
##                                                9 
##                                 Sharmilac,Fatima 
##                                                7 
##                                           Tislam 
##                                               25 
##                              Tislam,Claregrieve1 
##                                                3 
##                                    Tislam,Fatima 
##                                                8 
##                       Tislam,Fatima,Claregrieve1 
##                                                1 
##                       Tislam,Rimsha,Claregrieve1 
##                                                2 
##                                            Titas 
##                                               11 
##                                     Titas,Fatima 
##                                                1 
##                                   Titas,Lwaldron 
##                                                1 
##                                        Valentina 
##                                                2 
##                                 WikiWorks,Fatima 
##                                                2 
##                                WikiWorks,Jeshudy 
##                                                1 
##                                     WikiWorks743 
##                                             1309 
##                               WikiWorks743,Chloe 
##                                                4 
##                        WikiWorks743,Claregrieve1 
##                                              196 
##       WikiWorks743,Cynthia Anderson,LGeistlinger 
##                                                2 
##           WikiWorks743,Cynthia Anderson,Lwaldron 
##                                                1 
##                              WikiWorks743,Fatima 
##                                               49 
##                 WikiWorks743,Fatima,Claregrieve1 
##                                                9 
##                    WikiWorks743,Fatima,Kwekuamoo 
##                                                2 
##                     WikiWorks743,Fatima,Lwaldron 
##                                                3 
##                        WikiWorks743,KathyWaldron 
##                                                4 
##                           WikiWorks743,Kwekuamoo 
##                                                1 
##                            WikiWorks743,Lwaldron 
##                                               32 
##               WikiWorks743,Lwaldron,Claregrieve1 
##                                                7 
##                     WikiWorks743,Lwaldron,Fatima 
##                                                1 
##                       WikiWorks743,Rimsha,Fatima 
##                                                1 
##          WikiWorks743,Rimsha,Fatima,LGeistlinger 
##                                                1 
##                        WikiWorks743,WikiWorks753 
##                                                1 
## WikiWorks743,WikiWorks753,WikiWorks,Claregrieve1 
##                                                1 
##                                   Yu Wang,Fatima 
##                                                6

Study stats

Study design

spl <- split(dat[["Study"]], dat[["Study design"]])
sds <- lapply(spl, unique)
sort(lengths(sds), decreasing = FALSE)
##                 case-control,prospective cohort 
##                                               1 
##                      case-control,meta-analysis 
##                                               5 
##                                   meta-analysis 
##                                               5 
##                     randomized controlled trial 
##                                              28 
##                           laboratory experiment 
##                                              29 
##        time series / longitudinal observational 
##                                              56 
##                              prospective cohort 
##                                              63 
## cross-sectional observational, not case-control 
##                                             179 
##                                    case-control 
##                                             288

Experiment stats

Columns of the full dataset that describe experiments:

# Experiment ID
exp.cols <- c("Study", "Experiment")

# Subjects
sub.cols <- c("Host species",    
              "Location of subjects", 
              "Body site",
              "Condition", 
              "Antibiotics exclusion",
              "Group 0 sample size",
              "Group 1 sample size")

# Lab analysis              
lab.cols <-  c("Sequencing type",
              "16S variable region",
              "Sequencing platform")

# Statistical analysis
stat.cols <-  c("Statistical test",
              "MHT correction",
              "Significance threshold")

# Alpha diversity
div.cols <- c("Pielou",
              "Shannon",
              "Chao1",
              "Simpson", 
              "Inverse Simpson",
              "Richness")

Restrict dataset to experiment information:

exps <- dat[,c(exp.cols, sub.cols, lab.cols, stat.cols, div.cols)]
exps <- unique(exps)

Subjects

Number of experiments for the top 10 categories for each subjects column:

sub.tab <- lapply(sub.cols[1:5], tabCol, df = exps, n = 10)
names(sub.tab) <- sub.cols[1:5]
sub.tab
## $`Host species`
## 
##      Homo sapiens      Mus musculus Rattus norvegicus 
##              1410                62                 6 
## 
## $`Location of subjects`
## 
##                    China United States of America                    Italy 
##                      462                      381                       62 
##                    Spain              South Korea                    Japan 
##                       50                       44                       42 
##                  Finland                   Canada              Netherlands 
##                       35                       28                       27 
##                   Brazil 
##                       25 
## 
## $`Body site`
## 
##          Feces         Saliva         Vagina Uterine cervix       Meconium 
##            867             51             47             40             29 
##   Skin of body          Mouth      Intestine    Nasopharynx        Stomach 
##             29             28             27             26             22 
## 
## $Condition
## 
##                                      obesity 
##                                          104 
##                                     COVID-19 
##                                           89 
##                          antimicrobial agent 
##                                           78 
##                            colorectal cancer 
##                                           77 
##                                         diet 
##                                           56 
##              human papilloma virus infection 
##                                           42 
## cervical glandular intraepithelial neoplasia 
##                                           38 
##                               gastric cancer 
##                                           35 
##                             cesarean section 
##                                           34 
##                                endometriosis 
##                                           33 
## 
## $`Antibiotics exclusion`
## 
##                 3 months                  1 month                 6 months 
##                      185                      113                       74 
##                 2 months                  4 weeks                  2 weeks 
##                       57                       43                       30 
##           None specified            Not specified                  8 weeks 
##                       14                       12                       11 
## before sample collection 
##                       10

Proportions instead:

sub.tab <- lapply(sub.cols[1:5], tabCol, df = exps, n = 10, perc = TRUE)
names(sub.tab) <- sub.cols[1:5]
sub.tab
## $`Host species`
## 
##      Homo sapiens      Mus musculus Rattus norvegicus 
##           0.95400           0.04190           0.00406 
## 
## $`Location of subjects`
## 
##                    China United States of America                    Italy 
##                   0.3140                   0.2590                   0.0421 
##                    Spain              South Korea                    Japan 
##                   0.0340                   0.0299                   0.0286 
##                  Finland                   Canada              Netherlands 
##                   0.0238                   0.0190                   0.0184 
##                   Brazil 
##                   0.0170 
## 
## $`Body site`
## 
##          Feces         Saliva         Vagina Uterine cervix       Meconium 
##         0.5880         0.0346         0.0319         0.0271         0.0197 
##   Skin of body          Mouth      Intestine    Nasopharynx        Stomach 
##         0.0197         0.0190         0.0183         0.0176         0.0149 
## 
## $Condition
## 
##                                      obesity 
##                                       0.0712 
##                                     COVID-19 
##                                       0.0609 
##                          antimicrobial agent 
##                                       0.0534 
##                            colorectal cancer 
##                                       0.0527 
##                                         diet 
##                                       0.0383 
##              human papilloma virus infection 
##                                       0.0287 
## cervical glandular intraepithelial neoplasia 
##                                       0.0260 
##                               gastric cancer 
##                                       0.0240 
##                             cesarean section 
##                                       0.0233 
##                                endometriosis 
##                                       0.0226 
## 
## $`Antibiotics exclusion`
## 
##                 3 months                  1 month                 6 months 
##                   0.2020                   0.1230                   0.0807 
##                 2 months                  4 weeks                  2 weeks 
##                   0.0622                   0.0469                   0.0327 
##           None specified            Not specified                  8 weeks 
##                   0.0153                   0.0131                   0.0120 
## before sample collection 
##                   0.0109

Sample size:

ssize <- apply(exps[,sub.cols[6:7]], 2, summary)
ssize
##         Group 0 sample size Group 1 sample size
## Min.                0.00000             1.00000
## 1st Qu.            13.00000            12.00000
## Median             26.00000            24.00000
## Mean               59.01047            76.81128
## 3rd Qu.            50.00000            45.00000
## Max.             1883.00000          9623.00000
## NA's               50.00000            47.00000

Lab analysis

Number of experiments for the top 10 categories for each lab analysis column:

lab.tab <- lapply(lab.cols, tabCol, df = exps, n = 10)
names(lab.tab) <- lab.cols
lab.tab
## $`Sequencing type`
## 
##  16S  WMS 
## 1346  122 
## 
## $`16S variable region`
## 
##   34    4  123   12  345   45    3 1234   56   23 
##  438  336  112   76   57   45   38   20   18   11 
## 
## $`Sequencing platform`
## 
##                    Illumina                    Roche454 
##                         997                         224 
##                 Ion Torrent                     RT-qPCR 
##                          88                          82 
## Human Intestinal Tract Chip                 MGISEQ-2000 
##                          12                          11 
##                      Sanger           Mass spectrometry 
##                           7                           5 
##           HTF-Microbi.Array       BGISEQ-500 Sequencing 
##                           4                           3

Proportions instead:

lab.tab <- lapply(lab.cols, tabCol, df = exps, n = 10, perc = TRUE)
names(lab.tab) <- lab.cols
lab.tab
## $`Sequencing type`
## 
##    16S    WMS 
## 0.9170 0.0831 
## 
## $`16S variable region`
## 
##      34       4     123      12     345      45       3    1234      56      23 
## 0.36800 0.28300 0.09420 0.06390 0.04790 0.03780 0.03200 0.01680 0.01510 0.00925 
## 
## $`Sequencing platform`
## 
##                    Illumina                    Roche454 
##                     0.68800                     0.15400 
##                 Ion Torrent                     RT-qPCR 
##                     0.06070                     0.05660 
## Human Intestinal Tract Chip                 MGISEQ-2000 
##                     0.00828                     0.00759 
##                      Sanger           Mass spectrometry 
##                     0.00483                     0.00345 
##           HTF-Microbi.Array       BGISEQ-500 Sequencing 
##                     0.00276                     0.00207

Statistical analysis

Number of experiments for the top 10 categories for each statistical analysis column:

stat.tab <- lapply(stat.cols, tabCol, df = exps, n = 10)
names(stat.tab) <- stat.cols
stat.tab
## $`Statistical test`
## 
##      Mann-Whitney (Wilcoxon)                        LEfSe 
##                          418                          400 
##              Kruskall-Wallis                       DESeq2 
##                           98                           86 
##                       T-Test                        ANOVA 
##                           84                           69 
##            Linear Regression                    PERMANOVA 
##                           42                           27 
##                    Metastats Negative Binomial Regression 
##                           25                           21 
## 
## $`MHT correction`
## 
## FALSE  TRUE 
##   671   652 
## 
## $`Significance threshold`
## 
##  0.05   0.1  0.01 0.001   0.2  0.15 0.005     0  0.25 1e-11 
##  1286    59    28    14    13    10     4     3     3     1

Proportions instead:

stat.tab <- lapply(stat.cols, tabCol, df = exps, n = 10, perc = TRUE)
names(stat.tab) <- stat.cols
stat.tab
## $`Statistical test`
## 
##      Mann-Whitney (Wilcoxon)                        LEfSe 
##                       0.2910                       0.2780 
##              Kruskall-Wallis                       DESeq2 
##                       0.0682                       0.0598 
##                       T-Test                        ANOVA 
##                       0.0585                       0.0480 
##            Linear Regression                    PERMANOVA 
##                       0.0292                       0.0188 
##                    Metastats Negative Binomial Regression 
##                       0.0174                       0.0146 
## 
## $`MHT correction`
## 
## FALSE  TRUE 
## 0.507 0.493 
## 
## $`Significance threshold`
## 
##   0.05    0.1   0.01  0.001    0.2   0.15  0.005      0   0.25  1e-11 
## 0.9010 0.0413 0.0196 0.0098 0.0091 0.0070 0.0028 0.0021 0.0021 0.0007

Alpha diversity

Overall distribution:

apply(exps[,div.cols], 2, table)
##           Pielou Shannon Chao1 Simpson Inverse Simpson Richness
## decreased     18     180   114      55              15      101
## increased     10     149    73      44              11      105
## unchanged     37     522   279     198              36      270

Correspondence of Shannon diversity and Richness:

table(exps$Shannon, exps$Richness)
##            
##             decreased increased unchanged
##   decreased        44         4        17
##   increased         2        42        16
##   unchanged        18        28       203

Conditions with consistently increased or decreased alpha diversity:

tabDiv(exps, "Shannon", "Condition")
##                                              increased decreased unchanged
## COVID-19                                             4        16        33
## gastric cancer                                       3        12        14
## human papilloma virus infection                      6         0        25
## lung cancer                                          2         8         5
## Parkinson's disease                                  6         0         5
## cesarean section                                     5         0        14
## cervical cancer                                      4         0         4
## HIV infection                                        1         5        10
## hypertension                                         4         0         2
## obesity                                              3         7        39
## acute lymphoblastic leukemia                         0         3         4
## age                                                  2         5         2
## antimicrobial agent                                  6         9        21
## atopic asthma                                        4         1         7
## Ethnic group                                         2         5         7
## periodontitis                                        3         0         2
## socioeconomic status                                 5         2         4
## squamous cell carcinoma                              3         0         3
## adenoma                                              0         2         3
## cervical glandular intraepithelial neoplasia         2         0         9
## colorectal cancer                                    6         4        24
## diet                                                 6         4        14
## Eczema                                               0         2         9
## food allergy                                         0         2        10
## rheumatoid arthritis                                 1         3         1
## air pollution                                        7         6         3
## asthma                                               1         0        11
## atopic eczema                                        2         3        17
## chronic fatigue syndrome                             0         1         4
## chronic hepatitis b virus infection                  0         1         5
## endometriosis                                        2         3        15
## esophageal cancer                                    1         2         2
## milk allergic reaction                               1         0         5
## multiple sclerosis                                   0         1         7
## pancreatic carcinoma                                 0         1         5
## schizophrenia                                        1         2         6
## smoking behavior                                     0         1         6
## Arthritis                                            1         1         4
## gestational diabetes                                 0         0         5
## irritable bowel syndrome                             0         0         5
## type II diabetes mellitus                            2         2         9
tabDiv(exps, "Shannon", "Condition", perc = TRUE)
##                                              increased decreased unchanged
## COVID-19                                         0.075     0.300      0.62
## gastric cancer                                   0.100     0.410      0.48
## human papilloma virus infection                  0.190     0.000      0.81
## lung cancer                                      0.130     0.530      0.33
## Parkinson's disease                              0.550     0.000      0.45
## cesarean section                                 0.260     0.000      0.74
## cervical cancer                                  0.500     0.000      0.50
## HIV infection                                    0.062     0.310      0.62
## hypertension                                     0.670     0.000      0.33
## obesity                                          0.061     0.140      0.80
## acute lymphoblastic leukemia                     0.000     0.430      0.57
## age                                              0.220     0.560      0.22
## antimicrobial agent                              0.170     0.250      0.58
## atopic asthma                                    0.330     0.083      0.58
## Ethnic group                                     0.140     0.360      0.50
## periodontitis                                    0.600     0.000      0.40
## socioeconomic status                             0.450     0.180      0.36
## squamous cell carcinoma                          0.500     0.000      0.50
## adenoma                                          0.000     0.400      0.60
## cervical glandular intraepithelial neoplasia     0.180     0.000      0.82
## colorectal cancer                                0.180     0.120      0.71
## diet                                             0.250     0.170      0.58
## Eczema                                           0.000     0.180      0.82
## food allergy                                     0.000     0.170      0.83
## rheumatoid arthritis                             0.200     0.600      0.20
## air pollution                                    0.440     0.380      0.19
## asthma                                           0.083     0.000      0.92
## atopic eczema                                    0.091     0.140      0.77
## chronic fatigue syndrome                         0.000     0.200      0.80
## chronic hepatitis b virus infection              0.000     0.170      0.83
## endometriosis                                    0.100     0.150      0.75
## esophageal cancer                                0.200     0.400      0.40
## milk allergic reaction                           0.170     0.000      0.83
## multiple sclerosis                               0.000     0.120      0.88
## pancreatic carcinoma                             0.000     0.170      0.83
## schizophrenia                                    0.110     0.220      0.67
## smoking behavior                                 0.000     0.140      0.86
## Arthritis                                        0.170     0.170      0.67
## gestational diabetes                             0.000     0.000      1.00
## irritable bowel syndrome                         0.000     0.000      1.00
## type II diabetes mellitus                        0.150     0.150      0.69
tabDiv(exps, "Richness", "Condition")
##                                              increased decreased unchanged
## COVID-19                                             4        15        17
## air pollution                                       15         5         3
## antimicrobial agent                                  1         8         7
## Parkinson's disease                                  8         1         0
## acute lymphoblastic leukemia                         5         1         1
## cervical glandular intraepithelial neoplasia         4         0         2
## atopic asthma                                        4         1         7
## colorectal cancer                                    6         3        10
## food allergy                                         0         3         9
## gastric cancer                                       3         6        13
## human papilloma virus infection                      3         0        14
## adenoma                                              0         2         3
## diet                                                 3         1         6
## endometriosis                                        3         1        13
## HIV infection                                        0         2         9
## asthma                                               1         0        10
## atopic eczema                                        2         1         7
## irritable bowel syndrome                             0         1         6
## lung cancer                                          0         1         6
## obesity                                              6         7        21
## cesarean section                                     2         2         9
## Colorectal adenoma                                   0         0         5
## multiple sclerosis                                   0         0         9
## smoking behavior                                     1         1         4
tabDiv(exps, "Richness", "Condition", perc = TRUE)
##                                              increased decreased unchanged
## COVID-19                                         0.110     0.420      0.47
## air pollution                                    0.650     0.220      0.13
## antimicrobial agent                              0.062     0.500      0.44
## Parkinson's disease                              0.890     0.110      0.00
## acute lymphoblastic leukemia                     0.710     0.140      0.14
## cervical glandular intraepithelial neoplasia     0.670     0.000      0.33
## atopic asthma                                    0.330     0.083      0.58
## colorectal cancer                                0.320     0.160      0.53
## food allergy                                     0.000     0.250      0.75
## gastric cancer                                   0.140     0.270      0.59
## human papilloma virus infection                  0.180     0.000      0.82
## adenoma                                          0.000     0.400      0.60
## diet                                             0.300     0.100      0.60
## endometriosis                                    0.180     0.059      0.76
## HIV infection                                    0.000     0.180      0.82
## asthma                                           0.091     0.000      0.91
## atopic eczema                                    0.200     0.100      0.70
## irritable bowel syndrome                         0.000     0.140      0.86
## lung cancer                                      0.000     0.140      0.86
## obesity                                          0.180     0.210      0.62
## cesarean section                                 0.150     0.150      0.69
## Colorectal adenoma                               0.000     0.000      1.00
## multiple sclerosis                               0.000     0.000      1.00
## smoking behavior                                 0.170     0.170      0.67

Body sites with consistently increased or decreased alpha diversity:

tabDiv(exps, "Shannon", "Body site")
##                            increased decreased unchanged
## Feces                             67       103       312
## Meconium                           6         0         8
## Stomach                            3         9         5
## Tongue                             0         6         7
## Vagina                             8         3         8
## Mouth                              6         2        14
## Posterior fornix of vagina         4         0         3
## Saliva                            13         9        25
## Caecum                             1         4         1
## Dental plaque                      0         3         3
## Skin of body                       4         7         6
## Subgingival dental plaque          5         2         4
## Uterine cervix                     3         0        23
## Nasopharynx                        1         3        18
## Breast                             3         2         0
## Lung                               1         2         5
## Uterus                             0         1         7
## Blood                              0         0         5
## Bronchus                           0         0         6
## Intestine                          1         1         6
## Rectum                             0         0        12
tabDiv(exps, "Shannon", "Body site", perc = TRUE)
##                            increased decreased unchanged
## Feces                          0.140     0.210      0.65
## Meconium                       0.430     0.000      0.57
## Stomach                        0.180     0.530      0.29
## Tongue                         0.000     0.460      0.54
## Vagina                         0.420     0.160      0.42
## Mouth                          0.270     0.091      0.64
## Posterior fornix of vagina     0.570     0.000      0.43
## Saliva                         0.280     0.190      0.53
## Caecum                         0.170     0.670      0.17
## Dental plaque                  0.000     0.500      0.50
## Skin of body                   0.240     0.410      0.35
## Subgingival dental plaque      0.450     0.180      0.36
## Uterine cervix                 0.120     0.000      0.88
## Nasopharynx                    0.045     0.140      0.82
## Breast                         0.600     0.400      0.00
## Lung                           0.120     0.250      0.62
## Uterus                         0.000     0.120      0.88
## Blood                          0.000     0.000      1.00
## Bronchus                       0.000     0.000      1.00
## Intestine                      0.120     0.120      0.75
## Rectum                         0.000     0.000      1.00
tabDiv(exps, "Richness", "Body site")
##                            increased decreased unchanged
## Colon                              9         0         1
## Oropharynx                         0         6         1
## Uterine cervix                     6         0        16
## Feces                             49        54       142
## Stomach                            2         7         3
## Posterior fornix of vagina         4         0         2
## Subgingival dental plaque          4         0         3
## Mouth                              3         0         8
## Nasopharynx                        3         5        10
## Rectum                             0         2         7
## Tongue                             0         2         4
## Vagina                             3         1         8
## Caecum                             2         3         0
## Meconium                           1         2         8
## Saliva                             2         1        11
## Bronchus                           0         0         6
## Intestine                          0         0         7
tabDiv(exps, "Richness", "Body site", perc = TRUE)
##                            increased decreased unchanged
## Colon                          0.900     0.000      0.10
## Oropharynx                     0.000     0.860      0.14
## Uterine cervix                 0.270     0.000      0.73
## Feces                          0.200     0.220      0.58
## Stomach                        0.170     0.580      0.25
## Posterior fornix of vagina     0.670     0.000      0.33
## Subgingival dental plaque      0.570     0.000      0.43
## Mouth                          0.270     0.000      0.73
## Nasopharynx                    0.170     0.280      0.56
## Rectum                         0.000     0.220      0.78
## Tongue                         0.000     0.330      0.67
## Vagina                         0.250     0.083      0.67
## Caecum                         0.400     0.600      0.00
## Meconium                       0.091     0.180      0.73
## Saliva                         0.140     0.071      0.79
## Bronchus                       0.000     0.000      1.00
## Intestine                      0.000     0.000      1.00

Signature stats

sigs <- bugsigdbr::getSignatures(dat, tax.id.type = "metaphlan")

Unique microbes

Number unique microbes contained in the signatures:

(nuniq <- length(unique(unlist(sigs))))
## [1] 2292

Development of unique microbes captured over time:

Microbe set size distribution

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    2.00    4.00    7.02    8.00  299.00
gghistogram(lengths(sigs), bins = 30, ylab = "number of signatures",
    xlab = "signature size", fill = "#00AFBB", ggtheme = theme_bw())

sum(lengths(sigs) > 4)
## [1] 1164

Microbe co-occurrence

dat.feces <- subset(dat, `Body site` == "Feces")
cooc.mat <- microbeHeatmap(dat.feces, tax.level = "genus")
## Loading required namespace: safe

antag.mat <- microbeHeatmap(dat.feces, tax.level = "genus", antagonistic = TRUE)

Get the top 20 genera most frequently reported as differentially abundant:

sigs.feces <- getSignatures(dat.feces, tax.id.type = "taxname", 
                            tax.level = "genus", exact.tax.level = FALSE) 
top20 <- sort(table(unlist(sigs.feces)), decreasing = TRUE)[1:20]
top20
## 
##      Bacteroides  Bifidobacterium Faecalibacterium    Streptococcus 
##              267              190              170              161 
##       Prevotella      Clostridium     Ruminococcus          Blautia 
##              158              140              140              136 
##        Roseburia    Lactobacillus  Parabacteroides        Alistipes 
##              133              132              123              111 
##            Dorea      Coprococcus      Veillonella      Eubacterium 
##              104              101               97               93 
##     Enterococcus      Akkermansia     Anaerostipes      Escherichia 
##               90               87               85               84

Subset heatmaps to the top 20 genera most frequently reported as differentially abundant:

all(names(top20) %in% rownames(cooc.mat))
## [1] TRUE
cooc.mat <- cooc.mat[names(top20), names(top20)]
all(names(top20) %in% rownames(antag.mat))
## [1] TRUE
antag.mat <- antag.mat[names(top20), names(top20)]

Distinguish by direction of abundance change (increased / decreased):

# increased
sub.dat.feces <- subset(dat.feces, `Abundance in Group 1` == "increased")
sigs.feces.up <- getSignatures(sub.dat.feces, tax.id.type = "taxname", 
                               tax.level = "genus", exact.tax.level = FALSE) 
top20.up <- table(unlist(sigs.feces.up))[names(top20)]
top20.up
## 
##      Bacteroides  Bifidobacterium Faecalibacterium    Streptococcus 
##              101               78               57               98 
##       Prevotella      Clostridium     Ruminococcus          Blautia 
##               79               72               57               55 
##        Roseburia    Lactobacillus  Parabacteroides        Alistipes 
##               45               91               59               37 
##            Dorea      Coprococcus      Veillonella      Eubacterium 
##               47               36               65               42 
##     Enterococcus      Akkermansia     Anaerostipes      Escherichia 
##               75               49               29               54
# decreased
sub.dat.feces <- subset(dat.feces, `Abundance in Group 1` == "decreased")
sigs.feces.down <- getSignatures(sub.dat.feces, tax.id.type = "taxname", 
                                 tax.level = "genus", exact.tax.level = FALSE) 
top20.down <- table(unlist(sigs.feces.down))[names(top20)]
top20.down
## 
##      Bacteroides  Bifidobacterium Faecalibacterium    Streptococcus 
##              157              102              108               54 
##       Prevotella      Clostridium     Ruminococcus          Blautia 
##               76               63               79               75 
##        Roseburia    Lactobacillus  Parabacteroides        Alistipes 
##               83               39               59               68 
##            Dorea      Coprococcus      Veillonella      Eubacterium 
##               51               59               28               45 
##     Enterococcus      Akkermansia     Anaerostipes      Escherichia 
##               11               32               50               24

Plot the heatmap

# annotation
mat <- matrix(nc = 2, cbind(top20.up, top20.down))
bp <- ComplexHeatmap::anno_barplot(mat, gp = gpar(fill = c("#D55E00", "#0072B2"),
                                                  col = c("#D55E00", "#0072B2")),
                                   height = unit(2, "cm"))
banno <- ComplexHeatmap::HeatmapAnnotation(`Abundance in Group 1` = bp)

lgd_list <- list(
    Legend(labels = c("increased", "decreased"), 
           title = "Abundance in Group 1", 
           type = "grid",
           legend_gp = gpar(col = c("#D55E00", "#0072B2"), fill = c("#D55E00", "#0072B2"))))
                                            
# same direction
# lcm <- sweep(cooc.mat, 2, matrixStats::colMaxs(cooc.mat), FUN = "/")
# we need to dampen the maximum here a bit down,
# otherwise 100% self co-occurrence takes up a large fraction of the colorscale,
sec <- apply(cooc.mat, 2, function(x) sort(x, decreasing = TRUE)[2])
cooc.mat2 <- cooc.mat
for(i in 1:ncol(cooc.mat2)) cooc.mat2[i,i] <- min(cooc.mat2[i,i], 1.4 * sec[i])
lcm <- sweep(cooc.mat2, 2, matrixStats::colMaxs(cooc.mat2), FUN = "/")
col <- circlize::colorRamp2(c(0,1), c("#EEEEEE", "red"))
ht1 <- ComplexHeatmap::Heatmap(lcm,
                               col = col, 
                               name = "Relative frequency (top)",
                               cluster_columns = FALSE, 
                               row_km = 3, 
                               row_title = "same direction", 
                               column_names_rot = 45,
                               row_names_gp = gpar(fontsize = 8),
                               column_names_gp = gpar(fontsize = 8))

# opposite direction
acm <- sweep(antag.mat, 2, matrixStats::colMaxs(antag.mat), FUN = "/")
col <- circlize::colorRamp2(c(0,1), c("#EEEEEE", "blue"))
ht2 <- ComplexHeatmap::Heatmap(acm,
                               col = col, 
                               name = "Relative frequency (bottom)",
                               cluster_columns = FALSE, 
                               row_title = "opposite direction", 
                               row_km = 3, 
                               column_names_rot = 45,
                               row_names_gp = gpar(fontsize = 8),
                               column_names_gp = gpar(fontsize = 8))

# phylum
sfp <- bugsigdbr::getSignatures(dat.feces, tax.id.type = "metaphlan", 
                                tax.level = "genus", exact.tax.level = FALSE) 
sfp20 <- sort(table(unlist(sfp)), decreasing = TRUE)[1:20]
uanno <- bugsigdbr::extractTaxLevel(names(sfp20),
                                    tax.id.type = "taxname",
                                    tax.level = "phylum",
                                    exact.tax.level = FALSE) 
phyla.grid <- seq_along(unique(uanno))
panno <- ComplexHeatmap::HeatmapAnnotation(phylum = uanno)

uanno <- matrix(uanno, nrow = 1)
colnames(uanno) <- names(top20)
pcols <- c("#CC79A7", "#F0E442", "#009E73", "#56B4E9", "#E69F00")
uanno <- ComplexHeatmap::Heatmap(uanno, name = "Phylum",
                                 col = pcols[phyla.grid],
                                 cluster_columns = FALSE,
                                 column_names_rot = 45,
                                 column_names_gp = gpar(fontsize = 8))

# put everything together
ht_list <- ht1 %v% banno %v% ht2 %v% uanno
ComplexHeatmap::draw(ht_list, annotation_legend_list = lgd_list, merge_legend = TRUE)

decorate_annotation("Abundance in Group 1", {
    grid.text("# signatures", x = unit(-1, "cm"), rot = 90, just = "bottom", gp = gpar(fontsize = 8))
    grid.text("*", x = unit(2.45, "cm"), y = unit(1.2, "cm"))
    grid.text("*", x = unit(5.18, "cm"), y = unit(1, "cm"))
    grid.text("*", x = unit(6.55, "cm"), y = unit(0.95, "cm"))
    grid.text("*", x = unit(8.6, "cm"), y = unit(0.85, "cm"))
    grid.text("*", x = unit(10, "cm"), y = unit(0.7, "cm"))
    grid.text("*", x = unit(10.7, "cm"), y = unit(0.7, "cm"))
})

Signature similarity

Jaccard index

Inspect signature similarity for signatures from stomach samples based on Jaccard index:

stomachsub <- subset(dat, `Body site` == "Stomach")
sigsub <- bugsigdbr::getSignatures(stomachsub)
pair.jsim <- calcJaccardSimilarity(sigsub)

Create a dendrogram of Jaccard dissimilarities (1.0 has no overlap, 0.0 are identical signatures).

jdist <- as.dist(1 - pair.jsim)
plot(hclust(jdist))