Get bulk export from bugsigdb.org:
full.dat <- bugsigdbr::importBugSigDB(version = "devel", cache = FALSE)
dim(full.dat)
## [1] 2838 49
colnames(full.dat)
## [1] "BSDB ID" "Study"
## [3] "Study design" "PMID"
## [5] "DOI" "URL"
## [7] "Authors list" "Title"
## [9] "Journal" "Year"
## [11] "Experiment" "Location of subjects"
## [13] "Host species" "Body site"
## [15] "UBERON ID" "Condition"
## [17] "EFO ID" "Group 0 name"
## [19] "Group 1 name" "Group 1 definition"
## [21] "Group 0 sample size" "Group 1 sample size"
## [23] "Antibiotics exclusion" "Sequencing type"
## [25] "16S variable region" "Sequencing platform"
## [27] "Statistical test" "Significance threshold"
## [29] "MHT correction" "LDA Score above"
## [31] "Matched on" "Confounders controlled for"
## [33] "Pielou" "Shannon"
## [35] "Chao1" "Simpson"
## [37] "Inverse Simpson" "Richness"
## [39] "Signature page name" "Source"
## [41] "Curated date" "Curator"
## [43] "Revision editor" "Description"
## [45] "Abundance in Group 1" "MetaPhlAn taxon names"
## [47] "NCBI Taxonomy IDs" "State"
## [49] "Reviewer"
Stripping illformed entries:
Number of papers and signatures curated:
## [1] 705
nrow(full.dat)
## [1] 2838
Publication date of the curated papers:
pmids <- pmids[!is.na(pmids)]
pubyear1 <- pmid2pubyear(pmids[1:361])
pubyear2 <- pmid2pubyear(pmids[362:length(pmids)])
pubyear <- c(pubyear1, pubyear2)
head(cbind(pmids, pubyear))
## pmids pubyear
## [1,] "28038683" "2016"
## [2,] "28173873" "2017"
## [3,] "27015276" "2016"
## [4,] "27625705" "2016"
## [5,] "23071781" "2012"
## [6,] "28467925" "2017"
tab <- table(pubyear)
tab <- tab[-length(tab)]
tab <- tab[order(as.integer(names(tab)))]
df <- data.frame(year = names(tab), papers = as.integer(tab))
ggbarplot(df, x = "year", y = "papers",
label = TRUE, fill = "steelblue",
ggtheme = theme_bw())
Stripping empty signatures:
ind1 <- lengths(full.dat[["MetaPhlAn taxon names"]]) > 0
ind2 <- lengths(full.dat[["NCBI Taxonomy IDs"]]) > 0
dat <- full.dat[ind1 & ind2,]
nrow(dat)
## [1] 2838
Papers containing only empty UP and DOWN signatures (under curation?):
## numeric(0)
Progress over time:
dat[,"Curated date"] <- as.character(lubridate::dmy(dat[,"Curated date"]))
plotProgressOverTime(dat)
plotProgressOverTime(dat, diff = TRUE)
Stratified by curator:
npc <- stratifyByCurator(dat)
plotCuratorStats(dat, npc)
Number of complete and revised signatures:
table(df[["State"]])
## < table of extent 0 >
table(dat[,"Revision editor"])
##
## Adi13 Aiyshaaaa
## 8 19
## Aiyshaaaa,WikiWorks Aiyshaaaa,WikiWorks,Merit
## 1 4
## Annabelcute Atrayees
## 3 11
## Atrayees,Aiyshaaaa,Claregrieve1 Atrayees,Claregrieve1
## 1 2
## Atrayees,Lwaldron,Claregrieve1 Atrayees,WikiWorks
## 1 20
## Atrayees,WikiWorks,Merit Barakat Dindi,Chloe
## 2 2
## BLESSING123 BLESSING123,Chloe
## 15 2
## Brian Brian,Suwaiba
## 2 2
## Busayo Busayo,Fatima
## 1 1
## Busayo,Mcarlson Chioma
## 2 4
## Chioma,Fatima Chloe
## 2 7
## Chloe,Kwekuamoo Chloe,Merit
## 1 1
## Chloe,WikiWorks Chondatondaponda
## 4 10
## Claregrieve1 Claregrieve1,Aiyshaaaa
## 152 2
## Claregrieve1,Atrayees Claregrieve1,Atrayees,WikiWorks
## 1 2
## Claregrieve1,Atrayees,WikiWorks,Merit Claregrieve1,Chloe
## 1 1
## Claregrieve1,Chloe,WikiWorks,Merit Claregrieve1,Fatima
## 1 10
## Claregrieve1,Fatima,LGeistlinger Claregrieve1,Fatima,Yu Wang
## 1 1
## Claregrieve1,Lwaldron Claregrieve1,Lwaldron,Suwaiba
## 2 1
## Claregrieve1,Merit Claregrieve1,Merit,WikiWorks
## 8 2
## Claregrieve1,Rukky,WikiWorks Claregrieve1,Suwaiba
## 1 1
## Claregrieve1,Suwaiba,Merit Claregrieve1,WikiWorks
## 1 264
## Claregrieve1,WikiWorks,Merit Cyberian
## 6 4
## Cyberian,Chloe Cynthia Anderson
## 2 28
## Cynthia Anderson,Claregrieve1 Cynthia Anderson,Fatima
## 2 2
## Cynthia Anderson,LGeistlinger,WikiWorks Cynthia Anderson,Lwaldron,WikiWorks
## 2 1
## Danyab56,Aiyshaaaa,Claregrieve1 Deacme
## 5 4
## Deacme,Aiyshaaaa Deacme,Atrayees
## 8 2
## Dupe Dupe,Aiyshaaaa
## 4 1
## Dupe,Atrayees Dupe,Mcarlson
## 2 2
## Ellajessica Ellajessica,Aiyshaaaa
## 2 2
## Fatima Fatima,Aiyshaaaa
## 21 1
## Fatima,Claregrieve1 Fatima,Claregrieve1,WikiWorks
## 2 11
## Fatima,Kwekuamoo,WikiWorks Fatima,LGeistlinger,WikiWorks
## 2 1
## Fatima,Lwaldron,WikiWorks Fatima,Merit,WikiWorks
## 3 3
## Fatima,WikiWorks Fatima,WikiWorks,Merit
## 40 3
## Fcuevas3 Fcuevas3,Aiyshaaaa
## 37 2
## Fcuevas3,Claregrieve1 Fcuevas3,Fatima
## 4 1
## Fcuevas3,Lwaldron,Aiyshaaaa Fcuevas3,Rimsha
## 2 8
## Gina Haoyanzh
## 14 20
## Haoyanzh,Lwaldron Itslanapark
## 2 27
## Itslanapark,Aiyshaaaa Itslanapark,Atrayees
## 5 1
## Itslanapark,Chloe Itslanapark,Chloe,Aiyshaaaa,Merit
## 3 1
## Itslanapark,Claregrieve1 Itslanapark,Claregrieve1,Aiyshaaaa
## 4 1
## Itslanapark,Claregrieve1,Atrayees Itslanapark,Fatima
## 1 2
## Itslanapark,Fatima,Chloe,Merit Itslanapark,Rimsha
## 1 1
## Jacquelynshevin Jacquelynshevin,Chloe
## 29 1
## Jacquelynshevin,Chloe,WikiWorks Jacquelynshevin,Fatima
## 1 7
## Jeshudy Jeshudy,Aiyshaaaa
## 55 3
## Jeshudy,Claregrieve1 Jeshudy,Fatima
## 38 4
## Jeshudy,Suwaiba Joyessa
## 2 1
## Joyessa,Aiyshaaaa Joyessa,Claregrieve1
## 3 18
## Joyessa,Claregrieve1,Merit Joyessa,Fatima,Claregrieve1
## 1 2
## Kahvecirem,Aiyshaaaa,Claregrieve1 Kahvecirem,Aiyshaaaa,Merit,Claregrieve1
## 2 3
## Kahvecirem,Atrayees Kahvecirem,Atrayees,Merit,Claregrieve1
## 2 1
## Kahvecirem,Merit,Claregrieve1 Kaluifeanyi101
## 2 54
## Kaluifeanyi101,Aiyshaaaa Kaluifeanyi101,Claregrieve1
## 2 16
## Kaluifeanyi101,Fatima KathyWaldron,WikiWorks
## 2 3
## KathyWaldron,WikiWorks,Merit Kelvin Joseph
## 1 11
## Kelvin Joseph,Claregrieve1 Khadeeejah,Aiyshaaaa
## 2 4
## Khadeeejah,Aiyshaaaa,Chloe Khadeeejah,Atrayees
## 1 2
## Khadeeejah,Atrayees,Chloe Kwekuamoo
## 1 25
## Kwekuamoo,Claregrieve1 Kwekuamoo,Merit,WikiWorks
## 4 1
## Levitest,WikiWorks,Merit Lorakasselman,Chloe
## 1 2
## Lorakasselman,Claregrieve1 Lwaldron
## 2 11
## Lwaldron,Atrayees,WikiWorks,Aiyshaaaa Lwaldron,Claregrieve1,WikiWorks
## 1 5
## Lwaldron,Claregrieve1,WikiWorks,Merit Lwaldron,Fatima,WikiWorks
## 2 1
## Lwaldron,WikiWorks Lwaldron,WikiWorks,LGeistlinger
## 30 1
## Lwaldron,WikiWorks,Merit Madhubani Dey
## 4 13
## Madhubani Dey,Aiyshaaaa Madhubani Dey,Chloe,Merit
## 1 1
## Madhubani Dey,Claregrieve1 Madhubani Dey,Fatima,Claregrieve1
## 26 2
## Madhubani Dey,Lwaldron Madhubani Dey,Merit
## 1 2
## Manuela Mary Bearkland
## 11 24
## Mary Bearkland,Aiyshaaaa Mary Bearkland,Aiyshaaaa,Claregrieve1
## 1 1
## Mary Bearkland,Claregrieve1 Mary Bearkland,Fatima
## 20 3
## Mary Bearkland,Fatima,Merit Mary Bearkland,Merit
## 1 6
## Maryemzaki,Lwaldron Merit
## 2 2
## Merit,Aiyshaaaa Merit,Claregrieve1
## 1 1
## Merit,WikiWorks Mmarin
## 19 21
## Mmarin,Atrayees Mmarin,Claregrieve1
## 1 13
## Nice25 Nnadichioma,Aiyshaaaa
## 1 1
## Nnadichioma,Aiyshaaaa,Atrayees Nnadichioma,Aiyshaaaa,Merit
## 2 2
## Nnadichioma,Atrayees,Aiyshaaaa,Merit Ombati
## 1 3
## Ombati,Atrayees Ombati,Chloe
## 2 1
## Rimsha Rimsha,Fatima,LGeistlinger,WikiWorks
## 4 1
## Rimsha,Fatima,WikiWorks Rimsha,Lwaldron
## 1 1
## Samara.Khan Samara.Khan,Claregrieve1
## 38 9
## Samara.Khan,Fatima Sharmilac
## 1 5
## Sharmilac,Aiyshaaaa Sharmilac,Claregrieve1
## 1 2
## Sharmilac,Fatima Sharmilac,Fatima,Aiyshaaaa
## 6 1
## Sharmilac,Merit Sophy
## 2 9
## Sophy,Aiyshaaaa,Claregrieve1 Sophy,Chloe
## 4 3
## Sophy,Claregrieve1 Sophy,Mcarlson,Atrayees
## 4 2
## Suwaiba Tislam
## 23 29
## Tislam,Aiyshaaaa Tislam,Aiyshaaaa,Claregrieve1
## 2 1
## Tislam,Atrayees Tislam,Claregrieve1
## 2 6
## Tislam,Fatima Tislam,Fatima,Claregrieve1
## 8 1
## Tislam,Rimsha,Claregrieve1,Merit Titas
## 2 11
## Titas,Fatima Titas,Lwaldron
## 1 1
## Ufuoma Ejite Ufuoma Ejite,Chloe,Aiyshaaaa
## 9 1
## Ufuoma Ejite,Chloe,Aiyshaaaa,Atrayees Ufuoma Ejite,Lwaldron,WikiWorks
## 1 7
## Valentina WikiWorks
## 2 1092
## WikiWorks,Aiyshaaaa WikiWorks,Atrayees
## 3 5
## WikiWorks,Atrayees,Merit WikiWorks,Claregrieve1
## 4 12
## WikiWorks,Fatima WikiWorks,Jeshudy
## 2 1
## WikiWorks,Merit WikiWorks,Rukky
## 29 3
## WikiWorks,Suwaiba Yu Wang,Fatima
## 1 1
## Yu Wang,Fatima,Claregrieve1
## 4
spl <- split(dat[["Study"]], dat[["Study design"]])
sds <- lapply(spl, unique)
sort(lengths(sds), decreasing = FALSE)
## case-control,prospective cohort
## 1
## laboratory experiment,time series / longitudinal observational
## 1
## case-control,meta-analysis
## 5
## meta-analysis
## 5
## randomized controlled trial
## 30
## laboratory experiment
## 32
## time series / longitudinal observational
## 59
## prospective cohort
## 68
## cross-sectional observational, not case-control
## 205
## case-control
## 314
Columns of the full dataset that describe experiments:
# Experiment ID
exp.cols <- c("Study", "Experiment")
# Subjects
sub.cols <- c("Host species",
"Location of subjects",
"Body site",
"Condition",
"Antibiotics exclusion",
"Group 0 sample size",
"Group 1 sample size")
# Lab analysis
lab.cols <- c("Sequencing type",
"16S variable region",
"Sequencing platform")
# Statistical analysis
stat.cols <- c("Statistical test",
"MHT correction",
"Significance threshold")
# Alpha diversity
div.cols <- c("Pielou",
"Shannon",
"Chao1",
"Simpson",
"Inverse Simpson",
"Richness")
Restrict dataset to experiment information:
Number of experiments for the top 10 categories for each subjects column:
## $`Host species`
##
## Homo sapiens Mus musculus Rattus norvegicus
## 1542 70 6
##
## $`Location of subjects`
##
## China United States of America Italy
## 492 417 67
## Spain Japan South Korea
## 53 48 44
## Finland Canada Netherlands
## 36 29 29
## Brazil
## 25
##
## $`Body site`
##
## Feces Saliva Vagina
## 957 96 50
## Uterine cervix Mouth Skin of body
## 40 30 29
## Nasopharynx Meconium Intestine
## 28 27 26
## Subgingival dental plaque
## 24
##
## $Condition
##
## obesity
## 107
## COVID-19
## 89
## colorectal cancer
## 81
## antimicrobial agent
## 77
## diet
## 56
## Parkinson's disease
## 45
## human papilloma virus infection
## 41
## cervical glandular intraepithelial neoplasia
## 38
## gastric cancer
## 36
## endometriosis
## 35
##
## $`Antibiotics exclusion`
##
## 3 months 1 month 6 months 4 weeks 2 months
## 199 117 77 58 57
## 2 weeks None 8 weeks None specified 5 days
## 31 15 12 12 10
Proportions instead:
sub.tab <- lapply(sub.cols[1:5], tabCol, df = exps, n = 10, perc = TRUE)
names(sub.tab) <- sub.cols[1:5]
sub.tab
## $`Host species`
##
## Homo sapiens Mus musculus Rattus norvegicus
## 0.95300 0.04330 0.00371
##
## $`Location of subjects`
##
## China United States of America Italy
## 0.3060 0.2590 0.0416
## Spain Japan South Korea
## 0.0329 0.0298 0.0273
## Finland Canada Netherlands
## 0.0224 0.0180 0.0180
## Brazil
## 0.0155
##
## $`Body site`
##
## Feces Saliva Vagina
## 0.5910 0.0593 0.0309
## Uterine cervix Mouth Skin of body
## 0.0247 0.0185 0.0179
## Nasopharynx Meconium Intestine
## 0.0173 0.0167 0.0161
## Subgingival dental plaque
## 0.0148
##
## $Condition
##
## obesity
## 0.0665
## COVID-19
## 0.0553
## colorectal cancer
## 0.0503
## antimicrobial agent
## 0.0479
## diet
## 0.0348
## Parkinson's disease
## 0.0280
## human papilloma virus infection
## 0.0255
## cervical glandular intraepithelial neoplasia
## 0.0236
## gastric cancer
## 0.0224
## endometriosis
## 0.0218
##
## $`Antibiotics exclusion`
##
## 3 months 1 month 6 months 4 weeks 2 months
## 0.19800 0.11600 0.07660 0.05770 0.05670
## 2 weeks None 8 weeks None specified 5 days
## 0.03080 0.01490 0.01190 0.01190 0.00995
Sample size:
ssize <- apply(exps[,sub.cols[6:7]], 2, summary)
ssize
## Group 0 sample size Group 1 sample size
## Min. 0.00000 1.00000
## 1st Qu. 13.00000 13.00000
## Median 27.00000 24.00000
## Mean 68.47422 81.77897
## 3rd Qu. 53.50000 46.00000
## Max. 1883.00000 9623.00000
## NA's 50.00000 42.00000
Number of experiments for the top 10 categories for each lab analysis column:
## $`Sequencing type`
##
## 16S WMS PCR ITS / ITS2
## 1464 136 4 1
##
## $`16S variable region`
##
## 34 4 123 12 345 45 3 56 1234 23
## 487 377 125 76 57 56 38 18 13 11
##
## $`Sequencing platform`
##
## Illumina Roche454
## 1118 232
## Ion Torrent RT-qPCR
## 94 84
## Human Intestinal Tract Chip MGISEQ-2000
## 12 11
## Mass spectrometry Sanger
## 7 6
## HTF-Microbi.Array Non-quantitative PCR
## 4 4
Proportions instead:
lab.tab <- lapply(lab.cols, tabCol, df = exps, n = 10, perc = TRUE)
names(lab.tab) <- lab.cols
lab.tab
## $`Sequencing type`
##
## 16S WMS PCR ITS / ITS2
## 0.912000 0.084700 0.002490 0.000623
##
## $`16S variable region`
##
## 34 4 123 12 345 45 3 56 1234 23
## 0.37500 0.29100 0.09640 0.05860 0.04390 0.04320 0.02930 0.01390 0.01000 0.00848
##
## $`Sequencing platform`
##
## Illumina Roche454
## 0.70400 0.14600
## Ion Torrent RT-qPCR
## 0.05920 0.05290
## Human Intestinal Tract Chip MGISEQ-2000
## 0.00755 0.00692
## Mass spectrometry Sanger
## 0.00441 0.00378
## HTF-Microbi.Array Non-quantitative PCR
## 0.00252 0.00252
Number of experiments for the top 10 categories for each statistical analysis column:
## $`Statistical test`
##
## LEfSe Mann-Whitney (Wilcoxon)
## 456 439
## DESeq2 Kruskall-Wallis
## 108 103
## T-Test ANOVA
## 86 68
## Linear Regression PERMANOVA
## 47 34
## Negative Binomial Regression Metastats
## 31 28
##
## $`MHT correction`
##
## TRUE FALSE
## 753 716
##
## $`Significance threshold`
##
## 0.05 0.1 0.01 0.001 0.15 0.2 0.005 0 0.25 1e-11
## 1401 62 26 21 14 13 4 3 3 1
Proportions instead:
stat.tab <- lapply(stat.cols, tabCol, df = exps, n = 10, perc = TRUE)
names(stat.tab) <- stat.cols
stat.tab
## $`Statistical test`
##
## LEfSe Mann-Whitney (Wilcoxon)
## 0.2870 0.2770
## DESeq2 Kruskall-Wallis
## 0.0681 0.0649
## T-Test ANOVA
## 0.0542 0.0428
## Linear Regression PERMANOVA
## 0.0296 0.0214
## Negative Binomial Regression Metastats
## 0.0195 0.0176
##
## $`MHT correction`
##
## TRUE FALSE
## 0.513 0.487
##
## $`Significance threshold`
##
## 0.05 0.1 0.01 0.001 0.15 0.2 0.005 0
## 0.901000 0.039900 0.016700 0.013500 0.009000 0.008360 0.002570 0.001930
## 0.25 1e-11
## 0.001930 0.000643
Overall distribution:
apply(exps[,div.cols], 2, table)
## Pielou Shannon Chao1 Simpson Inverse Simpson Richness
## decreased 18 220 127 69 15 112
## increased 12 158 74 40 14 118
## unchanged 41 569 326 218 40 296
Correspondence of Shannon diversity and Richness:
table(exps$Shannon, exps$Richness)
##
## decreased increased unchanged
## decreased 53 6 23
## increased 1 45 17
## unchanged 21 30 220
Conditions with consistently increased or decreased alpha diversity:
tabDiv(exps, "Shannon", "Condition")
## increased decreased unchanged
## COVID-19 5 15 33
## obesity 3 13 36
## gastric cancer 3 12 14
## HIV infection 1 10 10
## human papilloma virus infection 6 0 24
## lung cancer 2 8 5
## cesarean section 5 0 14
## Crohn's disease 0 5 2
## acute lymphoblastic leukemia 0 4 4
## antimicrobial agent 5 9 22
## cervical cancer 4 0 4
## hypertension 4 0 2
## age 2 5 2
## alcohol drinking 3 0 2
## Alzheimer's disease 0 3 2
## atopic asthma 4 1 7
## Parkinson's disease 6 3 12
## periodontitis 3 0 3
## squamous cell carcinoma 3 0 3
## cervical glandular intraepithelial neoplasia 2 0 9
## colorectal cancer 6 8 24
## diet 6 4 14
## Eczema 0 2 10
## ethnic group 4 6 7
## food allergy 0 2 10
## schizophrenia 1 3 7
## smoking behavior 3 1 14
## socioeconomic status 4 2 4
## acute myeloid leukemia 2 3 2
## air pollution 7 6 3
## asthma 1 0 11
## atopic eczema 2 3 17
## chronic fatigue syndrome 0 1 4
## chronic hepatitis B virus infection 0 1 5
## endometriosis 2 3 17
## esophageal cancer 1 2 2
## milk allergic reaction 1 0 5
## multiple sclerosis 0 1 7
## pancreatic carcinoma 0 1 4
## Response to immunochemotherapy 2 1 3
## arthritis 1 1 5
## colorectal adenoma 2 2 5
## gestational diabetes 0 0 5
## hepatocellular carcinoma 0 0 6
## HIV mother to child transmission 0 0 5
## irritable bowel syndrome 0 0 6
## obsessive-compulsive disorder 0 0 5
## rheumatoid arthritis 3 3 4
## type II diabetes mellitus 2 2 9
tabDiv(exps, "Shannon", "Condition", perc = TRUE)
## increased decreased unchanged
## COVID-19 0.094 0.280 0.62
## obesity 0.058 0.250 0.69
## gastric cancer 0.100 0.410 0.48
## HIV infection 0.048 0.480 0.48
## human papilloma virus infection 0.200 0.000 0.80
## lung cancer 0.130 0.530 0.33
## cesarean section 0.260 0.000 0.74
## Crohn's disease 0.000 0.710 0.29
## acute lymphoblastic leukemia 0.000 0.500 0.50
## antimicrobial agent 0.140 0.250 0.61
## cervical cancer 0.500 0.000 0.50
## hypertension 0.670 0.000 0.33
## age 0.220 0.560 0.22
## alcohol drinking 0.600 0.000 0.40
## Alzheimer's disease 0.000 0.600 0.40
## atopic asthma 0.330 0.083 0.58
## Parkinson's disease 0.290 0.140 0.57
## periodontitis 0.500 0.000 0.50
## squamous cell carcinoma 0.500 0.000 0.50
## cervical glandular intraepithelial neoplasia 0.180 0.000 0.82
## colorectal cancer 0.160 0.210 0.63
## diet 0.250 0.170 0.58
## Eczema 0.000 0.170 0.83
## ethnic group 0.240 0.350 0.41
## food allergy 0.000 0.170 0.83
## schizophrenia 0.091 0.270 0.64
## smoking behavior 0.170 0.056 0.78
## socioeconomic status 0.400 0.200 0.40
## acute myeloid leukemia 0.290 0.430 0.29
## air pollution 0.440 0.380 0.19
## asthma 0.083 0.000 0.92
## atopic eczema 0.091 0.140 0.77
## chronic fatigue syndrome 0.000 0.200 0.80
## chronic hepatitis B virus infection 0.000 0.170 0.83
## endometriosis 0.091 0.140 0.77
## esophageal cancer 0.200 0.400 0.40
## milk allergic reaction 0.170 0.000 0.83
## multiple sclerosis 0.000 0.120 0.88
## pancreatic carcinoma 0.000 0.200 0.80
## Response to immunochemotherapy 0.330 0.170 0.50
## arthritis 0.140 0.140 0.71
## colorectal adenoma 0.220 0.220 0.56
## gestational diabetes 0.000 0.000 1.00
## hepatocellular carcinoma 0.000 0.000 1.00
## HIV mother to child transmission 0.000 0.000 1.00
## irritable bowel syndrome 0.000 0.000 1.00
## obsessive-compulsive disorder 0.000 0.000 1.00
## rheumatoid arthritis 0.300 0.300 0.40
## type II diabetes mellitus 0.150 0.150 0.69
tabDiv(exps, "Richness", "Condition")
## increased decreased unchanged
## COVID-19 4 15 17
## air pollution 15 5 3
## Parkinson's disease 9 0 2
## antimicrobial agent 1 8 5
## alcohol drinking 5 0 0
## HIV infection 0 5 8
## acute lymphoblastic leukemia 5 1 0
## cervical glandular intraepithelial neoplasia 4 0 2
## atopic asthma 4 1 7
## food allergy 0 3 9
## gastric cancer 3 6 13
## colorectal adenoma 0 2 8
## diet 3 1 6
## endometriosis 3 1 15
## asthma 1 0 10
## atopic eczema 2 1 7
## colorectal cancer 6 5 9
## ethnic group 3 2 1
## human papilloma virus infection 2 1 13
## irritable bowel syndrome 0 1 7
## lung cancer 0 1 6
## obesity 6 7 18
## obsessive-compulsive disorder 0 1 4
## schizophrenia 1 2 2
## cesarean section 2 2 9
## HIV mother to child transmission 0 0 5
## multiple sclerosis 0 0 9
## smoking behavior 1 1 8
## socioeconomic status 2 2 1
tabDiv(exps, "Richness", "Condition", perc = TRUE)
## increased decreased unchanged
## COVID-19 0.110 0.420 0.47
## air pollution 0.650 0.220 0.13
## Parkinson's disease 0.820 0.000 0.18
## antimicrobial agent 0.071 0.570 0.36
## alcohol drinking 1.000 0.000 0.00
## HIV infection 0.000 0.380 0.62
## acute lymphoblastic leukemia 0.830 0.170 0.00
## cervical glandular intraepithelial neoplasia 0.670 0.000 0.33
## atopic asthma 0.330 0.083 0.58
## food allergy 0.000 0.250 0.75
## gastric cancer 0.140 0.270 0.59
## colorectal adenoma 0.000 0.200 0.80
## diet 0.300 0.100 0.60
## endometriosis 0.160 0.053 0.79
## asthma 0.091 0.000 0.91
## atopic eczema 0.200 0.100 0.70
## colorectal cancer 0.300 0.250 0.45
## ethnic group 0.500 0.330 0.17
## human papilloma virus infection 0.120 0.062 0.81
## irritable bowel syndrome 0.000 0.120 0.88
## lung cancer 0.000 0.140 0.86
## obesity 0.190 0.230 0.58
## obsessive-compulsive disorder 0.000 0.200 0.80
## schizophrenia 0.200 0.400 0.40
## cesarean section 0.150 0.150 0.69
## HIV mother to child transmission 0.000 0.000 1.00
## multiple sclerosis 0.000 0.000 1.00
## smoking behavior 0.100 0.100 0.80
## socioeconomic status 0.400 0.400 0.20
Body sites with consistently increased or decreased alpha diversity:
tabDiv(exps, "Shannon", "Body site")
## increased decreased unchanged
## Feces 69 140 340
## Mouth 9 2 11
## Stomach 3 9 5
## Vagina 9 3 10
## Meconium 5 0 7
## Subgingival dental plaque 7 2 7
## Tongue 0 5 8
## Posterior fornix of vagina 4 0 3
## Caecum 1 4 1
## Dental plaque 0 3 3
## Skin of body 4 7 6
## Uterine cervix 3 0 23
## Nasopharynx 1 3 18
## Oral cavity 3 1 2
## Saliva 15 13 38
## Breast 3 2 0
## Lung 1 2 6
## Oropharynx 1 2 3
## Rectum 0 1 12
## Uterus 0 1 7
## Blood 0 0 5
## Bronchus 0 0 6
## Intestine 1 1 6
tabDiv(exps, "Shannon", "Body site", perc = TRUE)
## increased decreased unchanged
## Feces 0.130 0.260 0.62
## Mouth 0.410 0.091 0.50
## Stomach 0.180 0.530 0.29
## Vagina 0.410 0.140 0.45
## Meconium 0.420 0.000 0.58
## Subgingival dental plaque 0.440 0.120 0.44
## Tongue 0.000 0.380 0.62
## Posterior fornix of vagina 0.570 0.000 0.43
## Caecum 0.170 0.670 0.17
## Dental plaque 0.000 0.500 0.50
## Skin of body 0.240 0.410 0.35
## Uterine cervix 0.120 0.000 0.88
## Nasopharynx 0.045 0.140 0.82
## Oral cavity 0.500 0.170 0.33
## Saliva 0.230 0.200 0.58
## Breast 0.600 0.400 0.00
## Lung 0.110 0.220 0.67
## Oropharynx 0.170 0.330 0.50
## Rectum 0.000 0.077 0.92
## Uterus 0.000 0.120 0.88
## Blood 0.000 0.000 1.00
## Bronchus 0.000 0.000 1.00
## Intestine 0.120 0.120 0.75
tabDiv(exps, "Richness", "Body site")
## increased decreased unchanged
## Colon 9 0 1
## Feces 55 61 157
## Mouth 7 1 5
## Oropharynx 0 6 3
## Uterine cervix 6 0 16
## Stomach 2 7 3
## Posterior fornix of vagina 4 0 2
## Subgingival dental plaque 4 0 4
## Rectum 0 3 7
## Nasopharynx 3 5 10
## Saliva 5 7 18
## Tongue 0 2 5
## Vagina 3 1 11
## Caecum 2 3 0
## Meconium 1 2 6
## Bronchus 0 0 6
## Intestine 0 0 7
tabDiv(exps, "Richness", "Body site", perc = TRUE)
## increased decreased unchanged
## Colon 0.90 0.000 0.10
## Feces 0.20 0.220 0.58
## Mouth 0.54 0.077 0.38
## Oropharynx 0.00 0.670 0.33
## Uterine cervix 0.27 0.000 0.73
## Stomach 0.17 0.580 0.25
## Posterior fornix of vagina 0.67 0.000 0.33
## Subgingival dental plaque 0.50 0.000 0.50
## Rectum 0.00 0.300 0.70
## Nasopharynx 0.17 0.280 0.56
## Saliva 0.17 0.230 0.60
## Tongue 0.00 0.290 0.71
## Vagina 0.20 0.067 0.73
## Caecum 0.40 0.600 0.00
## Meconium 0.11 0.220 0.67
## Bronchus 0.00 0.000 1.00
## Intestine 0.00 0.000 1.00
sigs <- bugsigdbr::getSignatures(dat, tax.id.type = "metaphlan")
Number unique microbes contained in the signatures:
## [1] 2484
Development of unique microbes captured over time:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 4.000 7.449 9.000 471.000
gghistogram(lengths(sigs), bins = 30, ylab = "number of signatures",
xlab = "signature size", fill = "#00AFBB", ggtheme = theme_bw())
## [1] 1371
dat.feces <- subset(dat, `Body site` == "Feces")
cooc.mat <- microbeHeatmap(dat.feces, tax.level = "genus")
## Loading required namespace: safe
antag.mat <- microbeHeatmap(dat.feces, tax.level = "genus", antagonistic = TRUE)
Get the top 20 genera most frequently reported as differentially abundant:
sigs.feces <- getSignatures(dat.feces, tax.id.type = "taxname",
tax.level = "genus", exact.tax.level = FALSE)
top20 <- sort(table(unlist(sigs.feces)), decreasing = TRUE)[1:20]
top20
##
## Bacteroides Bifidobacterium Faecalibacterium Prevotella
## 299 204 199 192
## Streptococcus Clostridium Ruminococcus Roseburia
## 181 163 162 157
## Blautia Lactobacillus Parabacteroides Alistipes
## 153 141 129 122
## Coprococcus Dorea Eubacterium Veillonella
## 115 114 111 110
## Enterococcus Lachnospira Akkermansia Anaerostipes
## 105 102 100 98
Subset heatmaps to the top 20 genera most frequently reported as differentially abundant:
## [1] TRUE
## [1] TRUE
Distinguish by direction of abundance change (increased / decreased):
# increased
sub.dat.feces <- subset(dat.feces, `Abundance in Group 1` == "increased")
sigs.feces.up <- getSignatures(sub.dat.feces, tax.id.type = "taxname",
tax.level = "genus", exact.tax.level = FALSE)
top20.up <- table(unlist(sigs.feces.up))[names(top20)]
top20.up
##
## Bacteroides Bifidobacterium Faecalibacterium Prevotella
## 116 84 66 100
## Streptococcus Clostridium Ruminococcus Roseburia
## 111 84 64 48
## Blautia Lactobacillus Parabacteroides Alistipes
## 64 95 63 41
## Coprococcus Dorea Eubacterium Veillonella
## 37 47 47 73
## Enterococcus Lachnospira Akkermansia Anaerostipes
## 86 22 61 34
# decreased
sub.dat.feces <- subset(dat.feces, `Abundance in Group 1` == "decreased")
sigs.feces.down <- getSignatures(sub.dat.feces, tax.id.type = "taxname",
tax.level = "genus", exact.tax.level = FALSE)
top20.down <- table(unlist(sigs.feces.down))[names(top20)]
top20.down
##
## Bacteroides Bifidobacterium Faecalibacterium Prevotella
## 174 110 127 88
## Streptococcus Clostridium Ruminococcus Roseburia
## 60 74 93 103
## Blautia Lactobacillus Parabacteroides Alistipes
## 83 44 60 75
## Coprococcus Dorea Eubacterium Veillonella
## 72 61 58 32
## Enterococcus Lachnospira Akkermansia Anaerostipes
## 15 74 33 58
Plot the heatmap
# annotation
mat <- matrix(nc = 2, cbind(top20.up, top20.down))
bp <- ComplexHeatmap::anno_barplot(mat, gp = gpar(fill = c("#D55E00", "#0072B2"),
col = c("#D55E00", "#0072B2")),
height = unit(2, "cm"))
banno <- ComplexHeatmap::HeatmapAnnotation(`Abundance in Group 1` = bp)
lgd_list <- list(
Legend(labels = c("increased", "decreased"),
title = "Abundance in Group 1",
type = "grid",
legend_gp = gpar(col = c("#D55E00", "#0072B2"), fill = c("#D55E00", "#0072B2"))))
# same direction
# lcm <- sweep(cooc.mat, 2, matrixStats::colMaxs(cooc.mat), FUN = "/")
# we need to dampen the maximum here a bit down,
# otherwise 100% self co-occurrence takes up a large fraction of the colorscale,
sec <- apply(cooc.mat, 2, function(x) sort(x, decreasing = TRUE)[2])
cooc.mat2 <- cooc.mat
for(i in 1:ncol(cooc.mat2)) cooc.mat2[i,i] <- min(cooc.mat2[i,i], 1.4 * sec[i])
lcm <- sweep(cooc.mat2, 2, matrixStats::colMaxs(cooc.mat2), FUN = "/")
col <- circlize::colorRamp2(c(0,1), c("#EEEEEE", "red"))
ht1 <- ComplexHeatmap::Heatmap(lcm,
col = col,
name = "Relative frequency (top)",
cluster_columns = FALSE,
row_km = 3,
row_title = "same direction",
column_names_rot = 45,
row_names_gp = gpar(fontsize = 8),
column_names_gp = gpar(fontsize = 8))
# opposite direction
acm <- sweep(antag.mat, 2, matrixStats::colMaxs(antag.mat), FUN = "/")
col <- circlize::colorRamp2(c(0,1), c("#EEEEEE", "blue"))
ht2 <- ComplexHeatmap::Heatmap(acm,
col = col,
name = "Relative frequency (bottom)",
cluster_columns = FALSE,
row_title = "opposite direction",
row_km = 3,
column_names_rot = 45,
row_names_gp = gpar(fontsize = 8),
column_names_gp = gpar(fontsize = 8))
# phylum
sfp <- bugsigdbr::getSignatures(dat.feces, tax.id.type = "metaphlan",
tax.level = "genus", exact.tax.level = FALSE)
sfp20 <- sort(table(unlist(sfp)), decreasing = TRUE)[1:20]
uanno <- bugsigdbr::extractTaxLevel(names(sfp20),
tax.id.type = "taxname",
tax.level = "phylum",
exact.tax.level = FALSE)
phyla.grid <- seq_along(unique(uanno))
panno <- ComplexHeatmap::HeatmapAnnotation(phylum = uanno)
uanno <- matrix(uanno, nrow = 1)
colnames(uanno) <- names(top20)
pcols <- c("#CC79A7", "#F0E442", "#009E73", "#56B4E9", "#E69F00")
uanno <- ComplexHeatmap::Heatmap(uanno, name = "Phylum",
col = pcols[phyla.grid],
cluster_columns = FALSE,
column_names_rot = 45,
column_names_gp = gpar(fontsize = 8))
# put everything together
ht_list <- ht1 %v% banno %v% ht2 %v% uanno
ComplexHeatmap::draw(ht_list, annotation_legend_list = lgd_list, merge_legend = TRUE)
decorate_annotation("Abundance in Group 1", {
grid.text("# signatures", x = unit(-1, "cm"), rot = 90, just = "bottom", gp = gpar(fontsize = 8))
grid.text("*", x = unit(2.45, "cm"), y = unit(1.2, "cm"))
grid.text("*", x = unit(5.18, "cm"), y = unit(1, "cm"))
grid.text("*", x = unit(6.55, "cm"), y = unit(0.95, "cm"))
grid.text("*", x = unit(8.6, "cm"), y = unit(0.85, "cm"))
grid.text("*", x = unit(10, "cm"), y = unit(0.7, "cm"))
grid.text("*", x = unit(10.7, "cm"), y = unit(0.7, "cm"))
})
Inspect signature similarity for signatures from stomach samples based on Jaccard index:
stomachsub <- subset(dat, `Body site` == "Stomach")
sigsub <- bugsigdbr::getSignatures(stomachsub)
pair.jsim <- calcJaccardSimilarity(sigsub)
Create a dendrogram of Jaccard dissimilarities (1.0 has no overlap, 0.0 are identical signatures).