Get bulk export from bugsigdb.org:
full.dat <- bugsigdbr::importBugSigDB(version = "devel", cache = FALSE)
dim(full.dat)
## [1] 2942 50
colnames(full.dat)
## [1] "BSDB ID" "Study"
## [3] "Study design" "PMID"
## [5] "DOI" "URL"
## [7] "Authors list" "Title"
## [9] "Journal" "Year"
## [11] "Keywords" "Experiment"
## [13] "Location of subjects" "Host species"
## [15] "Body site" "UBERON ID"
## [17] "Condition" "EFO ID"
## [19] "Group 0 name" "Group 1 name"
## [21] "Group 1 definition" "Group 0 sample size"
## [23] "Group 1 sample size" "Antibiotics exclusion"
## [25] "Sequencing type" "16S variable region"
## [27] "Sequencing platform" "Statistical test"
## [29] "Significance threshold" "MHT correction"
## [31] "LDA Score above" "Matched on"
## [33] "Confounders controlled for" "Pielou"
## [35] "Shannon" "Chao1"
## [37] "Simpson" "Inverse Simpson"
## [39] "Richness" "Signature page name"
## [41] "Source" "Curated date"
## [43] "Curator" "Revision editor"
## [45] "Description" "Abundance in Group 1"
## [47] "MetaPhlAn taxon names" "NCBI Taxonomy IDs"
## [49] "State" "Reviewer"
Stripping illformed entries:
Number of papers and signatures curated:
## [1] 726
nrow(full.dat)
## [1] 2942
Publication date of the curated papers:
pmids <- pmids[!is.na(pmids)]
pubyear1 <- pmid2pubyear(pmids[1:361])
pubyear2 <- pmid2pubyear(pmids[362:length(pmids)])
pubyear <- c(pubyear1, pubyear2)
head(cbind(pmids, pubyear))
## pmids pubyear
## [1,] "28038683" "2016"
## [2,] "28173873" "2017"
## [3,] "27015276" "2016"
## [4,] "27625705" "2016"
## [5,] "23071781" "2012"
## [6,] "28467925" "2017"
tab <- table(pubyear)
tab <- tab[-length(tab)]
tab <- tab[order(as.integer(names(tab)))]
df <- data.frame(year = names(tab), papers = as.integer(tab))
ggbarplot(df, x = "year", y = "papers",
label = TRUE, fill = "steelblue",
ggtheme = theme_bw())
Stripping empty signatures:
ind1 <- lengths(full.dat[["MetaPhlAn taxon names"]]) > 0
ind2 <- lengths(full.dat[["NCBI Taxonomy IDs"]]) > 0
dat <- full.dat[ind1 & ind2,]
nrow(dat)
## [1] 2942
Papers containing only empty UP and DOWN signatures (under curation?):
## numeric(0)
Progress over time:
dat[,"Curated date"] <- as.character(lubridate::dmy(dat[,"Curated date"]))
plotProgressOverTime(dat)
## Warning in rbind(cdbm, cpbm): number of columns of result is not a multiple of
## vector length (arg 2)
plotProgressOverTime(dat, diff = TRUE)
## Warning in rbind(cdbm, cpbm): number of columns of result is not a multiple of
## vector length (arg 2)
Stratified by curator:
npc <- stratifyByCurator(dat)
plotCuratorStats(dat, npc)
Number of complete and revised signatures:
table(df[["State"]])
## < table of extent 0 >
table(dat[,"Revision editor"])
##
## Adi13 Aiyshaaaa
## 8 21
## Aiyshaaaa,Atrayees Aiyshaaaa,Claregrieve1
## 2 4
## Aiyshaaaa,WikiWorks Aiyshaaaa,WikiWorks,Merit
## 1 4
## Andre Andre,Chloe
## 2 1
## Annabelcute,Aiyshaaaa Atrayees
## 1 93
## Atrayees,Aiyshaaaa Atrayees,Aiyshaaaa,Claregrieve1
## 1 1
## Atrayees,Claregrieve1 Atrayees,Lwaldron
## 6 2
## Atrayees,Lwaldron,Claregrieve1 Atrayees,WikiWorks
## 1 20
## Atrayees,WikiWorks,Merit Barakat Dindi,Chloe
## 2 2
## Blessing Kaz,Aiyshaaaa,Atrayees BLESSING123
## 1 15
## BLESSING123,Chloe Brian,Suwaiba
## 2 1
## Brian,Suwaiba,Atrayees Busayo
## 1 1
## Busayo,Fatima Busayo,Mcarlson
## 1 2
## Chioma Chioma,Fatima
## 4 2
## Chloe Chloe,Kwekuamoo
## 7 1
## Chloe,Lorakasselman,Aiyshaaaa Chloe,Merit
## 1 1
## Chloe,WikiWorks Claregrieve1
## 4 157
## Claregrieve1,Aiyshaaaa Claregrieve1,Atrayees
## 2 1
## Claregrieve1,Atrayees,WikiWorks Claregrieve1,Atrayees,WikiWorks,Merit
## 2 1
## Claregrieve1,Chloe Claregrieve1,Chloe,WikiWorks,Merit
## 1 1
## Claregrieve1,Fatima Claregrieve1,Fatima,LGeistlinger
## 10 1
## Claregrieve1,Fatima,Yu Wang Claregrieve1,Lwaldron
## 1 2
## Claregrieve1,Lwaldron,Suwaiba Claregrieve1,Merit
## 1 8
## Claregrieve1,Merit,WikiWorks Claregrieve1,Rukky,WikiWorks
## 2 1
## Claregrieve1,Suwaiba Claregrieve1,Suwaiba,Merit
## 1 1
## Claregrieve1,WikiWorks Claregrieve1,WikiWorks,Atrayees
## 262 1
## Claregrieve1,WikiWorks,Merit Cyberian
## 6 4
## Cyberian,Chloe Cyberian,Chloe,Aiyshaaaa
## 1 1
## Cynthia Anderson Cynthia Anderson,Claregrieve1
## 28 2
## Cynthia Anderson,Fatima Cynthia Anderson,LGeistlinger,WikiWorks
## 2 2
## Cynthia Anderson,Lwaldron,WikiWorks Danyab56
## 1 2
## Danyab56,Aiyshaaaa,Claregrieve1 Deacme,Aiyshaaaa,Atrayees
## 5 8
## Deacme,Atrayees Dupe
## 6 1
## Dupe,Aiyshaaaa,Atrayees Dupe,Atrayees
## 1 2
## Dupe,Mcarlson Ellajessica
## 2 2
## Ellajessica,Aiyshaaaa Fatima
## 2 23
## Fatima,Aiyshaaaa Fatima,Claregrieve1
## 2 2
## Fatima,Claregrieve1,WikiWorks Fatima,Kwekuamoo,WikiWorks
## 11 2
## Fatima,LGeistlinger,WikiWorks Fatima,Lwaldron,WikiWorks
## 1 3
## Fatima,Merit,WikiWorks Fatima,WikiWorks
## 2 40
## Fatima,WikiWorks,Merit Fcuevas3
## 3 48
## Fcuevas3,Aiyshaaaa Fcuevas3,Aiyshaaaa,Atrayees
## 2 1
## Fcuevas3,Atrayees Fcuevas3,Claregrieve1
## 1 6
## Fcuevas3,Fatima Fcuevas3,Lwaldron,Aiyshaaaa
## 2 2
## Fcuevas3,Rimsha Gina
## 8 14
## Haoyanzh Haoyanzh,Lwaldron
## 19 2
## Itslanapark Itslanapark,Aiyshaaaa
## 27 5
## Itslanapark,Atrayees Itslanapark,Chloe
## 1 3
## Itslanapark,Chloe,Aiyshaaaa,Merit Itslanapark,Claregrieve1
## 1 4
## Itslanapark,Claregrieve1,Aiyshaaaa Itslanapark,Claregrieve1,Atrayees
## 1 1
## Itslanapark,Fatima Itslanapark,Fatima,Chloe,Merit
## 2 1
## Itslanapark,Rimsha Jacquelynshevin
## 1 37
## Jacquelynshevin,Chloe Jacquelynshevin,Chloe,WikiWorks
## 1 1
## Jacquelynshevin,Fatima Jeshudy
## 7 55
## Jeshudy,Aiyshaaaa Jeshudy,Claregrieve1
## 3 38
## Jeshudy,Fatima Jeshudy,Suwaiba
## 4 2
## Joyessa Joyessa,Aiyshaaaa
## 1 3
## Joyessa,Claregrieve1 Joyessa,Claregrieve1,Merit
## 18 1
## Joyessa,Fatima,Claregrieve1 Kahvecirem,Aiyshaaaa,Claregrieve1
## 2 2
## Kahvecirem,Aiyshaaaa,Merit,Claregrieve1 Kahvecirem,Atrayees
## 3 2
## Kahvecirem,Atrayees,Merit,Claregrieve1 Kahvecirem,Merit,Claregrieve1
## 1 2
## Kaluifeanyi101 Kaluifeanyi101,Aiyshaaaa
## 50 2
## Kaluifeanyi101,Atrayees Kaluifeanyi101,Claregrieve1
## 3 16
## Kaluifeanyi101,Fatima KathyWaldron,WikiWorks
## 2 3
## KathyWaldron,WikiWorks,Merit Kelvin Joseph,Atrayees
## 1 6
## Kelvin Joseph,Claregrieve1 Khadeeejah,Aiyshaaaa,Atrayees
## 2 3
## Khadeeejah,Aiyshaaaa,Chloe,Atrayees Khadeeejah,Atrayees,Chloe,Aiyshaaaa
## 1 1
## Khadeeejah,Atrayees,Claregrieve1 Kwekuamoo
## 2 22
## Kwekuamoo,Aiyshaaaa Kwekuamoo,Atrayees
## 1 3
## Kwekuamoo,Claregrieve1 Kwekuamoo,Merit,WikiWorks
## 4 1
## Levitest,WikiWorks,Merit Lorakasselman,Aiyshaaaa,Merit
## 1 1
## Lorakasselman,Chloe Lorakasselman,Claregrieve1
## 2 2
## Lwaldron Lwaldron,Atrayees,WikiWorks,Aiyshaaaa
## 11 1
## Lwaldron,Claregrieve1,WikiWorks Lwaldron,Claregrieve1,WikiWorks,Merit
## 5 2
## Lwaldron,Fatima,WikiWorks Lwaldron,WikiWorks
## 1 30
## Lwaldron,WikiWorks,LGeistlinger Lwaldron,WikiWorks,Merit
## 1 4
## Madhubani Dey Madhubani Dey,Aiyshaaaa
## 11 1
## Madhubani Dey,Atrayees Madhubani Dey,Chloe,Merit
## 2 1
## Madhubani Dey,Claregrieve1 Madhubani Dey,Fatima,Claregrieve1
## 26 2
## Madhubani Dey,Lwaldron Madhubani Dey,Merit
## 1 2
## Manuela Mary Bearkland
## 11 24
## Mary Bearkland,Aiyshaaaa Mary Bearkland,Aiyshaaaa,Claregrieve1
## 1 1
## Mary Bearkland,Claregrieve1 Mary Bearkland,Fatima
## 20 3
## Mary Bearkland,Fatima,Merit Mary Bearkland,Merit
## 1 6
## Maryemzaki,Lwaldron Merit
## 2 2
## Merit,Aiyshaaaa Merit,Claregrieve1
## 1 1
## Merit,WikiWorks Mmarin
## 18 21
## Mmarin,Atrayees Mmarin,Claregrieve1
## 1 13
## Nice25 Nnadichioma,Aiyshaaaa,Atrayees
## 1 3
## Nnadichioma,Aiyshaaaa,Merit,Atrayees Nnadichioma,Atrayees,Aiyshaaaa,Merit
## 2 1
## Ombati,Atrayees Ombati,Chloe,Atrayees
## 3 1
## Rimsha Rimsha,Fatima,LGeistlinger,WikiWorks
## 4 1
## Rimsha,Fatima,WikiWorks Rimsha,Lwaldron
## 1 1
## Samara.Khan Samara.Khan,Claregrieve1
## 38 9
## Samara.Khan,Fatima Sharmilac
## 1 5
## Sharmilac,Aiyshaaaa Sharmilac,Claregrieve1
## 1 2
## Sharmilac,Fatima Sharmilac,Fatima,Aiyshaaaa
## 6 1
## Sharmilac,Merit Sophy
## 2 3
## Sophy,Aiyshaaaa,Claregrieve1 Sophy,Atrayees
## 4 4
## Sophy,Chloe Sophy,Claregrieve1
## 3 4
## Sophy,Mcarlson,Atrayees Suwaiba
## 2 14
## Suwaiba,Atrayees Tislam
## 9 29
## Tislam,Aiyshaaaa Tislam,Aiyshaaaa,Claregrieve1
## 2 1
## Tislam,Atrayees Tislam,Claregrieve1
## 2 6
## Tislam,Fatima Tislam,Fatima,Claregrieve1
## 8 1
## Tislam,Rimsha,Claregrieve1,Merit Titas
## 2 11
## Titas,Fatima Titas,Lwaldron
## 1 1
## Ufuoma Ejite Ufuoma Ejite,Atrayees
## 1 8
## Ufuoma Ejite,Chloe,Aiyshaaaa Ufuoma Ejite,Chloe,Aiyshaaaa,Atrayees
## 1 1
## Ufuoma Ejite,Lwaldron,WikiWorks Uyokeeswaran
## 7 2
## Valentina WikiWorks
## 2 1069
## WikiWorks,Aiyshaaaa WikiWorks,Aiyshaaaa,Atrayees
## 2 1
## WikiWorks,Atrayees WikiWorks,Atrayees,Merit
## 22 4
## WikiWorks,Claregrieve1 WikiWorks,Fatima
## 13 2
## WikiWorks,Jeshudy WikiWorks,Merit
## 1 27
## WikiWorks,Merit,Atrayees WikiWorks,Rukky
## 2 3
## WikiWorks,Suwaiba Yu Wang,Fatima
## 1 1
## Yu Wang,Fatima,Claregrieve1
## 4
spl <- split(dat[["Study"]], dat[["Study design"]])
sds <- lapply(spl, unique)
sort(lengths(sds), decreasing = FALSE)
## case-control,prospective cohort
## 1
## laboratory experiment,time series / longitudinal observational
## 1
## case-control,meta-analysis
## 5
## meta-analysis
## 5
## randomized controlled trial
## 30
## laboratory experiment
## 33
## time series / longitudinal observational
## 60
## prospective cohort
## 73
## cross-sectional observational, not case-control
## 214
## case-control
## 317
Columns of the full dataset that describe experiments:
# Experiment ID
exp.cols <- c("Study", "Experiment")
# Subjects
sub.cols <- c("Host species",
"Location of subjects",
"Body site",
"Condition",
"Antibiotics exclusion",
"Group 0 sample size",
"Group 1 sample size")
# Lab analysis
lab.cols <- c("Sequencing type",
"16S variable region",
"Sequencing platform")
# Statistical analysis
stat.cols <- c("Statistical test",
"MHT correction",
"Significance threshold")
# Alpha diversity
div.cols <- c("Pielou",
"Shannon",
"Chao1",
"Simpson",
"Inverse Simpson",
"Richness")
Restrict dataset to experiment information:
Number of experiments for the top 10 categories for each subjects column:
## $`Host species`
##
## Homo sapiens Mus musculus Rattus norvegicus
## 1601 72 6
##
## $`Location of subjects`
##
## China United States of America Italy
## 504 446 72
## Spain Japan South Korea
## 53 49 45
## Finland Canada Netherlands
## 38 31 31
## Brazil
## 25
##
## $`Body site`
##
## Feces Saliva Vagina
## 980 112 52
## Uterine cervix Mouth Skin of body
## 40 30 29
## Nasopharynx Intestine Meconium
## 28 26 25
## Subgingival dental plaque
## 24
##
## $Condition
##
## obesity
## 101
## COVID-19
## 89
## colorectal cancer
## 81
## antimicrobial agent
## 77
## Parkinson's disease
## 60
## diet
## 56
## smoking behavior
## 52
## human papilloma virus infection
## 42
## cervical glandular intraepithelial neoplasia
## 38
## gastric cancer
## 36
##
## $`Antibiotics exclusion`
##
## 3 months 1 month 6 months 2 months 4 weeks
## 206 119 85 61 54
## 2 weeks None 30 days 8 weeks None specified
## 31 16 15 12 12
Proportions instead:
sub.tab <- lapply(sub.cols[1:5], tabCol, df = exps, n = 10, perc = TRUE)
names(sub.tab) <- sub.cols[1:5]
sub.tab
## $`Host species`
##
## Homo sapiens Mus musculus Rattus norvegicus
## 0.95400 0.04290 0.00357
##
## $`Location of subjects`
##
## China United States of America Italy
## 0.3010 0.2660 0.0430
## Spain Japan South Korea
## 0.0316 0.0293 0.0269
## Finland Canada Netherlands
## 0.0227 0.0185 0.0185
## Brazil
## 0.0149
##
## $`Body site`
##
## Feces Saliva Vagina
## 0.5830 0.0667 0.0310
## Uterine cervix Mouth Skin of body
## 0.0238 0.0179 0.0173
## Nasopharynx Intestine Meconium
## 0.0167 0.0155 0.0149
## Subgingival dental plaque
## 0.0143
##
## $Condition
##
## obesity
## 0.0603
## COVID-19
## 0.0532
## colorectal cancer
## 0.0484
## antimicrobial agent
## 0.0460
## Parkinson's disease
## 0.0358
## diet
## 0.0335
## smoking behavior
## 0.0311
## human papilloma virus infection
## 0.0251
## cervical glandular intraepithelial neoplasia
## 0.0227
## gastric cancer
## 0.0215
##
## $`Antibiotics exclusion`
##
## 3 months 1 month 6 months 2 months 4 weeks
## 0.1990 0.1150 0.0820 0.0589 0.0521
## 2 weeks None 30 days 8 weeks None specified
## 0.0299 0.0154 0.0145 0.0116 0.0116
Sample size:
ssize <- apply(exps[,sub.cols[6:7]], 2, summary)
ssize
## Group 0 sample size Group 1 sample size
## Min. 0.00000 1.00000
## 1st Qu. 14.00000 13.00000
## Median 27.00000 25.00000
## Mean 64.62247 78.79228
## 3rd Qu. 54.00000 47.00000
## Max. 1883.00000 9623.00000
## NA's 53.00000 50.00000
Number of experiments for the top 10 categories for each lab analysis column:
## $`Sequencing type`
##
## 16S WMS PCR
## 1527 138 2
##
## $`16S variable region`
##
## 34 4 123 12 45 345 3 56 23 1234
## 513 402 128 77 63 61 38 18 15 12
##
## $`Sequencing platform`
##
## Illumina Roche454
## 1172 237
## Ion Torrent RT-qPCR
## 95 85
## Human Intestinal Tract Chip MGISEQ-2000
## 12 11
## Sanger Mass spectrometry
## 6 5
## BGISEQ-500 Sequencing HTF-Microbi.Array
## 4 4
Proportions instead:
lab.tab <- lapply(lab.cols, tabCol, df = exps, n = 10, perc = TRUE)
names(lab.tab) <- lab.cols
lab.tab
## $`Sequencing type`
##
## 16S WMS PCR
## 0.9160 0.0828 0.0012
##
## $`16S variable region`
##
## 34 4 123 12 45 345 3 56 23 1234
## 0.37500 0.29400 0.09360 0.05630 0.04610 0.04460 0.02780 0.01320 0.01100 0.00877
##
## $`Sequencing platform`
##
## Illumina Roche454
## 0.71100 0.14400
## Ion Torrent RT-qPCR
## 0.05760 0.05150
## Human Intestinal Tract Chip MGISEQ-2000
## 0.00728 0.00667
## Sanger Mass spectrometry
## 0.00364 0.00303
## BGISEQ-500 Sequencing HTF-Microbi.Array
## 0.00243 0.00243
Number of experiments for the top 10 categories for each statistical analysis column:
## $`Statistical test`
##
## LEfSe Mann-Whitney (Wilcoxon)
## 461 455
## DESeq2 Kruskall-Wallis
## 123 108
## T-Test ANOVA
## 88 71
## Linear Regression PERMANOVA
## 49 33
## Negative Binomial Regression Metastats
## 31 28
##
## $`MHT correction`
##
## TRUE FALSE
## 784 723
##
## $`Significance threshold`
##
## 0.05 0.1 0.01 0.001 0.15 0.2 2 0.005 0 0.25
## 1424 70 44 17 16 13 11 4 3 3
Proportions instead:
stat.tab <- lapply(stat.cols, tabCol, df = exps, n = 10, perc = TRUE)
names(stat.tab) <- stat.cols
stat.tab
## $`Statistical test`
##
## LEfSe Mann-Whitney (Wilcoxon)
## 0.2800 0.2760
## DESeq2 Kruskall-Wallis
## 0.0747 0.0656
## T-Test ANOVA
## 0.0535 0.0431
## Linear Regression PERMANOVA
## 0.0298 0.0200
## Negative Binomial Regression Metastats
## 0.0188 0.0170
##
## $`MHT correction`
##
## TRUE FALSE
## 0.52 0.48
##
## $`Significance threshold`
##
## 0.05 0.1 0.01 0.001 0.15 0.2 2 0.005 0 0.25
## 0.88200 0.04340 0.02730 0.01050 0.00991 0.00805 0.00682 0.00248 0.00186 0.00186
Overall distribution:
apply(exps[,div.cols], 2, table)
## Pielou Shannon Chao1 Simpson Inverse Simpson Richness
## decreased 18 243 143 77 17 124
## increased 14 176 83 47 18 121
## unchanged 42 575 340 225 40 294
Correspondence of Shannon diversity and Richness:
table(exps$Shannon, exps$Richness)
##
## decreased increased unchanged
## decreased 61 5 20
## increased 3 48 18
## unchanged 21 31 222
Conditions with consistently increased or decreased alpha diversity:
tabDiv(exps, "Shannon", "Condition")
## increased decreased unchanged
## COVID-19 5 15 33
## gastric cancer 3 12 14
## Aging 0 7 0
## human papilloma virus infection 7 0 24
## HIV infection 1 7 10
## lung cancer 2 8 5
## urinary tract infection 0 6 3
## cesarean section 5 0 14
## Gut microbiome measurement 0 5 0
## Parkinson's disease 11 6 15
## smoking behavior 10 5 17
## acute lymphoblastic leukemia 0 4 4
## antimicrobial agent 5 9 22
## cervical cancer 4 0 4
## hypertension 4 0 2
## obesity 3 7 36
## age 2 5 2
## alcohol drinking 3 0 2
## Alzheimer's disease 0 3 2
## atopic asthma 4 1 7
## Crohn's disease 0 3 2
## periodontitis 3 0 3
## squamous cell carcinoma 3 0 3
## cervical glandular intraepithelial neoplasia 2 0 9
## colorectal cancer 6 8 24
## Eczema 0 2 10
## ethnic group 4 6 7
## food allergy 0 2 10
## schizophrenia 1 3 7
## socioeconomic status 4 2 4
## acute myeloid leukemia 2 3 2
## air pollution 7 6 3
## asthma 1 0 11
## atopic eczema 2 3 17
## chronic fatigue syndrome 0 1 4
## chronic hepatitis B virus infection 0 1 5
## diet 6 5 14
## endometriosis 2 3 17
## esophageal cancer 1 2 2
## irritable bowel syndrome 1 0 6
## milk allergic reaction 1 0 5
## multiple sclerosis 0 1 7
## pancreatic carcinoma 0 1 4
## Response to immunochemotherapy 2 1 3
## arthritis 1 1 5
## colorectal adenoma 2 2 5
## gestational diabetes 0 0 5
## hepatocellular carcinoma 0 0 6
## HIV mother to child transmission 0 0 5
## obsessive-compulsive disorder 0 0 5
## rheumatoid arthritis 3 3 4
## type II diabetes mellitus 2 2 9
tabDiv(exps, "Shannon", "Condition", perc = TRUE)
## increased decreased unchanged
## COVID-19 0.094 0.280 0.62
## gastric cancer 0.100 0.410 0.48
## Aging 0.000 1.000 0.00
## human papilloma virus infection 0.230 0.000 0.77
## HIV infection 0.056 0.390 0.56
## lung cancer 0.130 0.530 0.33
## urinary tract infection 0.000 0.670 0.33
## cesarean section 0.260 0.000 0.74
## Gut microbiome measurement 0.000 1.000 0.00
## Parkinson's disease 0.340 0.190 0.47
## smoking behavior 0.310 0.160 0.53
## acute lymphoblastic leukemia 0.000 0.500 0.50
## antimicrobial agent 0.140 0.250 0.61
## cervical cancer 0.500 0.000 0.50
## hypertension 0.670 0.000 0.33
## obesity 0.065 0.150 0.78
## age 0.220 0.560 0.22
## alcohol drinking 0.600 0.000 0.40
## Alzheimer's disease 0.000 0.600 0.40
## atopic asthma 0.330 0.083 0.58
## Crohn's disease 0.000 0.600 0.40
## periodontitis 0.500 0.000 0.50
## squamous cell carcinoma 0.500 0.000 0.50
## cervical glandular intraepithelial neoplasia 0.180 0.000 0.82
## colorectal cancer 0.160 0.210 0.63
## Eczema 0.000 0.170 0.83
## ethnic group 0.240 0.350 0.41
## food allergy 0.000 0.170 0.83
## schizophrenia 0.091 0.270 0.64
## socioeconomic status 0.400 0.200 0.40
## acute myeloid leukemia 0.290 0.430 0.29
## air pollution 0.440 0.380 0.19
## asthma 0.083 0.000 0.92
## atopic eczema 0.091 0.140 0.77
## chronic fatigue syndrome 0.000 0.200 0.80
## chronic hepatitis B virus infection 0.000 0.170 0.83
## diet 0.240 0.200 0.56
## endometriosis 0.091 0.140 0.77
## esophageal cancer 0.200 0.400 0.40
## irritable bowel syndrome 0.140 0.000 0.86
## milk allergic reaction 0.170 0.000 0.83
## multiple sclerosis 0.000 0.120 0.88
## pancreatic carcinoma 0.000 0.200 0.80
## Response to immunochemotherapy 0.330 0.170 0.50
## arthritis 0.140 0.140 0.71
## colorectal adenoma 0.220 0.220 0.56
## gestational diabetes 0.000 0.000 1.00
## hepatocellular carcinoma 0.000 0.000 1.00
## HIV mother to child transmission 0.000 0.000 1.00
## obsessive-compulsive disorder 0.000 0.000 1.00
## rheumatoid arthritis 0.300 0.300 0.40
## type II diabetes mellitus 0.150 0.150 0.69
tabDiv(exps, "Richness", "Condition")
## increased decreased unchanged
## COVID-19 4 15 17
## air pollution 14 5 4
## antimicrobial agent 1 10 5
## Parkinson's disease 13 4 3
## alcohol drinking 5 0 0
## acute lymphoblastic leukemia 5 1 0
## cervical glandular intraepithelial neoplasia 4 0 2
## atopic asthma 4 1 7
## food allergy 0 3 9
## gastric cancer 3 6 13
## HIV infection 0 3 7
## human papilloma virus infection 4 1 13
## smoking behavior 1 4 8
## colorectal adenoma 0 2 8
## endometriosis 3 1 15
## asthma 1 0 10
## atopic eczema 2 1 7
## colorectal cancer 6 5 9
## diet 3 2 6
## ethnic group 3 2 1
## irritable bowel syndrome 0 1 7
## lung cancer 0 1 6
## obesity 6 7 18
## obsessive-compulsive disorder 0 1 4
## schizophrenia 1 2 2
## cesarean section 2 2 7
## HIV mother to child transmission 0 0 5
## multiple sclerosis 0 0 9
## socioeconomic status 2 2 1
tabDiv(exps, "Richness", "Condition", perc = TRUE)
## increased decreased unchanged
## COVID-19 0.110 0.420 0.47
## air pollution 0.610 0.220 0.17
## antimicrobial agent 0.062 0.620 0.31
## Parkinson's disease 0.650 0.200 0.15
## alcohol drinking 1.000 0.000 0.00
## acute lymphoblastic leukemia 0.830 0.170 0.00
## cervical glandular intraepithelial neoplasia 0.670 0.000 0.33
## atopic asthma 0.330 0.083 0.58
## food allergy 0.000 0.250 0.75
## gastric cancer 0.140 0.270 0.59
## HIV infection 0.000 0.300 0.70
## human papilloma virus infection 0.220 0.056 0.72
## smoking behavior 0.077 0.310 0.62
## colorectal adenoma 0.000 0.200 0.80
## endometriosis 0.160 0.053 0.79
## asthma 0.091 0.000 0.91
## atopic eczema 0.200 0.100 0.70
## colorectal cancer 0.300 0.250 0.45
## diet 0.270 0.180 0.55
## ethnic group 0.500 0.330 0.17
## irritable bowel syndrome 0.000 0.120 0.88
## lung cancer 0.000 0.140 0.86
## obesity 0.190 0.230 0.58
## obsessive-compulsive disorder 0.000 0.200 0.80
## schizophrenia 0.200 0.400 0.40
## cesarean section 0.180 0.180 0.64
## HIV mother to child transmission 0.000 0.000 1.00
## multiple sclerosis 0.000 0.000 1.00
## socioeconomic status 0.400 0.400 0.20
Body sites with consistently increased or decreased alpha diversity:
tabDiv(exps, "Shannon", "Body site")
## increased decreased unchanged
## Feces 75 147 343
## Mouth 9 2 11
## Duodenum 0 6 0
## Stomach 3 9 5
## Meconium 5 0 7
## Posterior fornix of vagina 5 0 3
## Subgingival dental plaque 7 2 7
## Tongue 0 5 8
## Buccal mucosa 4 0 1
## Vagina 9 5 10
## Caecum 1 4 1
## Dental plaque 0 3 3
## Oral cavity 3 0 2
## Skin of body 4 7 6
## Uterine cervix 3 0 23
## Nasopharynx 1 3 18
## Breast 3 2 0
## Lung 1 2 7
## Oropharynx 1 2 3
## Saliva 19 20 40
## Uterus 0 1 7
## Blood 0 0 5
## Bronchus 0 0 6
## Intestine 1 1 6
## Rectum 0 0 12
tabDiv(exps, "Shannon", "Body site", perc = TRUE)
## increased decreased unchanged
## Feces 0.130 0.260 0.61
## Mouth 0.410 0.091 0.50
## Duodenum 0.000 1.000 0.00
## Stomach 0.180 0.530 0.29
## Meconium 0.420 0.000 0.58
## Posterior fornix of vagina 0.620 0.000 0.38
## Subgingival dental plaque 0.440 0.120 0.44
## Tongue 0.000 0.380 0.62
## Buccal mucosa 0.800 0.000 0.20
## Vagina 0.380 0.210 0.42
## Caecum 0.170 0.670 0.17
## Dental plaque 0.000 0.500 0.50
## Oral cavity 0.600 0.000 0.40
## Skin of body 0.240 0.410 0.35
## Uterine cervix 0.120 0.000 0.88
## Nasopharynx 0.045 0.140 0.82
## Breast 0.600 0.400 0.00
## Lung 0.100 0.200 0.70
## Oropharynx 0.170 0.330 0.50
## Saliva 0.240 0.250 0.51
## Uterus 0.000 0.120 0.88
## Blood 0.000 0.000 1.00
## Bronchus 0.000 0.000 1.00
## Intestine 0.120 0.120 0.75
## Rectum 0.000 0.000 1.00
tabDiv(exps, "Richness", "Body site")
## increased decreased unchanged
## Feces 57 70 157
## Colon 8 0 2
## Mouth 7 1 5
## Oropharynx 0 6 3
## Posterior fornix of vagina 6 0 2
## Uterine cervix 6 0 16
## Stomach 2 7 3
## Subgingival dental plaque 4 0 4
## Nasopharynx 3 5 10
## Rectum 0 2 7
## Saliva 5 7 19
## Tongue 0 2 5
## Vagina 3 1 11
## Caecum 2 3 0
## Meconium 1 2 4
## Bronchus 0 0 6
## Intestine 0 0 7
tabDiv(exps, "Richness", "Body site", perc = TRUE)
## increased decreased unchanged
## Feces 0.20 0.250 0.55
## Colon 0.80 0.000 0.20
## Mouth 0.54 0.077 0.38
## Oropharynx 0.00 0.670 0.33
## Posterior fornix of vagina 0.75 0.000 0.25
## Uterine cervix 0.27 0.000 0.73
## Stomach 0.17 0.580 0.25
## Subgingival dental plaque 0.50 0.000 0.50
## Nasopharynx 0.17 0.280 0.56
## Rectum 0.00 0.220 0.78
## Saliva 0.16 0.230 0.61
## Tongue 0.00 0.290 0.71
## Vagina 0.20 0.067 0.73
## Caecum 0.40 0.600 0.00
## Meconium 0.14 0.290 0.57
## Bronchus 0.00 0.000 1.00
## Intestine 0.00 0.000 1.00
sigs <- bugsigdbr::getSignatures(dat, tax.id.type = "metaphlan")
Number unique microbes contained in the signatures:
## [1] 2406
Development of unique microbes captured over time:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 4.000 7.477 9.000 100.000
gghistogram(lengths(sigs), bins = 30, ylab = "number of signatures",
xlab = "signature size", fill = "#00AFBB", ggtheme = theme_bw())
## [1] 1440
dat.feces <- subset(dat, `Body site` == "Feces")
cooc.mat <- microbeHeatmap(dat.feces, tax.level = "genus")
## Loading required namespace: safe
antag.mat <- microbeHeatmap(dat.feces, tax.level = "genus", antagonistic = TRUE)
Get the top 20 genera most frequently reported as differentially abundant:
sigs.feces <- getSignatures(dat.feces, tax.id.type = "taxname",
tax.level = "genus", exact.tax.level = FALSE)
top20 <- sort(table(unlist(sigs.feces)), decreasing = TRUE)[1:20]
top20
##
## Bacteroides Bifidobacterium Faecalibacterium Prevotella
## 313 216 202 197
## Streptococcus Blautia Clostridium Roseburia
## 196 173 171 170
## Ruminococcus Lactobacillus Parabacteroides Alistipes
## 168 146 137 126
## Coprococcus Dorea Eubacterium Veillonella
## 123 121 121 116
## Enterococcus Anaerostipes Lachnospira Akkermansia
## 111 104 104 103
Subset heatmaps to the top 20 genera most frequently reported as differentially abundant:
## [1] TRUE
## [1] TRUE
Distinguish by direction of abundance change (increased / decreased):
# increased
sub.dat.feces <- subset(dat.feces, `Abundance in Group 1` == "increased")
sigs.feces.up <- getSignatures(sub.dat.feces, tax.id.type = "taxname",
tax.level = "genus", exact.tax.level = FALSE)
top20.up <- table(unlist(sigs.feces.up))[names(top20)]
top20.up
##
## Bacteroides Bifidobacterium Faecalibacterium Prevotella
## 126 90 68 102
## Streptococcus Blautia Clostridium Roseburia
## 122 71 86 50
## Ruminococcus Lactobacillus Parabacteroides Alistipes
## 66 101 67 46
## Coprococcus Dorea Eubacterium Veillonella
## 37 48 49 76
## Enterococcus Anaerostipes Lachnospira Akkermansia
## 92 37 23 64
# decreased
sub.dat.feces <- subset(dat.feces, `Abundance in Group 1` == "decreased")
sigs.feces.down <- getSignatures(sub.dat.feces, tax.id.type = "taxname",
tax.level = "genus", exact.tax.level = FALSE)
top20.down <- table(unlist(sigs.feces.down))[names(top20)]
top20.down
##
## Bacteroides Bifidobacterium Faecalibacterium Prevotella
## 179 117 129 92
## Streptococcus Blautia Clostridium Roseburia
## 65 97 81 115
## Ruminococcus Lactobacillus Parabacteroides Alistipes
## 98 44 65 75
## Coprococcus Dorea Eubacterium Veillonella
## 81 68 67 36
## Enterococcus Anaerostipes Lachnospira Akkermansia
## 16 62 76 34
Plot the heatmap
# annotation
mat <- matrix(nc = 2, cbind(top20.up, top20.down))
bp <- ComplexHeatmap::anno_barplot(mat, gp = gpar(fill = c("#D55E00", "#0072B2"),
col = c("#D55E00", "#0072B2")),
height = unit(2, "cm"))
banno <- ComplexHeatmap::HeatmapAnnotation(`Abundance in Group 1` = bp)
lgd_list <- list(
Legend(labels = c("increased", "decreased"),
title = "Abundance in Group 1",
type = "grid",
legend_gp = gpar(col = c("#D55E00", "#0072B2"), fill = c("#D55E00", "#0072B2"))))
# same direction
# lcm <- sweep(cooc.mat, 2, matrixStats::colMaxs(cooc.mat), FUN = "/")
# we need to dampen the maximum here a bit down,
# otherwise 100% self co-occurrence takes up a large fraction of the colorscale,
sec <- apply(cooc.mat, 2, function(x) sort(x, decreasing = TRUE)[2])
cooc.mat2 <- cooc.mat
for(i in 1:ncol(cooc.mat2)) cooc.mat2[i,i] <- min(cooc.mat2[i,i], 1.4 * sec[i])
lcm <- sweep(cooc.mat2, 2, matrixStats::colMaxs(cooc.mat2), FUN = "/")
col <- circlize::colorRamp2(c(0,1), c("#EEEEEE", "red"))
ht1 <- ComplexHeatmap::Heatmap(lcm,
col = col,
name = "Relative frequency (top)",
cluster_columns = FALSE,
row_km = 3,
row_title = "same direction",
column_names_rot = 45,
row_names_gp = gpar(fontsize = 8),
column_names_gp = gpar(fontsize = 8))
# opposite direction
acm <- sweep(antag.mat, 2, matrixStats::colMaxs(antag.mat), FUN = "/")
col <- circlize::colorRamp2(c(0,1), c("#EEEEEE", "blue"))
ht2 <- ComplexHeatmap::Heatmap(acm,
col = col,
name = "Relative frequency (bottom)",
cluster_columns = FALSE,
row_title = "opposite direction",
row_km = 3,
column_names_rot = 45,
row_names_gp = gpar(fontsize = 8),
column_names_gp = gpar(fontsize = 8))
# phylum
sfp <- bugsigdbr::getSignatures(dat.feces, tax.id.type = "metaphlan",
tax.level = "genus", exact.tax.level = FALSE)
sfp20 <- sort(table(unlist(sfp)), decreasing = TRUE)[1:20]
uanno <- bugsigdbr::extractTaxLevel(names(sfp20),
tax.id.type = "taxname",
tax.level = "phylum",
exact.tax.level = FALSE)
phyla.grid <- seq_along(unique(uanno))
panno <- ComplexHeatmap::HeatmapAnnotation(phylum = uanno)
uanno <- matrix(uanno, nrow = 1)
colnames(uanno) <- names(top20)
pcols <- c("#CC79A7", "#F0E442", "#009E73", "#56B4E9", "#E69F00")
uanno <- ComplexHeatmap::Heatmap(uanno, name = "Phylum",
col = pcols[phyla.grid],
cluster_columns = FALSE,
column_names_rot = 45,
column_names_gp = gpar(fontsize = 8))
# put everything together
ht_list <- ht1 %v% banno %v% ht2 %v% uanno
ComplexHeatmap::draw(ht_list, annotation_legend_list = lgd_list, merge_legend = TRUE)
decorate_annotation("Abundance in Group 1", {
grid.text("# signatures", x = unit(-1, "cm"), rot = 90, just = "bottom", gp = gpar(fontsize = 8))
grid.text("*", x = unit(2.45, "cm"), y = unit(1.2, "cm"))
grid.text("*", x = unit(5.18, "cm"), y = unit(1, "cm"))
grid.text("*", x = unit(6.55, "cm"), y = unit(0.95, "cm"))
grid.text("*", x = unit(8.6, "cm"), y = unit(0.85, "cm"))
grid.text("*", x = unit(10, "cm"), y = unit(0.7, "cm"))
grid.text("*", x = unit(10.7, "cm"), y = unit(0.7, "cm"))
})
Inspect signature similarity for signatures from stomach samples based on Jaccard index:
stomachsub <- subset(dat, `Body site` == "Stomach")
sigsub <- bugsigdbr::getSignatures(stomachsub)
pair.jsim <- calcJaccardSimilarity(sigsub)
Create a dendrogram of Jaccard dissimilarities (1.0 has no overlap, 0.0 are identical signatures).