healthysig.Rmd
Compare genus abundance change (increased/decreased) from all diseased subjects in BugsigDB with existing typical genus from healthy volunteers recorded in BugsigDB Study 562 typical genus were set at prevalence level=0, 50, and 70 Using Feces sample as an example
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: ggplot2
full.dat <- bugsigdbr::importBugSigDB(version = "devel", cache=FALSE)
#Stripping empty signatures:
ind1 <- lengths(full.dat[["MetaPhlAn taxon names"]]) > 0
ind2 <- lengths(full.dat[["NCBI Taxonomy IDs"]]) > 0
dat <- full.dat[ind1 & ind2,]
#prevalence threshold=50%
bugs50<- (dat) %>%
filter (Study=='Study 562') %>%
filter (Experiment=="Experiment 3" & `Signature page name`=='Signature 1') %>%
filter (`Body site` == "Feces") %>%
filter (grepl("genus",Description))
healthysig50 <- bugsigdbr::getSignatures(bugs50)
head(healthysig50)
## $`bsdb:562/3/1_Health-study-participation:feces-from-healthy-adult_vs_none---50%-prevalence-threshold_NA`
## [1] "447020" "2048137" "239934" "239759" "2569097" "207244" "816"
## [8] "397864" "1678" "35832" "572511" "574697" "1485" "102106"
## [15] "33042" "189330" "561" "1730" "216851" "946234" "1407607"
## [22] "204475" "644652" "1505657" "28050" "2172004" "2316020" "283168"
## [29] "459786" "375288" "577310" "33024" "909656" "838" "841"
## [36] "1263" "1905344" "1301"
#prevalence threshold=0
bugs0<- (dat) %>%
filter (Study=='Study 562') %>%
filter (Experiment=="Experiment 10") %>%
filter (`Body site` == "Feces") %>%
filter (grepl("genus",Description))
healthysig0 <- bugsigdbr::getSignatures(bugs0)
head(healthysig0)
#prevalence threshold=70%
bugs70<- (dat) %>%
filter (Study=='Study 562') %>%
filter (Experiment=="Experiment 9" & `Signature page name`=='Signature 1') %>%
filter (`Body site` == "Feces") %>%
filter (grepl("genus",Description))
healthysig70 <- bugsigdbr::getSignatures(bugs70)
head(healthysig70)
## $`bsdb:562/9/1_Health-study-participation:feces-from-healthy-adult_vs_none---70%-prevalence-threshold_NA`
## [1] "447020" "2048137" "239759" "2569097" "207244" "816" "1678"
## [8] "572511" "1485" "102106" "33042" "189330" "1730" "216851"
## [15] "946234" "1407607" "204475" "28050" "2316020" "283168" "459786"
## [22] "375288" "909656" "841" "1263" "1301" "1905344"
bugdisease_increased <- dat %>%
filter (`Host species` == 'Homo sapiens') %>%
filter (`Body site` == 'Feces') %>%
filter (`Abundance in Group 1`=="increased")
#get genus level sig change in abundance only
diseasesig_increased <- bugsigdbr::getSignatures(bugdisease_increased, tax.level='genus')
length(diseasesig_increased)
## [1] 1178
regexp <- "[[:digit:]]+"
bugdisease_increased2<-bugdisease_increased%>%mutate(
study_id=stringr::str_extract(Study, regexp),
experiment_id=stringr::str_extract(Experiment, regexp),
signature_id=stringr::str_extract(`Signature page name`, regexp)
)
diseasesig_increased2 <- purrr::map_df(diseasesig_increased, ~as.data.frame(.x), .id="id") %>%
# tidyr::pivot_wider(id_cols=id) %>%
mutate(name2_1=stringr::str_replace(id, "bsdb:", "")) %>%
mutate(name2_2=(stringr::str_split(name2_1,"/"))) %>%
mutate(study_id= purrr::map_chr(name2_2, 1),
experiment_id= purrr::map_chr(name2_2, 2),
signature_id= purrr::map_chr(stringr::str_split(purrr::map_chr(name2_2, 3),'_'),1)
) %>%
dplyr::select(-c("name2_1","name2_2"))
#exclude studies does not return genus level abundance change
bugall_increased <- merge(x = bugdisease_increased2, y = diseasesig_increased2,
by =c('study_id','experiment_id','signature_id'), all.x = TRUE)
bugdisease_decreased <- dat %>%
filter (`Host species` == 'Homo sapiens') %>%
filter (`Body site` == 'Feces') %>%
filter (`Abundance in Group 1`=="decreased")
#get genus level sig change in abundance only
diseasesig_decreased <- bugsigdbr::getSignatures(bugdisease_decreased, tax.level='genus')
length(diseasesig_decreased)
## [1] 1133
regexp <- "[[:digit:]]+"
bugdisease_decreased2<-bugdisease_decreased%>%mutate(
study_id=stringr::str_extract(Study, regexp),
experiment_id=stringr::str_extract(Experiment, regexp),
signature_id=stringr::str_extract(`Signature page name`, regexp)
)
diseasesig_decreased2 <- purrr::map_df(diseasesig_decreased, ~as.data.frame(.x), .id="id") %>%
# tidyr::pivot_wider(id_cols=id) %>%
mutate(name2_1=stringr::str_replace(id, "bsdb:", "")) %>%
mutate(name2_2=(stringr::str_split(name2_1,"/"))) %>%
mutate(study_id= purrr::map_chr(name2_2, 1),
experiment_id= purrr::map_chr(name2_2, 2),
signature_id= purrr::map_chr(stringr::str_split(purrr::map_chr(name2_2, 3),'_'),1)
) %>%
dplyr::select(-c("name2_1","name2_2"))
#exclude studies does not return genus level abundance change
bugall_decreased <- merge(x = diseasesig_decreased2, y = bugdisease_decreased2,
by =c('study_id','experiment_id','signature_id'), all.x = TRUE)
#this code didn’t work because experiment 10 has no signatures,
therefore line 130 - 180 were commented out
{r overlap_prevelance=0} #
{r overlap_prevelance=0}
experiment 10 has no signatures
library(purrr) list0_decreased <- c(healthysig0, diseasesig_decreased) paircomp0_decreased <- calcPairwiseOverlaps(list0_decreased)
paircheck0_decreased <- paircomp0_decreased %>% filter(stringr::str_detect(name1, ‘feces-from-healthy-adult_vs_none—0%-prevalence-threshold’)) %>% mutate(name2_1=stringr::str_replace(name2, “bsdb:”, ““)) %>% mutate(name2_2=(stringr::str_split(name2_1,”/“))) %>% mutate(study_id= purrr::map_chr(name2_2, 1), experiment_id= purrr::map_chr(name2_2, 2), signature_id= purrr::map_chr(stringr::str_split(purrr::map_chr(name2_2, 3),’_’),1) )
list0_increased <- c(healthysig0, diseasesig_increased) paircomp0_increased <- calcPairwiseOverlaps(list0_increased)
paircheck0_increased <- paircomp0_increased %>% filter(stringr::str_detect(name1, ‘feces-from-healthy-adult_vs_none—0%-prevalence-threshold’)) %>% mutate(name2_1=stringr::str_replace(name2, “bsdb:”, ““)) %>% mutate(name2_2=(stringr::str_split(name2_1,”/“))) %>% mutate(study_id= purrr::map_chr(name2_2, 1), experiment_id= purrr::map_chr(name2_2, 2), signature_id= purrr::map_chr(stringr::str_split(purrr::map_chr(name2_2, 3),’_’),1) )
# merge jacard scores with disease study data
The issue why I need to merge overlap score back with all disease budsigdb studies is that calcPairwiseOverlaps drops all 0 intersect pairs
#```{r merge overlap scores with all disease study data}
overlap0_decreased<- merge(x = bugall_decreased, y = paircheck0_decreased, by =c('study_id','experiment_id','signature_id'), all.x = TRUE) %>%
mutate(type="decreased")%>%
filter(length_set2>4) #keep only bugsigdb studies that tested more than 4 increased or decreased
overlap0_decreased[is.na(overlap0_decreased)] <- 0
overlap0_increased<- merge(x = bugall_increased, y = paircheck0_increased, by =c('study_id','experiment_id','signature_id'), all.x = TRUE) %>%
mutate(type="increased") %>%
filter(length_set2 > 4) #keep only bugsigdb studies that tested more than 4 increased or decreased
overlap0_increased[is.na(overlap0_increased)] <- 0
#{r comparison-increased_vs_decrease}
t.test(overlap0_increasedoverlap)
wilcox.test(overlap0_increasedoverlap)
p <- rbind(overlap0_increased,overlap0_decreased) %>% ggplot( aes(x=overlap, fill=type)) + geom_histogram( color=“#e9ecef”, alpha=0.6, position = ‘dodge’,bins=20) p
``` r
library(purrr)
list50_decreased <- c(healthysig50, diseasesig_decreased)
paircomp50_decreased <- calcPairwiseOverlaps(list50_decreased)
paircheck50_decreased <- paircomp50_decreased %>%
filter(stringr::str_detect(name1, 'feces-from-healthy-adult_vs_none---50%-prevalence-threshold')) %>%
mutate(name2_1=stringr::str_replace(name2, "bsdb:", "")) %>%
mutate(name2_2=(stringr::str_split(name2_1,"/"))) %>%
mutate(study_id= purrr::map_chr(name2_2, 1),
experiment_id= purrr::map_chr(name2_2, 2),
signature_id= purrr::map_chr(stringr::str_split(purrr::map_chr(name2_2, 3),'_'),1)
)
list50_increased <- c(healthysig50, diseasesig_increased)
paircomp50_increased <- calcPairwiseOverlaps(list50_increased)
paircheck50_increased <- paircomp50_increased %>%
filter(stringr::str_detect(name1, 'feces-from-healthy-adult_vs_none---50%-prevalence-threshold')) %>%
mutate(name2_1=stringr::str_replace(name2, "bsdb:", "")) %>%
mutate(name2_2=(stringr::str_split(name2_1,"/"))) %>%
mutate(study_id= purrr::map_chr(name2_2, 1),
experiment_id= purrr::map_chr(name2_2, 2),
signature_id= purrr::map_chr(stringr::str_split(purrr::map_chr(name2_2, 3),'_'),1)
)
overlap50_decreased<- merge(x = bugall_decreased, y = paircheck50_decreased, by =c('study_id','experiment_id','signature_id'), all.x = TRUE) %>%
mutate(type="decreased")%>%
filter(length_set2>4) #keep only bugsigdb studies that tested more than 4 increased or decreased
overlap50_decreased[is.na(overlap50_decreased)] <- 0
overlap50_increased<- merge(x = bugall_increased, y = paircheck50_increased, by =c('study_id','experiment_id','signature_id'), all.x = TRUE) %>%
mutate(type="increased") %>%
filter(length_set2 > 4) #keep only bugsigdb studies that tested more than 4 increased or decreased
overlap50_increased[is.na(overlap50_increased)] <- 0
t.test(overlap50_increased$overlap, overlap50_decreased$overlap)
##
## Welch Two Sample t-test
##
## data: overlap50_increased$overlap and overlap50_decreased$overlap
## t = -21.474, df = 7227.8, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.1155067 -0.0961821
## sample estimates:
## mean of x mean of y
## 0.3849731 0.4908175
wilcox.test(overlap50_increased$overlap, overlap50_decreased$overlap)
##
## Wilcoxon rank sum test with continuity correction
##
## data: overlap50_increased$overlap and overlap50_decreased$overlap
## W = 4683276, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
p <- rbind(overlap50_increased,overlap50_decreased) %>%
ggplot( aes(x=overlap, fill=type)) +
geom_histogram( color="#e9ecef", alpha=0.6, position = 'dodge',bins=20)
p
library(purrr)
list70_decreased <- c(healthysig70, diseasesig_decreased)
paircomp70_decreased <- calcPairwiseOverlaps(list70_decreased)
paircheck70_decreased <- paircomp70_decreased %>%
filter(stringr::str_detect(name1, 'feces-from-healthy-adult_vs_none---70%-prevalence-threshold')) %>%
mutate(name2_1=stringr::str_replace(name2, "bsdb:", "")) %>%
mutate(name2_2=(stringr::str_split(name2_1,"/"))) %>%
mutate(study_id= purrr::map_chr(name2_2, 1),
experiment_id= purrr::map_chr(name2_2, 2),
signature_id= purrr::map_chr(stringr::str_split(purrr::map_chr(name2_2, 3),'_'),1)
)
list70_increased <- c(healthysig70, diseasesig_increased)
paircomp70_increased <- calcPairwiseOverlaps(list70_increased)
paircheck70_increased <- paircomp70_increased %>%
filter(stringr::str_detect(name1, 'feces-from-healthy-adult_vs_none---70%-prevalence-threshold')) %>%
mutate(name2_1=stringr::str_replace(name2, "bsdb:", "")) %>%
mutate(name2_2=(stringr::str_split(name2_1,"/"))) %>%
mutate(study_id= purrr::map_chr(name2_2, 1),
experiment_id= purrr::map_chr(name2_2, 2),
signature_id= purrr::map_chr(stringr::str_split(purrr::map_chr(name2_2, 3),'_'),1)
)
overlap70_decreased<- merge(x = bugall_decreased, y = paircheck70_decreased, by =c('study_id','experiment_id','signature_id'), all.x = TRUE) %>%
mutate(type="decreased")%>%
filter(length_set2>4) #keep only bugsigdb studies that tested more than 4 increased or decreased
overlap70_decreased[is.na(overlap70_decreased)] <- 0
overlap70_increased<- merge(x = bugall_increased, y = paircheck70_increased, by =c('study_id','experiment_id','signature_id'), all.x = TRUE) %>%
mutate(type="increased") %>%
filter(length_set2 > 4) #keep only bugsigdb studies that tested more than 4 increased or decreased
overlap70_increased[is.na(overlap70_increased)] <- 0
t.test(overlap70_increased$overlap, overlap70_decreased$overlap)
wilcox.test(overlap70_increased$overlap, overlap70_decreased$overlap)
p <- rbind(overlap70_increased,overlap70_decreased) %>%
ggplot( aes(x=overlap, fill=type)) +
geom_histogram( color="#e9ecef", alpha=0.6, position = 'dodge',bins=20)
p