vignettes/articles/validation.Rmd
validation.Rmd
Validation scripts and outputs are in: https://github.com/waldronlab/taxPProValidation/tree/main
Column | Description |
---|---|
method | The R package used for the analysis. Either phytools or castor. ltp stands for Living Tree Project. Phytools was used for discrete attributes. castor was used for numeric attributes. |
rank | Rank of the input annotated taxa used. ‘all’ means no filter, so the data could contain genus, species, and strains, but possibly just one or two of these ranks. |
physiology | Name of the physiology in bugphyzz. |
attribute | Name of the attribute. In the case of numeric attributes, it’s the same as the name of the physiology. |
mcc_mean and mcc_sd | Mean and standard deviation of the Matthews correlation coefficient values. It’s the result of 10-fold cross-validation. These values only apply to discrete attributes. |
r_squared_mean and r_squared_sd | Mean and standard deviation of the R-squared values. It’s the result of 10-fold cross-validation. These values only apply to numeric attributes. |
ltp_and_bp | Number of annotations in both the LTP tree and a given subset of a bugphyzz dataset (per rank and attribute). |
ltp_and_bp_phys | Number of annotations in both the LTP tree and a given set of a bugphyzz physiology (adding up all attributes). |
bugphyzz | Number of taxa with annotations in a given subset of a bugphyzz dataset (per rank and attribute). |
ltp | Total number of tips in the LTP tree. These tips include a mix of genera, species, and strains. |
nsti_mean and nsti_sd | Mean and standard deviation of NSTI values of unknown tips. Lower is better. |
# Summary table of the validation results.
# Only rows with rank == "all" are kept, the metric columns are exposed under
# reader-friendly names, and rows are ordered by descending MCC mean, then
# descending R-squared mean, with method / attribute / attribute value as
# tie-breakers. `method` is needed for the ordering only, so it is dropped
# once the rows are sorted.
output <- dat |>
  filter(rank == "all") |>
  select(
    method,
    Attribute = physiology,
    `Attribute value` = attribute,
    `MCC mean` = mcc_mean,
    `MCC sd` = mcc_sd,
    `R-squared mean` = r2_mean,
    `R-squared sd` = r2_sd,
    `No. annotated tips per attribute value` = ltp_bp,
    `No. annotated tips per attribute` = ltp_bp_phys,
    `Percent of annotated tips per attribute value` = ltp_bp_per,
    `Percent of annotated tips per attribute` = ltp_bp_phys_per
  ) |>
  arrange(
    desc(`MCC mean`),
    desc(`R-squared mean`),
    method,
    Attribute,
    `Attribute value`
  ) |>
  select(!method)

# Render every row on a single page.
myDataTable(output, page_len = nrow(output))
# Locate the attributes table shipped with the bugphyzz package.
# mustWork = TRUE makes a missing package/file fail right here with a clear
# error; without it system.file() returns "" and the failure only shows up
# later, inside read_tsv(), with a less helpful message.
fname <- system.file(
  "extdata", "attributes.tsv", package = "bugphyzz", mustWork = TRUE
)
o <- readr::read_tsv(fname, show_col_types = FALSE)

# Reduce the `ontology` column to a sorted, de-duplicated, NA-free list of
# prefixes.
o2 <- o |>
  mutate(
    # Replace "_" with ":" unless the underscore is immediately preceded by
    # "CO" or immediately followed by "<digits>:<digits>".
    ontology = gsub("(?<!CO)_(?!(\\d+:\\d+))", ":", ontology, perl = TRUE),
    # Keep only the text before the last ":" (the leading group is greedy);
    # values without a ":" are left unchanged.
    ontology = sub("^(.*):.*$", "\\1", ontology)
  ) |>
  filter(!is.na(ontology)) |>
  distinct(ontology) |>
  arrange(ontology)
# Ontology prefix -> human-readable description.
# Prefixes that are not listed here get an NA description.
ontology_labels <- c(
  APO = "obophenotype/ascomycete-phenotype-ontology",
  ARO = "Antibiotic Resistance Ontology",
  BTO = "The BRENDA Tissue Ontology",
  CO_320 = "The Planteome Project",
  CO_331 = "The Planteome Project",
  CO_345 = "The Planteome Project",
  CO_357 = "The Planteome Project",
  ECOCORE = "An ontology of core ecological entities",
  EFO = "Experimental Factor Ontology",
  EHDAA2 = "Ontobee/Human developmental anatomy, abstract",
  ENM = "eNanoMapper ontology",
  ENVO = "The Environment Ontology",
  ERO = "Eagle-I Research Resource Ontology",
  FMA = "The Foundational Model of Anatomy",
  FOODON = "Ontobee/Food Ontology",
  GO = "Gene Ontology",
  IDO = "Infectious Disease Ontology",
  IDOMAL = "Malaria Ontology",
  MCO = "Microbial Conditions Ontology",
  MICRO = "Ontology of Prokaryotic Phenotypic and Metabolic Characters",
  MONDO = "Mondo Disease Ontology",
  MP = "The Mammalian Phenotype Ontology",
  NCBITaxon = "NCBI organismal classification",
  NCIT = "NCI Thesaurus OBO Edition",
  OBI = "Ontology for Biomedical Investigations",
  OHMI = "OHMI: Ontology of Host-Microbiome Interactions",
  OMIT = "Ontology for MIRNA Target",
  OMP = "Ontology of Microbial Phenotypes",
  Orphanet = "Orphanet",
  PATO = "PATO - the Phenotype And Trait Ontology",
  PHIPO = "Pathogen Host Interactions Phenotype Ontology",
  PO = "Plant Ontology",
  SIO = "Ontobee/Semanticscience Integrated Ontology",
  SNOMED = "SNOMED CT (International Edition)",
  SPD = "The Spider Anatomy Ontology",
  SYMP = "Symptom Ontology",
  UBERON = "Uber-anatomy ontology",
  UPa = "Unipathway",
  XCO = "Experimental condition ontology",
  ZFA = "Zebrafish Anatomy Ontology (ZFA)"
)

# Attach a description to every ontology prefix, then fold all "CO_*"
# prefixes into a single "CO" entry. The CO_* prefixes share one description,
# so distinct() leaves a single CO row.
o3 <- o2 |>
  mutate(
    # Exact-name lookup; unname() drops the names the lookup carries over.
    Description = unname(ontology_labels[ontology])
  ) |>
  mutate(
    ontology = if_else(grepl("^(CO_).*$", ontology), "CO", ontology)
  ) |>
  distinct() |>
  dplyr::rename(Ontology = ontology)

myDataTable(o3)
#> ─ Session info ───────────────────────────────────────────────────────────────
#> setting value
#> version R version 4.4.1 (2024-06-14)
#> os Ubuntu 22.04.4 LTS
#> system x86_64, linux-gnu
#> ui X11
#> language en
#> collate en_US.UTF-8
#> ctype en_US.UTF-8
#> tz Etc/UTC
#> date 2024-06-28
#> pandoc 3.2 @ /usr/bin/ (via rmarkdown)
#>
#> ─ Packages ───────────────────────────────────────────────────────────────────
#> package * version date (UTC) lib source
#> abind 1.4-5 2016-07-21 [1] RSPM (R 4.4.0)
#> Biobase * 2.64.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)
#> BiocGenerics * 0.50.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)
#> bit 4.0.5 2022-11-15 [1] RSPM (R 4.4.0)
#> bit64 4.0.5 2020-08-30 [1] RSPM (R 4.4.0)
#> bslib 0.7.0 2024-03-29 [1] RSPM (R 4.4.0)
#> bugphyzzAnalyses * 0.1.19 2024-06-28 [1] local
#> cachem 1.1.0 2024-05-16 [1] RSPM (R 4.4.0)
#> cli 3.6.3 2024-06-21 [1] RSPM (R 4.4.0)
#> colorspace 2.1-0 2023-01-23 [1] RSPM (R 4.4.0)
#> crayon 1.5.3 2024-06-20 [1] RSPM (R 4.4.0)
#> crosstalk 1.2.1 2023-11-23 [1] RSPM (R 4.4.0)
#> DelayedArray 0.30.1 2024-05-07 [1] Bioconductor 3.19 (R 4.4.1)
#> desc 1.4.3 2023-12-10 [1] RSPM (R 4.4.0)
#> digest 0.6.36 2024-06-23 [1] RSPM (R 4.4.0)
#> dplyr * 1.1.4 2023-11-17 [1] RSPM (R 4.4.0)
#> DT * 0.33 2024-04-04 [1] RSPM (R 4.4.0)
#> evaluate 0.24.0 2024-06-10 [1] RSPM (R 4.4.0)
#> fansi 1.0.6 2023-12-08 [1] RSPM (R 4.4.0)
#> fastmap 1.2.0 2024-05-15 [1] RSPM (R 4.4.0)
#> fs 1.6.4 2024-04-25 [1] RSPM (R 4.4.0)
#> generics 0.1.3 2022-07-05 [1] RSPM (R 4.4.0)
#> GenomeInfoDb * 1.40.1 2024-05-24 [1] Bioconductor 3.19 (R 4.4.1)
#> GenomeInfoDbData 1.2.12 2024-06-25 [1] Bioconductor
#> GenomicRanges * 1.56.1 2024-06-12 [1] Bioconductor 3.19 (R 4.4.1)
#> ggplot2 * 3.5.1 2024-04-23 [1] RSPM (R 4.4.0)
#> ggrepel * 0.9.5 2024-01-10 [1] RSPM (R 4.4.0)
#> glue 1.7.0 2024-01-09 [1] RSPM (R 4.4.0)
#> gtable 0.3.5 2024-04-22 [1] RSPM (R 4.4.0)
#> hms 1.1.3 2023-03-21 [1] RSPM (R 4.4.0)
#> htmltools 0.5.8.1 2024-04-04 [1] RSPM (R 4.4.0)
#> htmlwidgets 1.6.4 2023-12-06 [1] RSPM (R 4.4.0)
#> httr 1.4.7 2023-08-15 [1] RSPM (R 4.4.0)
#> IRanges * 2.38.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)
#> jquerylib 0.1.4 2021-04-26 [1] RSPM (R 4.4.0)
#> jsonlite 1.8.8 2023-12-04 [1] RSPM (R 4.4.0)
#> knitr 1.47 2024-05-29 [1] RSPM (R 4.4.0)
#> lattice 0.22-6 2024-03-20 [2] CRAN (R 4.4.1)
#> lifecycle 1.0.4 2023-11-07 [1] RSPM (R 4.4.0)
#> magrittr 2.0.3 2022-03-30 [1] RSPM (R 4.4.0)
#> Matrix 1.7-0 2024-04-26 [2] CRAN (R 4.4.1)
#> MatrixGenerics * 1.16.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)
#> matrixStats * 1.3.0 2024-04-11 [1] RSPM (R 4.4.0)
#> memoise 2.0.1 2021-11-26 [1] RSPM (R 4.4.0)
#> munsell 0.5.1 2024-04-01 [1] RSPM (R 4.4.0)
#> pillar 1.9.0 2023-03-22 [1] RSPM (R 4.4.0)
#> pkgconfig 2.0.3 2019-09-22 [1] RSPM (R 4.4.0)
#> pkgdown 2.0.9 2024-04-18 [1] RSPM (R 4.4.0)
#> purrr 1.0.2 2023-08-10 [1] RSPM (R 4.4.0)
#> R6 2.5.1 2021-08-19 [1] RSPM (R 4.4.0)
#> ragg 1.3.2 2024-05-15 [1] RSPM (R 4.4.0)
#> Rcpp 1.0.12 2024-01-09 [1] RSPM (R 4.4.0)
#> readr 2.1.5 2024-01-10 [1] RSPM (R 4.4.0)
#> rlang 1.1.4 2024-06-04 [1] RSPM (R 4.4.0)
#> rmarkdown 2.27 2024-05-17 [1] RSPM (R 4.4.0)
#> S4Arrays 1.4.1 2024-05-20 [1] Bioconductor 3.19 (R 4.4.1)
#> S4Vectors * 0.42.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)
#> sass 0.4.9 2024-03-15 [1] RSPM (R 4.4.0)
#> scales 1.3.0 2023-11-28 [1] RSPM (R 4.4.0)
#> sessioninfo 1.2.2 2021-12-06 [1] RSPM (R 4.4.0)
#> SparseArray 1.4.8 2024-05-24 [1] Bioconductor 3.19 (R 4.4.1)
#> SummarizedExperiment * 1.34.0 2024-05-01 [1] Bioconductor 3.19 (R 4.4.1)
#> systemfonts 1.1.0 2024-05-15 [1] RSPM (R 4.4.0)
#> textshaping 0.4.0 2024-05-24 [1] RSPM (R 4.4.0)
#> tibble 3.2.1 2023-03-20 [1] RSPM (R 4.4.0)
#> tidyselect 1.2.1 2024-03-11 [1] RSPM (R 4.4.0)
#> tzdb 0.4.0 2023-05-12 [1] RSPM (R 4.4.0)
#> UCSC.utils 1.0.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)
#> utf8 1.2.4 2023-10-22 [1] RSPM (R 4.4.0)
#> vctrs 0.6.5 2023-12-01 [1] RSPM (R 4.4.0)
#> vroom 1.6.5 2023-12-05 [1] RSPM (R 4.4.0)
#> withr 3.0.0 2024-01-16 [1] RSPM (R 4.4.0)
#> xfun 0.45 2024-06-16 [1] RSPM (R 4.4.0)
#> XVector 0.44.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)
#> yaml 2.3.8 2023-12-11 [1] RSPM (R 4.4.0)
#> zlibbioc 1.50.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)
#>
#> [1] /usr/local/lib/R/site-library
#> [2] /usr/local/lib/R/library
#>
#> ──────────────────────────────────────────────────────────────────────────────