Import validation data

Validation scripts and outputs are in: https://github.com/waldronlab/taxPProValidation/tree/main

Columns description

Column Description
method The R package used for the analysis. Either phytools or castor. ltp stands for Living Tree Project. Phytools was used for discrete attributes. castor was used for numeric attributes.
rank Rank of the input annotated taxa used. ‘all’ means no filter, so the data could contain genera, species, and strains, but possibly just one or two of these ranks.
physiology Name of the physiology in bugphyzz.
attribute Name of the attribute. In the case of numeric attributes, it’s the same as the name of the physiology.
mcc_mean and mcc_sd Mean and standard deviation of the Matthews correlation coefficient values. It’s the result of 10-fold cross-validation. These values only apply to discrete attributes.
r_squared_mean and r_squared_sd Mean and standard deviation of the R-squared values. It’s the result of 10-fold cross-validation. These values only apply to numeric attributes.
ltp_and_bp Number of annotations in both the LTP tree and a given subset of a bugphyzz dataset (per rank and attribute).
ltp_and_bp_phys Number of annotations in both the LTP tree and a given set of a bugphyzz physiology (adding up all attributes).
bugphyzz Number of taxa with annotations in a given subset of a bugphyzz dataset (per rank and attribute).
ltp Total number of tips in the LTP tree. These tips include a mix of genera, species, and strains.
nsti_mean and nsti_sd Mean and standard deviation of NSTI values of unknown tips. Lower is better.
# Build the summary table shown to the reader: keep only the rows computed
# with rank == "all", give the columns display-friendly names, and order the
# rows from best to worst score.
output <- dat |>
    filter(rank == "all") |>
    select(
        # `method` is only needed as a sort key; it is dropped again below.
        method,
        Attribute = physiology,
        `Attribute value` = attribute,
        `MCC mean` = mcc_mean,
        `MCC sd` = mcc_sd,
        `R-squared mean` = r2_mean,
        `R-squared sd` = r2_sd,
        `No. annotated tips per attribute value` = ltp_bp,
        `No. annotated tips per attribute` = ltp_bp_phys,
        `Percent of annotated tips per attribute value` = ltp_bp_per,
        `Percent of annotated tips per attribute` = ltp_bp_phys_per
    ) |>
    arrange(
        # Highest scores first; remaining keys break ties.
        desc(`MCC mean`),
        desc(`R-squared mean`),
        method,
        Attribute,
        `Attribute value`
    ) |>
    select(!method)

# Show every row on a single page of the interactive table.
myDataTable(output, page_len = nrow(output))

Ontology data

# Locate the attributes table shipped with the bugphyzz package.
# mustWork = TRUE makes a missing file fail here with a clear error instead
# of handing an empty path ("") to read_tsv().
fname <- system.file(
    "extdata", "attributes.tsv", package = "bugphyzz", mustWork = TRUE
)
o <- readr::read_tsv(fname, show_col_types = FALSE)

# Reduce every ontology term ID to its ontology prefix.
o2 <- o |>
    # Step 1: turn the "_" that separates prefix and local ID into ":".
    #   * (?<!\\bCO) leaves the underscore that directly follows a standalone
    #     "CO" prefix (e.g. "CO_320") alone. The word boundary keeps this
    #     exception from also applying to prefixes that merely end in "CO"
    #     (e.g. "MCO_", "XCO_"), which a plain (?<!CO) would skip.
    #   * (?!(\\d+:\\d+)) leaves an underscore alone when it is followed by
    #     "<digits>:<digits>".
    mutate(
        ontology = gsub(
            "(?<!\\bCO)_(?!(\\d+:\\d+))", ":", ontology, perl = TRUE
        )
    ) |>
    # Step 2: keep only the text before the last ":" (the match is greedy).
    mutate(ontology = sub("^(.*):.*$", "\\1", ontology)) |>
    select(ontology) |>
    filter(!is.na(ontology)) |>
    arrange(ontology) |>
    distinct()

# Attach a human-readable description to each prefix. Prefixes not listed
# here get NA as their description.
o3 <- o2 |>
    mutate(
        Description = case_when(
            ontology == "APO" ~ "obophenotype/ascomycete-phenotype-ontology",
            ontology == "ARO" ~ "Antibiotic Resistance Ontology",
            ontology == "BTO" ~ "The BRENDA Tissue Ontology",
            ontology == "CO_320" ~ "The Planteome Project",
            ontology == "CO_331" ~ "The Planteome Project",
            ontology == "CO_345" ~ "The Planteome Project",
            ontology == "CO_357" ~ "The Planteome Project",
            ontology == "ECOCORE" ~ "An ontology of core ecological entities",
            ontology == "EFO" ~ "Experimental Factor Ontology",
            ontology == "EHDAA2" ~ "Ontobee/Human developmental anatomy, abstract",
            ontology == "ENM" ~ "eNanoMapper ontology",
            ontology == "ENVO" ~ "The Environment Ontology",
            ontology == "ERO" ~ "Eagle-I Research Resource Ontology",
            ontology == "FMA" ~ "The Foundational Model of Anatomy",
            ontology == "FOODON" ~ "Ontobee/Food Ontology",
            ontology == "GO" ~ "Gene Ontology",
            ontology == "IDO" ~ "Infectious Disease Ontology",
            ontology == "IDOMAL" ~ "Malaria Ontology",
            ontology == "MCO" ~ "Microbial Conditions Ontology",
            ontology == "MICRO" ~ "Ontology of Prokaryotic Phenotypic and Metabolic Characters",
            ontology == "MONDO" ~ "Mondo Disease Ontology",
            ontology == "MP" ~ "The Mammalian Phenotype Ontology",
            ontology == "NCBITaxon" ~ "NCBI organismal classification",
            ontology == "NCIT" ~ "NCI Thesaurus OBO Edition",
            ontology == "OBI" ~ "Ontology for Biomedical Investigations",
            ontology == "OHMI" ~ "OHMI: Ontology of Host-Microbiome Interactions",
            ontology == "OMIT" ~ "Ontology for MIRNA Target",
            ontology == "OMP" ~ "Ontology of Microbial Phenotypes",
            ontology == "Orphanet" ~ "Orphanet",
            ontology == "PATO" ~ "PATO - the Phenotype And Trait Ontology",
            ontology == "PHIPO" ~ "Pathogen Host Interactions Phenotype Ontology",
            ontology == "PO" ~ "Plant Ontology",
            ontology == "SIO" ~ "Ontobee/Semanticscience Integrated Ontology",
            ontology == "SNOMED" ~ "SNOMED CT (International Edition)",
            ontology == "SPD" ~ "The Spider Anatomy Ontology",
            ontology == "SYMP" ~ "Symptom Ontology",
            ontology == "UBERON" ~ "Uber-anatomy ontology",
            ontology == "UPa" ~ "Unipathway",
            ontology == "XCO" ~ "Experimental condition ontology",
            ontology == "ZFA" ~ "Zebrafish Anatomy Ontology (ZFA)"
        )
    ) |>
    # Collapse the individual "CO_<number>" prefixes into a single "CO" row;
    # they share one description, so distinct() then leaves one entry.
    mutate(
        ontology = case_when(
            grepl("^(CO_).*$", ontology) ~ "CO",
            TRUE ~ ontology
    )) |>
    distinct() |>
    dplyr::rename(Ontology = ontology)
myDataTable(o3)

Session information

#> ─ Session info ───────────────────────────────────────────────────────────────
#>  setting  value
#>  version  R version 4.4.1 (2024-06-14)
#>  os       Ubuntu 22.04.4 LTS
#>  system   x86_64, linux-gnu
#>  ui       X11
#>  language en
#>  collate  en_US.UTF-8
#>  ctype    en_US.UTF-8
#>  tz       Etc/UTC
#>  date     2024-06-28
#>  pandoc   3.2 @ /usr/bin/ (via rmarkdown)
#> 
#> ─ Packages ───────────────────────────────────────────────────────────────────
#>  package              * version date (UTC) lib source
#>  abind                  1.4-5   2016-07-21 [1] RSPM (R 4.4.0)
#>  Biobase              * 2.64.0  2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)
#>  BiocGenerics         * 0.50.0  2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)
#>  bit                    4.0.5   2022-11-15 [1] RSPM (R 4.4.0)
#>  bit64                  4.0.5   2020-08-30 [1] RSPM (R 4.4.0)
#>  bslib                  0.7.0   2024-03-29 [1] RSPM (R 4.4.0)
#>  bugphyzzAnalyses     * 0.1.19  2024-06-28 [1] local
#>  cachem                 1.1.0   2024-05-16 [1] RSPM (R 4.4.0)
#>  cli                    3.6.3   2024-06-21 [1] RSPM (R 4.4.0)
#>  colorspace             2.1-0   2023-01-23 [1] RSPM (R 4.4.0)
#>  crayon                 1.5.3   2024-06-20 [1] RSPM (R 4.4.0)
#>  crosstalk              1.2.1   2023-11-23 [1] RSPM (R 4.4.0)
#>  DelayedArray           0.30.1  2024-05-07 [1] Bioconductor 3.19 (R 4.4.1)
#>  desc                   1.4.3   2023-12-10 [1] RSPM (R 4.4.0)
#>  digest                 0.6.36  2024-06-23 [1] RSPM (R 4.4.0)
#>  dplyr                * 1.1.4   2023-11-17 [1] RSPM (R 4.4.0)
#>  DT                   * 0.33    2024-04-04 [1] RSPM (R 4.4.0)
#>  evaluate               0.24.0  2024-06-10 [1] RSPM (R 4.4.0)
#>  fansi                  1.0.6   2023-12-08 [1] RSPM (R 4.4.0)
#>  fastmap                1.2.0   2024-05-15 [1] RSPM (R 4.4.0)
#>  fs                     1.6.4   2024-04-25 [1] RSPM (R 4.4.0)
#>  generics               0.1.3   2022-07-05 [1] RSPM (R 4.4.0)
#>  GenomeInfoDb         * 1.40.1  2024-05-24 [1] Bioconductor 3.19 (R 4.4.1)
#>  GenomeInfoDbData       1.2.12  2024-06-25 [1] Bioconductor
#>  GenomicRanges        * 1.56.1  2024-06-12 [1] Bioconductor 3.19 (R 4.4.1)
#>  ggplot2              * 3.5.1   2024-04-23 [1] RSPM (R 4.4.0)
#>  ggrepel              * 0.9.5   2024-01-10 [1] RSPM (R 4.4.0)
#>  glue                   1.7.0   2024-01-09 [1] RSPM (R 4.4.0)
#>  gtable                 0.3.5   2024-04-22 [1] RSPM (R 4.4.0)
#>  hms                    1.1.3   2023-03-21 [1] RSPM (R 4.4.0)
#>  htmltools              0.5.8.1 2024-04-04 [1] RSPM (R 4.4.0)
#>  htmlwidgets            1.6.4   2023-12-06 [1] RSPM (R 4.4.0)
#>  httr                   1.4.7   2023-08-15 [1] RSPM (R 4.4.0)
#>  IRanges              * 2.38.0  2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)
#>  jquerylib              0.1.4   2021-04-26 [1] RSPM (R 4.4.0)
#>  jsonlite               1.8.8   2023-12-04 [1] RSPM (R 4.4.0)
#>  knitr                  1.47    2024-05-29 [1] RSPM (R 4.4.0)
#>  lattice                0.22-6  2024-03-20 [2] CRAN (R 4.4.1)
#>  lifecycle              1.0.4   2023-11-07 [1] RSPM (R 4.4.0)
#>  magrittr               2.0.3   2022-03-30 [1] RSPM (R 4.4.0)
#>  Matrix                 1.7-0   2024-04-26 [2] CRAN (R 4.4.1)
#>  MatrixGenerics       * 1.16.0  2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)
#>  matrixStats          * 1.3.0   2024-04-11 [1] RSPM (R 4.4.0)
#>  memoise                2.0.1   2021-11-26 [1] RSPM (R 4.4.0)
#>  munsell                0.5.1   2024-04-01 [1] RSPM (R 4.4.0)
#>  pillar                 1.9.0   2023-03-22 [1] RSPM (R 4.4.0)
#>  pkgconfig              2.0.3   2019-09-22 [1] RSPM (R 4.4.0)
#>  pkgdown                2.0.9   2024-04-18 [1] RSPM (R 4.4.0)
#>  purrr                  1.0.2   2023-08-10 [1] RSPM (R 4.4.0)
#>  R6                     2.5.1   2021-08-19 [1] RSPM (R 4.4.0)
#>  ragg                   1.3.2   2024-05-15 [1] RSPM (R 4.4.0)
#>  Rcpp                   1.0.12  2024-01-09 [1] RSPM (R 4.4.0)
#>  readr                  2.1.5   2024-01-10 [1] RSPM (R 4.4.0)
#>  rlang                  1.1.4   2024-06-04 [1] RSPM (R 4.4.0)
#>  rmarkdown              2.27    2024-05-17 [1] RSPM (R 4.4.0)
#>  S4Arrays               1.4.1   2024-05-20 [1] Bioconductor 3.19 (R 4.4.1)
#>  S4Vectors            * 0.42.0  2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)
#>  sass                   0.4.9   2024-03-15 [1] RSPM (R 4.4.0)
#>  scales                 1.3.0   2023-11-28 [1] RSPM (R 4.4.0)
#>  sessioninfo            1.2.2   2021-12-06 [1] RSPM (R 4.4.0)
#>  SparseArray            1.4.8   2024-05-24 [1] Bioconductor 3.19 (R 4.4.1)
#>  SummarizedExperiment * 1.34.0  2024-05-01 [1] Bioconductor 3.19 (R 4.4.1)
#>  systemfonts            1.1.0   2024-05-15 [1] RSPM (R 4.4.0)
#>  textshaping            0.4.0   2024-05-24 [1] RSPM (R 4.4.0)
#>  tibble                 3.2.1   2023-03-20 [1] RSPM (R 4.4.0)
#>  tidyselect             1.2.1   2024-03-11 [1] RSPM (R 4.4.0)
#>  tzdb                   0.4.0   2023-05-12 [1] RSPM (R 4.4.0)
#>  UCSC.utils             1.0.0   2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)
#>  utf8                   1.2.4   2023-10-22 [1] RSPM (R 4.4.0)
#>  vctrs                  0.6.5   2023-12-01 [1] RSPM (R 4.4.0)
#>  vroom                  1.6.5   2023-12-05 [1] RSPM (R 4.4.0)
#>  withr                  3.0.0   2024-01-16 [1] RSPM (R 4.4.0)
#>  xfun                   0.45    2024-06-16 [1] RSPM (R 4.4.0)
#>  XVector                0.44.0  2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)
#>  yaml                   2.3.8   2023-12-11 [1] RSPM (R 4.4.0)
#>  zlibbioc               1.50.0  2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)
#> 
#>  [1] /usr/local/lib/R/site-library
#>  [2] /usr/local/lib/R/library
#> 
#> ──────────────────────────────────────────────────────────────────────────────