Calculate distance between two vectors, rows of one matrix/dataframe, or rows of two matrices/dataframes.

This function does some simple looping to allow x and y to be various combinations of vectors and matrices/dataframes.

Usage

phenoDist(x, y = NULL, bins = 10, vectorDistFun = vectorWeightedDist, ...)

Arguments

x: A vector, matrix or dataframe
y: NULL, a vector, matrix, or dataframe. If x is a vector, y must also be specified.
bins: The number of bins into which continuous fields are discretized.
vectorDistFun: A function of two vectors that returns the distance between those vectors.
...: Extra arguments passed on to vectorDistFun

Value

A matrix of distances between pairs of rows of x (if y is unspecified), or between every row of x and every row of y (if both are provided).

Author

Levi Waldron, Markus Riester, Marcel Ramos

Examples


example("phenoFinder")
#> 
#> phnFnd> library(curatedOvarianData)
#> 
#> phnFnd> data(GSE32063_eset)
#> 
#> phnFnd> data(GSE17260_eset)
#> 
#> phnFnd> esets2 <- list(JapaneseB=GSE32063_eset,
#> phnFnd+                 Yoshihara2010=GSE17260_eset)
#> 
#> phnFnd> ## standardize the sample ids to improve matching based on clinical annotation
#> phnFnd> esets2 <- lapply(esets2, function(X){
#> phnFnd+     X$alt_sample_name <- paste(X$sample_type, gsub("[^0-9]", "", X$alt_sample_name), sep="_")
#> phnFnd+ 
#> phnFnd+ ## Removal of columns that cannot possibly match also helps duplicated patients to stand out
#> phnFnd+     pData(X) <- pData(X)[, !grepl("uncurated_author_metadata", colnames(pData(X)))]
#> phnFnd+     X <- X[, 1:20]  ##speed computations
#> phnFnd+     return(X) })
#> 
#> phnFnd> ## See first six samples in both rows and columns
#> phnFnd> phenoFinder(esets2)[1:6, 1:6]
#>           GSM432220 GSM432221 GSM432222 GSM432223 GSM432224  GSM432225
#> GSM795125 0.2351904 0.1014047 0.3525417 0.7274151 0.2189890 0.27397077
#> GSM795126 0.5404524 0.2588727 0.4083015 0.4079720 0.2927870 0.74123368
#> GSM795127 0.3791279 0.5008562 0.4983502 0.4981226 0.6385506 0.04416984
#> GSM795128 0.2351904 0.1014047 0.3525417 0.3523760 0.2189890 0.27397077
#> GSM795129 0.1076309 0.2395470 0.2190910 0.2189890 0.3643260 0.16030839
#> GSM795130 0.2603947 0.1344290 0.1077761 0.1076793 0.2489234 0.29544860

pdat1 <- pData(esets2[[1]])
pdat2 <- pData(esets2[[2]])

## Use phenoDist() to calculate a weighted distance matrix
distmat <- phenoDist(as.matrix(pdat1), as.matrix(pdat2))
## Note the outliers with identical clinical data; these are probably the same patients:
graphics::boxplot(distmat)


if (FALSE) { # \dontrun{
   library(curatedOvarianData)
   data(GSE32063_eset)
   data(GSE17260_eset)
   pdat1 <- pData(GSE32063_eset)
   pdat2 <- pData(GSE17260_eset)
   ## Curation of the alternative sample identifiers makes duplicates stand out more:
   pdat1$alt_sample_name <-
     paste(pdat1$sample_type,
           gsub("[^0-9]", "", pdat1$alt_sample_name),
           sep = "_")
   pdat2$alt_sample_name <-
     paste(pdat2$sample_type,
           gsub("[^0-9]", "", pdat2$alt_sample_name),
           sep = "_")
   ## Removal of columns that cannot possibly match also helps duplicated patients to stand out
   pdat1 <-
     pdat1[,!grepl("uncurated_author_metadata", colnames(pdat1))]
   pdat2 <-
     pdat2[,!grepl("uncurated_author_metadata", colnames(pdat2))]
   ## Use phenoDist() to calculate a weighted distance matrix
   distmat <- phenoDist(as.matrix(pdat1), as.matrix(pdat2))
   ## Note the outliers with identical clinical data; these are probably the same patients:
   graphics::boxplot(distmat)
} # }