Installation

# Install BiocManager only when it is not already available.
# requireNamespace() checks for the package without attaching it to the search
# path; attaching is unnecessary because install() is called with the
# BiocManager:: prefix below.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
}
# Install RaggedExperiment through BiocManager.
BiocManager::install("RaggedExperiment")
library(RaggedExperiment)
library(curatedTCGAData)
library(GenomeInfoDb)
library(RaggedExperiment.SoftNote)
# Request the CNASeq and Mutation assays for the BRCA disease code from
# curatedTCGAData, pinned to data version 2.0.1, with dry.run switched off.
BRCA <- curatedTCGAData(
    "BRCA",
    assays = c("CNASeq", "Mutation"),
    version = "2.0.1",
    dry.run = FALSE
)
# Auto-print the returned object to show the experiments it contains.
BRCA
## A MultiAssayExperiment object of 2 listed
##  experiments with user-defined names and respective classes.
##  Containing an ExperimentList class object of length 2:
##  [1] BRCA_CNASeq-20160128: RaggedExperiment with 5298 rows and 38 columns
##  [2] BRCA_Mutation-20160128: RaggedExperiment with 90490 rows and 993 columns
## Functionality:
##  experiments() - obtain the ExperimentList instance
##  colData() - the primary/phenotype DataFrame
##  sampleMap() - the sample coordination DataFrame
##  `$`, `[`, `[[` - extract colData columns, subset, or experiment
##  *Format() - convert into a long or wide DataFrame
##  assays() - convert ExperimentList to a SimpleList of matrices
##  exportClass() - save data to flat files

Measuring size

CNAseq

# Extract the CNASeq experiment once, then report the size of the object
# itself and of each matrix representation derived from it.
# NOTE(review): object_size() is not defined in the visible code; presumably
# it comes from one of the attached packages -- confirm.
cnaseq_re <- BRCA[["BRCA_CNASeq-20160128"]]
object_size(cnaseq_re)
object_size(sparseAssay(x = cnaseq_re, sparse = TRUE))
object_size(compactAssay(x = cnaseq_re))
object_size(sparseAssay(x = cnaseq_re))

Mutation

# Extract the Mutation experiment once, then report the size of the object
# itself and of the matrix representations derived from it.
mutation_re <- BRCA[["BRCA_Mutation-20160128"]]
object_size(mutation_re)
# The sparse = TRUE variant is left disabled for the Mutation assay.
# object_size(sparseAssay(BRCA[["BRCA_Mutation-20160128"]], sparse = TRUE))
object_size(compactAssay(x = mutation_re))
object_size(sparseAssay(x = mutation_re))

Object Sizes from curatedTCGAData

library(TxDb.Hsapiens.UCSC.hg19.knownGene)

Extract all gene regions from TxDb

# Short alias for the hg19 knownGene transcript database.
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
# Pull the gene regions (single.strand.genes.only = FALSE) and, in the same
# expression, drop everything outside the standard chromosomes.
genes <- keepStandardChromosomes(
    genes(txdb, single.strand.genes.only = FALSE),
    pruning.mode = "coarse"
)
# Flatten the result with unlist() for use in the overlap queries below.
ugenes <- unlist(genes)

Standardize seqlevelsStyle to UCSC

# Harmonise the CNASeq experiment with the UCSC-style gene annotation:
# switch the seqlevels style first, then label the genome as hg19.
# Reference URL kept from the original (path names the BRCA-TP
# CopyNumberLowPass_Gistic2 report):
# https://gdac.broadinstitute.org/runs/analyses__2016_01_28/reports/cancer/BRCA-TP/CopyNumberLowPass_Gistic2/nozzle.html
re <- BRCA[["BRCA_CNASeq-20160128"]]
## possible bug -- seqlevelsStyle must go first, then genome
seqlevelsStyle(re) <- "UCSC"
genome(rowRanges(re)) <- "hg19"
# Auto-print the ranges to inspect the resulting seqnames and seqinfo.
rowRanges(re)
## GRanges object with 5298 ranges and 0 metadata columns:
##          seqnames              ranges strand
##             <Rle>           <IRanges>  <Rle>
##      [1]     chr1       10209-2583075      *
##      [2]     chr1   2583076-249240606      *
##      [3]     chr2     10002-243189359      *
##      [4]     chr3     60175-162511435      *
##      [5]     chr3 162511436-162626067      *
##      ...      ...                 ...    ...
##   [5294]    chr20      60001-62965506      *
##   [5295]    chr21    9422166-48119869      *
##   [5296]    chr22   16051206-51244552      *
##   [5297]    chr23   2699503-116067549      *
##   [5298]    chr24    2649450-28784074      *
##   -------
##   seqinfo: 24 sequences from hg19 genome; no seqlengths

It looks like the order matters: if the genome is set before the seqlevels style, you get NCBI seqlevels…

# Repeat the harmonisation on a fresh copy with the two steps swapped
# (genome first, seqlevelsStyle second) to compare against `re`.
re2 <- BRCA[["BRCA_CNASeq-20160128"]]
genome(rowRanges(re2)) <- "hg19"
seqlevelsStyle(re2) <- "UCSC"
# Auto-print the ranges obtained with the reversed order.
rowRanges(re2)
# The two orderings do not produce identical objects.
identical(re, re2)
## [1] FALSE

CNAseq in genes

# Restrict the harmonised CNASeq experiment to ranges overlapping the
# gene regions in `ugenes`.
ingenes <- subsetByOverlaps(x = re, ranges = ugenes)

Mutations in genes

# Harmonise the Mutation experiment the same way as the CNASeq one:
# seqlevels style first, then the genome label.
# NOTE(review): the URL below points at the CopyNumberLowPass_Gistic2 report,
# the same link used in the CNASeq section -- confirm it is the intended
# reference for the Mutation data.
# https://gdac.broadinstitute.org/runs/analyses__2016_01_28/reports/cancer/BRCA-TP/CopyNumberLowPass_Gistic2/nozzle.html
mre <- BRCA[["BRCA_Mutation-20160128"]]
## possible bug -- seqlevelsStyle must go first, then genome
seqlevelsStyle(mre) <- "UCSC"
## Warning in (function (seqlevels, genome, new_style) : cannot switch some
## GRCh37's seqlevels from NCBI to UCSC style
genome(rowRanges(mre)) <- "hg19"
# Auto-print the ranges to inspect the resulting seqnames and seqinfo.
rowRanges(mre)
## GRanges object with 90490 ranges and 0 metadata columns:
##           seqnames    ranges strand
##              <Rle> <IRanges>  <Rle>
##       [1]    chr10 116247760      +
##       [2]    chr12  43944926      +
##       [3]     chr3  85932472      +
##       [4]     chr2  25678299      +
##       [5]    chr17  40272381      +
##       ...      ...       ...    ...
##   [90486]     chr3  48299430      +
##   [90487]    chr19  52394623      +
##   [90488]    chr16  30537313      +
##   [90489]    chr19  35449130      +
##   [90490]    chr19  53994951      +
##   -------
##   seqinfo: 26 sequences from hg19 genome; no seqlengths
# Restrict the harmonised Mutation experiment to ranges overlapping the
# gene regions in `ugenes`.
mingenes <- subsetByOverlaps(x = mre, ranges = ugenes)

Obtaining data directly from RTCGAToolbox

library(RTCGAToolbox)
# Obtain the BRCA CNASeq data through RTCGAToolbox and harmonise it the same
# way as the curatedTCGAData copy (seqlevels style first, then genome).
# Show the download link RTCGAToolbox reports for this dataset.
getLinks("BRCA", CNASeq = TRUE)
## [1] "https://gdac.broadinstitute.org/runs/stddata__2016_01_28/data/BRCA/20160128/gdac.broadinstitute.org_BRCA.Merge_cna__illuminahiseq_dnaseqc__hms_harvard_edu__Level_3__segmentation__seg.Level_3.2016012800.0.0.tar.gz"
# Fetch the Firehose data; the messages below show cached files being reused.
BRCAseq <- getFirehoseData("BRCA", CNASeq = TRUE)
## RTCGAToolbox cache directory set to:
##     /home/mr148/.cache/R/RTCGAToolbox
## Using locally cached version of /home/mr148/.cache/R/RTCGAToolbox/20160128-BRCA-Clinical.txt
## Using locally cached version of /home/mr148/.cache/R/RTCGAToolbox/20160128-BRCA-CNAseq.txt
# Extract the CNASeq data from the Firehose object.
cnatoolbox <- biocExtract(BRCAseq, "CNASeq")
## working on: CNASeq
# Order matters here as well: style first, genome second.
seqlevelsStyle(cnatoolbox) <- "UCSC"
genome(cnatoolbox) <- "hg19"
# Auto-print the ranges to inspect the resulting seqnames and seqinfo.
rowRanges(cnatoolbox)
## GRanges object with 5298 ranges and 0 metadata columns:
##          seqnames              ranges strand
##             <Rle>           <IRanges>  <Rle>
##      [1]     chr1       10209-2583075      *
##      [2]     chr1   2583076-249240606      *
##      [3]     chr2     10002-243189359      *
##      [4]     chr3     60175-162511435      *
##      [5]     chr3 162511436-162626067      *
##      ...      ...                 ...    ...
##   [5294]    chr20      60001-62965506      *
##   [5295]    chr21    9422166-48119869      *
##   [5296]    chr22   16051206-51244552      *
##   [5297]    chr23   2699503-116067549      *
##   [5298]    chr24    2649450-28784074      *
##   -------
##   seqinfo: 24 sequences from hg19 genome; no seqlengths
# Obtain the BRCA Mutation data through RTCGAToolbox and harmonise it the same
# way as the curatedTCGAData copy (seqlevels style first, then genome).
# Show the download link RTCGAToolbox reports for this dataset.
getLinks("BRCA", Mutation = TRUE)
## [1] "https://gdac.broadinstitute.org/runs/stddata__2016_01_28/data/BRCA/20160128/gdac.broadinstitute.org_BRCA.Mutation_Packager_Calls.Level_3.2016012800.0.0.tar.gz"
# Fetch the Firehose data; the messages below show a cached file being reused.
BRCAmut <- getFirehoseData("BRCA", Mutation = TRUE)
## RTCGAToolbox cache directory set to:
##     /home/mr148/.cache/R/RTCGAToolbox
## Using locally cached version of /home/mr148/.cache/R/RTCGAToolbox/20160128-BRCA-Clinical.txt
# Extract the Mutation data from the Firehose object.
muttoolbox <- biocExtract(BRCAmut, "Mutation")
## working on: Mutation
# Order matters here as well: style first, genome second. The style switch
# emits the warning recorded below.
seqlevelsStyle(muttoolbox) <- "UCSC"
## Warning in (function (seqlevels, genome, new_style) : cannot switch some
## GRCh37's seqlevels from NCBI to UCSC style
genome(muttoolbox) <- "hg19"
# Auto-print the ranges to inspect the resulting seqnames and seqinfo.
rowRanges(muttoolbox)
## GRanges object with 90490 ranges and 0 metadata columns:
##           seqnames    ranges strand
##              <Rle> <IRanges>  <Rle>
##       [1]    chr10 116247760      +
##       [2]    chr12  43944926      +
##       [3]     chr3  85932472      +
##       [4]     chr2  25678299      +
##       [5]    chr17  40272381      +
##       ...      ...       ...    ...
##   [90486]     chr3  48299430      +
##   [90487]    chr19  52394623      +
##   [90488]    chr16  30537313      +
##   [90489]    chr19  35449130      +
##   [90490]    chr19  53994951      +
##   -------
##   seqinfo: 26 sequences from hg19 genome; no seqlengths

Sizes for CNAseq and Mutation from RTCGAToolbox

# CNASeq: size of the extracted experiment, of the CNASeq slot of the
# Firehose object, and of each matrix representation.
object_size(cnatoolbox)
object_size(slot(BRCAseq, "CNASeq"))
object_size(sparseAssay(x = cnatoolbox, sparse = TRUE))
object_size(compactAssay(x = cnatoolbox))
object_size(sparseAssay(x = cnatoolbox))

# Mutation: the same measurements, except that the sparse = TRUE variant is
# left disabled.
object_size(muttoolbox)
object_size(slot(BRCAmut, "Mutation"))
# object_size(sparseAssay(muttoolbox, sparse = TRUE)) # typeof character
object_size(compactAssay(x = muttoolbox))
object_size(sparseAssay(x = muttoolbox))

Restrict to genic regions

# Subset both RTCGAToolbox experiments to ranges overlapping the gene regions.
incnabox <- subsetByOverlaps(x = cnatoolbox, ranges = ugenes)
inmutbox <- subsetByOverlaps(x = muttoolbox, ranges = ugenes)

# CNASeq restricted to genes: experiment size, then each matrix form.
# NOTE(review): the "# NA" markers are carried over from the original; their
# meaning is not evident from the code -- confirm.
object_size(incnabox)
# NA
object_size(sparseAssay(x = incnabox, sparse = TRUE))
object_size(compactAssay(x = incnabox))
object_size(sparseAssay(x = incnabox))

# Mutation restricted to genes: the sparse = TRUE variant is left disabled.
object_size(inmutbox)
# NA
# object_size(sparseAssay(inmutbox, sparse = TRUE))
object_size(compactAssay(x = inmutbox))
object_size(sparseAssay(x = inmutbox))

Table of object sizes by data type and source

Data Source Assay Data Type RaggedExperiment as.data.frame sparse Matrix matrix (reduced rows) matrix (sparse)
curatedTCGAData CNASeq numeric 0.2 MB 0.3 MB 0.3 MB 1 MB 1.9 MB
curatedTCGAData CNASeq (in genes) numeric 0.2 MB 0.2 MB 0.3 MB 0.9 MB 1.7 MB
curatedTCGAData Mutation character 70.6 MB 71.8 MB NA 680.3 MB 726.2 MB
curatedTCGAData Mutation (in genes) character 37.6 MB 38.1 MB NA 351.3 MB 375.5 MB