aroma.affymetrix 2.4.0
aroma.cn 1.0.0
What's new?
Author: Pierre Neuvial
Created: 2010-03-04
Last updated: 2011-03-07
The TumorBoost method for normalizing allele-specific copy number based on a pair of tumor-normal genotyping microarrays uses genotypes of the normal sample for normalizing allele B fractions in a tumor. Here we show how such genotypes can be obtained from allele B fractions in a normal sample. This is called "naive genotyping" because genotypes are simply called based on local minima of an estimate of the density of allele fractions in the normal sample, as described in Bengtsson et al. (2010). We also illustrate how associated genotype confidence score files can be created.
WARNING: This naive genotyper is intended for identifying heterozygous SNPs to be used by TumorBoost and/or parent-specific CN segmentation methods, which are methods that are very forgiving on genotyping errors scattered along the genome. It was not designed to be a per-SNP high-performing genotyper.
totalAndFracBData/
annotationData/
chipTypes/
Note: above we are showing the same data sets as in the TumorBoost vignette, but in the present vignette only data files corresponding to allelic ratios in normal samples are used: here, "TCGA-*-10A-*,fracB.asb", for normal blood.
library("aroma.cn");
log <- verbose <- Arguments$getVerbose(-8, timestamp=TRUE);
rootPath <- "totalAndFracBData";
rootPath <- Arguments$getReadablePath(rootPath);
dataSets <- c("TCGA,GBM,BeadStudio,XY", "TCGA,GBM,CRMAv2");
dataSet <- dataSets[1]; ## Work with Illumina data
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Load the raw (tumor,normal) data set
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ds <- AromaUnitFracBCnBinarySet$byName(dataSet, chipType="*", paths=rootPath);
setFullNamesTranslator(ds, function(names, ...) {
pattern <- "^(TCGA-[0-9]{2}-[0-9]{4})-([0-9]{2}[A-Z])[-]*(.*)";
gsub(pattern, "\\1,\\2,\\3", names);
});
print(ds);
This gives
AromaUnitFracBCnBinarySet:
Name: TCGA
Tags: GBM,BeadStudio,XY
Full name: TCGA,GBM,BeadStudio,XY
Number of files: 2
Names: TCGA-02-0001, TCGA-02-0001 [2]
Path (to the first file): totalAndFracBData/TCGA,GBM,BeadStudio,XY/HumanHap550
Total file size: 4.28 MB
RAM: 0.00MB
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Extract the normals
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
types <- sapply(ds, FUN=function(df) getTags(df)[1]);
normals <- grep("(10|11)[A-Z]", types);
dsN <- extract(ds, normals);
print(dsN);
This gives
AromaUnitFracBCnBinarySet:
Name: TCGA
Tags: GBM,BeadStudio,XY
Full name: TCGA,GBM,BeadStudio,XY
Number of files: 1
Names: TCGA-02-0001 [1]
Path (to the first file): totalAndFracBData/TCGA,GBM,BeadStudio,XY/HumanHap550
Total file size: 2.14 MB
RAM: 0.00MB
fullname <- paste(c(getFullName(dsN), "NGC"), collapse=",");
chipType <- getChipType(dsN, fullname=FALSE);
outPath <- file.path("callData", fullname, chipType);
units <- NULL;
if (is.null(units)) {
df <- getFile(dsN, 1);
units <- seq(length=nbrOfUnits(df));
rm(df);
}
adjust <- 1.5;
type <- NULL; rm(type); # To please R CMD check
# Identify units on ChrX and ChrY
ugp <- getAromaUgpFile(dsN);
units23 <- getUnitsOnChromosome(ugp, 23);
is23 <- is.element(units, units23);
units24 <- getUnitsOnChromosome(ugp, 24);
is24 <- is.element(units, units24);
kk <- 1;
dfN <- getFile(dsN, kk);
tags <- getTags(dfN);
tags <- setdiff(tags, "fracB");
tags <- c(tags, "genotypes");
fullname <- paste(c(getName(dfN), tags), collapse=",");
filename <- sprintf("%s.acf", fullname);
gcPathname <- Arguments$getWritablePathname(filename, path=outPath, mustNotExist=FALSE);
csTags <- c(tags, "confidenceScores");
fullname <- paste(c(getName(dfN), csTags), collapse=",");
filename <- sprintf("%s.acf", fullname);
csPathname <- Arguments$getWritablePathname(filename, path=outPath, mustNotExist=FALSE);
if (isFile(gcPathname) && isFile(csPathname)) {
next;
}
betaN <- dfN[units,1,drop=TRUE];
# Call gender
gender <- callXXorXY(betaN[is23], betaN[is24], adjust=adjust, from=0, to=1);
# Call genotypes
naValue <- as.double(NA);
fit <- NULL;
mu <- rep(naValue, times=length(units));
cs <- rep(naValue, times=length(units));
if (gender == "XY") {
# All but ChrX & ChrY in male
isDiploid <- (!(is23 | is24));
use <- which(isDiploid);
muT <- callNaiveGenotypes(betaN[use], cn=2, adjust=adjust, from=0, to=1,
verbose=less(verbose,10));
fit <- attr(muT, 'modelFit');
mu[use] <- muT;
use <- which(!isDiploid);
muT <- callNaiveGenotypes(betaN[use], cn=1, adjust=adjust, from=0, to=1,
verbose=less(verbose,10));
mu[use] <- muT;
} else {
# All but ChrY in female
isDiploid <- (!is24);
use <- which(isDiploid);
muT <- callNaiveGenotypes(betaN[use], cn=2, adjust=adjust, from=0, to=1,
verbose=less(verbose,10));
fit <- attr(muT, 'modelFit');
mu[use] <- muT;
}
print(table(mu, exclude=NULL));
# Translate genotype calls in fracB space to (AA,AB,BB,...)
calls <- rep(as.character(NA), times=length(mu));
calls[mu == 0] <- "AA";
calls[mu == 1/2] <- "AB";
calls[mu == 1] <- "BB";
print(table(calls, exclude=NULL));
# Calculate confidence scores
a <- fit[[1]]$fitValleys$x[1];
b <- fit[[1]]$fitValleys$x[2];
cs[isDiploid] <- rowMins(abs(cbind(betaN[isDiploid]-a, betaN[isDiploid]-b)));
print(table(mu, exclude=NULL));
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Writing genotype calls (via temporary file)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
pathname <- gcPathname;
pathnameT <- sprintf("%s.tmp", pathname);
nbrOfUnits <- nbrOfUnits(dfN);
gfN <- AromaUnitGenotypeCallFile$allocate(pathnameT, platform=getPlatform(dfN), chipType=getChipType(dfN), nbrOfRows=nbrOfUnits);
footer <- readFooter(gfN);
footer$method <- "NaiveGenotypeCaller";
writeFooter(gfN, footer);
rm(footer);
updateGenotypes(gfN, units=units, calls=calls);
rm(calls);
res <- file.rename(pathnameT, pathname);
if (!isFile(pathname)) {
throw("Failed to rename temporary file: ", pathnameT, " -> ", pathname);
}
if (isFile(pathnameT)) {
throw("Failed to rename temporary file: ", pathnameT, " -> ", pathname);
}
rm(pathnameT);
gfN <- AromaUnitGenotypeCallFile(pathname);
print(gfN);
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Writing confidence scores (via temporary file)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
pathname <- csPathname;
pathnameT <- sprintf("%s.tmp", pathname);
nbrOfUnits <- nbrOfUnits(dfN);
csfN <- AromaUnitSignalBinaryFile$allocate(pathnameT, platform=getPlatform(dfN), chipType=getChipType(dfN), nbrOfRows=nbrOfUnits, types="double", size=4, signed=TRUE);
footer <- readFooter(csfN);
footer$method <- "NaiveGenotypeConfidenceScoreEstimator";
writeFooter(csfN, footer);
rm(footer);
csfN[units, 1] <- cs
rm(cs);
res <- file.rename(pathnameT, pathname);
if (!isFile(pathname)) {
throw("Failed to rename temporary file: ", pathnameT, " -> ", pathname);
}
if (isFile(pathnameT)) {
throw("Failed to rename temporary file: ", pathnameT, " -> ", pathname);
}
rm(pathnameT);
cfN <- AromaUnitSignalBinaryFile(pathname);
print(cfN);
The data sets created are set up as follows:
callData/
TCGA,GBM,BeadStudio,XY,NGC/
HumanHap550/
TCGA-02-0001,10A,01D-0184-06,genotypes,confidenceScores.acf
TCGA-02-0001,10A,01D-0184-06,genotypes.acf
They can be loaded by:
gcN <- AromaUnitGenotypeCallSet$byName(dataSet, tags="NGC", chipType="*"); print(gcN);
This gives:
AromaUnitGenotypeCallSet:Name: TCGA
Tags: GBM,BeadStudio,XY,NGC
Full name: TCGA,GBM,BeadStudio,XY,NGC
Number of files: 1
Names: TCGA-02-0001 [1]
Path (to the first file): callData/TCGA,GBM,BeadStudio,XY,NGC/HumanHap550
Total file size: 1.07 MB
RAM: 0.00MB
csN <- AromaUnitSignalBinarySet$byName(dataSet, tags="NGC", chipType="*", pattern="confidenceScores", paths="callData"); print(csN);
This gives:
AromaUnitSignalBinarySet:
Name: TCGA
Tags: GBM,BeadStudio,XY,NGC
Full name: TCGA,GBM,BeadStudio,XY,NGC
Number of files: 1
Names: TCGA-02-0001 [1]
Path (to the first file): callData/TCGA,GBM,BeadStudio,XY,NGC/HumanHap550
Total file size: 2.14 MB
RAM: 0.00MB