Vignette: Naive genotype calls and confidence scores

Vignette: Naive genotype calls and confidence scores

Author: Pierre Neuvial
Created: 2010-03-04
Last updated: 2011-03-07

The TumorBoost method for normalizing allele-specific copy number based on a pair of tumor-normal genotyping microarrays uses genotypes of the normal sample for normalizing allele B fractions in a tumor. Here we show how such genotypes can be obtained from allele B fractions in a normal sample.  This is called "naive genotyping" because genotypes are simply called based on local minima of an estimate of the density of allele fractions in the normal sample, as described in Bengtsson et al. (2010).  We also illustrate how associated genotype confidence score files can be created.

WARNING: This naive genotyper is intended for identifying heterozygous SNPs to be used by TumorBoost and/or parent-specific CN segmentation methods, which are methods that are very forgiving on genotyping errors scattered along the genome.  It was not designed to be a per-SNP high-performing genotyper.

 

Data

totalAndFracBData/

  TCGA,GBM,BeadStudio,XY/
    HumanHap550/
      TCGA-02-0001-01C-01D-0184-06,fracB.asb
      TCGA-02-0001-01C-01D-0184-06,total.asb
      TCGA-02-0001-10A-01D-0184-06,fracB.asb
      TCGA-02-0001-10A-01D-0184-06,total.asb
  TCGA,GBM,CRMAv2/
    GenomeWideSNP/
      TCGA-02-0001-01C-01D-0182-01,fracB.asb
      TCGA-02-0001-01C-01D-0182-01,total.asb
      TCGA-02-0001-10A-01D-0182-01,fracB.asb
      TCGA-02-0001-10A-01D-0182-01,total.asb



annotationData/

  chipTypes/

    HumanHap550/
      HumanHap550,TCGA,HB20080512.ugp
      HumanHap550,TCGA,HB20100107,unitNames.txt
    GenomeWideSNP_6/
      GenomeWideSNP_6,Full.CDF
      GenomeWideSNP_6,Full,na26,HB20080821.ugp

Note: above we are showing the same data sets as in the TumorBoost vignette, but in the present vignette only data files corresponding to allelic ratios in normal samples are used: here, "TCGA-*-10A-*,fracB.asb", for normal blood.

 

Setup

library("aroma.cn");
log <- verbose <- Arguments$getVerbose(-8, timestamp=TRUE);
rootPath <- "totalAndFracBData";
rootPath <- Arguments$getReadablePath(rootPath);


dataSets <- c("TCGA,GBM,BeadStudio,XY", "TCGA,GBM,CRMAv2");
dataSet <- dataSets[1]; ## Work with Illumina data

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
# Load the raw (tumor,normal) data set
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
ds <- AromaUnitFracBCnBinarySet$byName(dataSet, chipType="*", paths=rootPath);
setFullNamesTranslator(ds, function(names, ...) {
  pattern <- "^(TCGA-[0-9]{2}-[0-9]{4})-([0-9]{2}[A-Z])[-]*(.*)";
  gsub(pattern, "\\1,\\2,\\3", names);
});
print(ds);

This gives

 

AromaUnitFracBCnBinarySet:

Name: TCGA

Tags: GBM,BeadStudio,XY

Full name: TCGA,GBM,BeadStudio,XY

Number of files: 2

Names: TCGA-02-0001, TCGA-02-0001 [2]

Path (to the first file): totalAndFracBData/TCGA,GBM,BeadStudio,XY/HumanHap550

Total file size: 4.28 MB

RAM: 0.00MB

 

 

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
# Extract the normals
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
types <- sapply(ds, FUN=function(df) getTags(df)[1]);
normals <- grep("(10|11)[A-Z]", types);
dsN <- extract(ds, normals);
print(dsN);

 

This gives

 

AromaUnitFracBCnBinarySet:

Name: TCGA

Tags: GBM,BeadStudio,XY

Full name: TCGA,GBM,BeadStudio,XY

Number of files: 1

Names: TCGA-02-0001 [1]

Path (to the first file): totalAndFracBData/TCGA,GBM,BeadStudio,XY/HumanHap550

Total file size: 2.14 MB

RAM: 0.00MB


 

Naive genotype calling and associated confidence scores

fullname <- paste(c(getFullName(dsN), "NGC"), collapse=",");
chipType <- getChipType(dsN, fullname=FALSE);
outPath <- file.path("callData", fullname, chipType);

units <- NULL;
if (is.null(units)) {
  df <- getFile(dsN, 1);
  units <- seq(length=nbrOfUnits(df));
  rm(df);
}

adjust <- 1.5;
type <- NULL; rm(type); # To please R CMD check

# Identify units on ChrX and ChrY
ugp <- getAromaUgpFile(dsN);
units23 <- getUnitsOnChromosome(ugp, 23);
is23 <- is.element(units, units23);
units24 <- getUnitsOnChromosome(ugp, 24);
is24 <- is.element(units, units24);

kk <- 1; 
dfN <- getFile(dsN, kk);

tags <- getTags(dfN);
tags <- setdiff(tags, "fracB");
tags <- c(tags, "genotypes");
fullname <- paste(c(getName(dfN), tags), collapse=",");

filename <- sprintf("%s.acf", fullname);
gcPathname <- Arguments$getWritablePathname(filename, path=outPath, mustNotExist=FALSE);

csTags <- c(tags, "confidenceScores");
fullname <- paste(c(getName(dfN), csTags), collapse=",");
filename <- sprintf("%s.acf", fullname);
csPathname <- Arguments$getWritablePathname(filename, path=outPath, mustNotExist=FALSE);

if (isFile(gcPathname) && isFile(csPathname)) {
  next;
}

betaN <- dfN[units,1,drop=TRUE];

# Call gender
gender <- callXXorXY(betaN[is23], betaN[is24], adjust=adjust, from=0, to=1);

# Call genotypes 
naValue <- as.double(NA);
fit <- NULL;
mu <- rep(naValue, times=length(units));
cs <- rep(naValue, times=length(units));

if (gender == "XY") {
  # All but ChrX & ChrY in male
  isDiploid <- (!(is23 | is24));
  use <- which(isDiploid);
  muT <- callNaiveGenotypes(betaN[use], cn=2, adjust=adjust, from=0, to=1,
                                        verbose=less(verbose,10));
  fit <- attr(muT, 'modelFit');
  mu[use] <- muT;
  use <- which(!isDiploid);
  muT <- callNaiveGenotypes(betaN[use], cn=1, adjust=adjust, from=0, to=1,
                                         verbose=less(verbose,10));
  mu[use] <- muT;
} else {
  # All but ChrY in female
  isDiploid <- (!is24);
  use <- which(isDiploid);
  muT <- callNaiveGenotypes(betaN[use], cn=2, adjust=adjust, from=0, to=1,
                                        verbose=less(verbose,10));
  fit <- attr(muT, 'modelFit');
  mu[use] <- muT;
}
print(table(mu, exclude=NULL));

# Translate genotype calls in fracB space to (AA,AB,BB,...)
calls <- rep(as.character(NA), times=length(mu));
calls[mu ==   0] <- "AA";
calls[mu == 1/2] <- "AB";
calls[mu ==   1] <- "BB";
print(table(calls, exclude=NULL));

# Calculate confidence scores
a <- fit[[1]]$fitValleys$x[1];
b <- fit[[1]]$fitValleys$x[2];
cs[isDiploid] <- rowMins(abs(cbind(betaN[isDiploid]-a, betaN[isDiploid]-b)));
print(table(mu, exclude=NULL));

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
# Writing genotype calls (via temporary file)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
pathname <- gcPathname;
pathnameT <- sprintf("%s.tmp", pathname);
nbrOfUnits <- nbrOfUnits(dfN);
gfN <- AromaUnitGenotypeCallFile$allocate(pathnameT, platform=getPlatform(dfN), chipType=getChipType(dfN), nbrOfRows=nbrOfUnits);
footer <- readFooter(gfN);
footer$method <- "NaiveGenotypeCaller";
writeFooter(gfN, footer);
rm(footer);

updateGenotypes(gfN, units=units, calls=calls);
rm(calls);

res <- file.rename(pathnameT, pathname);
if (!isFile(pathname)) {
  throw("Failed to rename temporary file: ", pathnameT, " -> ", pathname);
}
if (isFile(pathnameT)) {
  throw("Failed to rename temporary file: ", pathnameT, " -> ", pathname);
}
rm(pathnameT);

gfN <- AromaUnitGenotypeCallFile(pathname);

print(gfN);




# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
# Writing confidence scores (via temporary file)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
pathname <- csPathname;
pathnameT <- sprintf("%s.tmp", pathname);
nbrOfUnits <- nbrOfUnits(dfN);
csfN <- AromaUnitSignalBinaryFile$allocate(pathnameT, platform=getPlatform(dfN), chipType=getChipType(dfN), nbrOfRows=nbrOfUnits, types="double", size=4, signed=TRUE);
footer <- readFooter(csfN);
footer$method <- "NaiveGenotypeConfidenceScoreEstimator";
writeFooter(csfN, footer);
rm(footer);

csfN[units, 1] <- cs
rm(cs);

res <- file.rename(pathnameT, pathname);
if (!isFile(pathname)) {
  throw("Failed to rename temporary file: ", pathnameT, " -> ", pathname);
}
if (isFile(pathnameT)) {
  throw("Failed to rename temporary file: ", pathnameT, " -> ", pathname);
}
rm(pathnameT);



cfN <- AromaUnitSignalBinaryFile(pathname);
print(cfN);

 

The data sets created are set up as follows:

 

callData/

  TCGA,GBM,BeadStudio,XY,NGC/

    HumanHap550/

        TCGA-02-0001,10A,01D-0184-06,genotypes,confidenceScores.acf

        TCGA-02-0001,10A,01D-0184-06,genotypes.acf

 

They can be loaded by:

 

gcN <- AromaUnitGenotypeCallSet$byName(dataSet, tags="NGC", chipType="*");
print(gcN);


This gives:

 

AromaUnitGenotypeCallSet:Name: TCGA

Tags: GBM,BeadStudio,XY,NGC

Full name: TCGA,GBM,BeadStudio,XY,NGC

Number of files: 1

Names: TCGA-02-0001 [1]

Path (to the first file): callData/TCGA,GBM,BeadStudio,XY,NGC/HumanHap550

Total file size: 1.07 MB

RAM: 0.00MB

 

 

csN <- AromaUnitSignalBinarySet$byName(dataSet, tags="NGC", chipType="*", pattern="confidenceScores", paths="callData");
print(csN);

 

This gives:

 

AromaUnitSignalBinarySet:

Name: TCGA

Tags: GBM,BeadStudio,XY,NGC

Full name: TCGA,GBM,BeadStudio,XY,NGC

Number of files: 1

Names: TCGA-02-0001 [1]

Path (to the first file): callData/TCGA,GBM,BeadStudio,XY,NGC/HumanHap550

Total file size: 2.14 MB

RAM: 0.00MB