Vignette: Naive genotype calls and confidence scores

Author: Pierre Neuvial
Created: 2010-03-04
Last updated: 2011-03-07

The TumorBoost method for normalizing allele-specific copy number based on a pair of tumor-normal genotyping microarrays uses genotypes of the normal sample for normalizing allele B fractions in a tumor. Here we show how such genotypes can be obtained from allele B fractions in a normal sample. This is called "naive genotyping" because genotypes are simply called based on local minima of an estimate of the density of allele fractions in the normal sample, as described in Bengtsson, Neuvial, and Speed (2010). We also illustrate how associated genotype confidence score files can be created.

WARNING: This naive genotyper is intended for identifying heterozygous SNPs to be used by TumorBoost and/or parent-specific CN segmentation methods, which are methods that are very forgiving on genotyping errors scattered along the genome. It was not designed to be a per-SNP high-performing genotyper.

Data

totalAndFracBData/
  TCGA,GBM,BeadStudio,XY/
    HumanHap550/
      TCGA-02-0001-01C-01D-0184-06,fracB.asb
      TCGA-02-0001-01C-01D-0184-06,total.asb
      **TCGA-02-0001-10A-01D-0184-06,fracB.asb**
      TCGA-02-0001-10A-01D-0184-06,total.asb

  TCGA,GBM,CRMAv2/
    GenomeWideSNP/
      TCGA-02-0001-01C-01D-0182-01,fracB.asb
      TCGA-02-0001-01C-01D-0182-01,total.asb
      **TCGA-02-0001-10A-01D-0182-01,fracB.asb**
      TCGA-02-0001-10A-01D-0182-01,total.asb

annotationData/
  chipTypes/
    HumanHap550/
      HumanHap550,TCGA,HB20080512.ugp
      HumanHap550,TCGA,HB20100107,unitNames.txt  

    GenomeWideSNP_6/
      GenomeWideSNP_6,Full.CDF
      GenomeWideSNP_6,Full,na26,HB20080821.ugp

Note: above we are showing the same data sets as in the TumorBoost vignette, but in the present vignette only data files corresponding to allelic ratios in normal samples are used: here, "TCGA--10A-,fracB.asb", for normal blood.

Setup

library("aroma.cn")
log <- verbose <- Arguments$getVerbose(-8, timestamp=TRUE)
rootPath <- "totalAndFracBData"
rootPath <- Arguments$getReadablePath(rootPath)

dataSets <- c("TCGA,GBM,BeadStudio,XY", "TCGA,GBM,CRMAv2")
dataSet <- dataSets[1]  ## Work with Illumina data

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Load the raw (tumor,normal) data set
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ds <- AromaUnitFracBCnBinarySet$byName(dataSet, chipType="*", paths=rootPath)
setFullNamesTranslator(ds, function(names, ...) {
  pattern <- "^(TCGA-[0-9]{2}-[0-9]{4})-([0-9]{2}[A-Z])[-]*(.*)"
  gsub(pattern, "\\1,\\2,\\3", names)
})
print(ds)

This gives

AromaUnitFracBCnBinarySet:
Name: TCGA
Tags: GBM,BeadStudio,XY
Full name: TCGA,GBM,BeadStudio,XY
Number of files: 2
Names: TCGA-02-0001, TCGA-02-0001 [2]
Path (to the first file): totalAndFracBData/TCGA,GBM,BeadStudio,XY/HumanHap550
Total file size: 4.28 MB
RAM: 0.00MB

Extract the normals

types <- sapply(ds, FUN=function(df) getTags(df)[1])
normals <- grep("(10|11)[A-Z]", types)
dsN <- ds[normals]
print(dsN)

This gives

AromaUnitFracBCnBinarySet:
Name: TCGA
Tags: GBM,BeadStudio,XY
Full name: TCGA,GBM,BeadStudio,XY
Number of files: 1
Names: TCGA-02-0001 [1]
Path (to the first file): totalAndFracBData/TCGA,GBM,BeadStudio,XY/HumanHap550
Total file size: 2.14 MB
RAM: 0.00MB

Naive genotype calling and associated confidence scores

fullname <- paste(c(getFullName(dsN), "NGC"), collapse=",")
chipType <- getChipType(dsN, fullname=FALSE)
outPath <- file.path("callData", fullname, chipType)

units <- NULL
if (is.null(units)) {
  df <- dsN[[1]]
  units <- seq(length=nbrOfUnits(df))
}

adjust <- 1.5

# Identify units on ChrX and ChrY
ugp <- getAromaUgpFile(dsN)
units23 <- getUnitsOnChromosome(ugp, 23)
is23 <- is.element(units, units23)
units24 <- getUnitsOnChromosome(ugp, 24)
is24 <- is.element(units, units24)

kk <- 1
dfN <- dsN[[kk]]

tags <- getTags(dfN)
tags <- setdiff(tags, "fracB")
tags <- c(tags, "genotypes")
fullname <- paste(c(getName(dfN), tags), collapse=",")

filename <- sprintf("%s.acf", fullname)
gcPathname <- Arguments$getWritablePathname(filename, path=outPath, mustNotExist=FALSE)

csTags <- c(tags, "confidenceScores")
fullname <- paste(c(getName(dfN), csTags), collapse=",")
filename <- sprintf("%s.acf", fullname)
csPathname <- Arguments$getWritablePathname(filename, path=outPath, mustNotExist=FALSE)

if (isFile(gcPathname) && isFile(csPathname)) {
  next
}

betaN <- dfN[units,1,drop=TRUE]

# Call gender
gender <- callXXorXY(betaN[is23], betaN[is24], adjust=adjust, from=0, to=1)

# Call genotypes
naValue <- as.double(NA)
fit <- NULL
mu <- rep(naValue, times=length(units))
cs <- rep(naValue, times=length(units))

if (gender == "XY") {
  # All but ChrX & ChrY in male
  isDiploid <- (!(is23 | is24))
  use <- which(isDiploid)
  muT <- callNaiveGenotypes(betaN[use], cn=2, adjust=adjust, from=0, to=1,
                                        verbose=less(verbose,10))
  fit <- attr(muT, 'modelFit')
  mu[use] <- muT
  use <- which(!isDiploid)
  muT <- callNaiveGenotypes(betaN[use], cn=1, adjust=adjust, from=0, to=1,
                                         verbose=less(verbose,10))
  mu[use] <- muT
} else {
  # All but ChrY in female
  isDiploid <- (!is24)
  use <- which(isDiploid)
  muT <- callNaiveGenotypes(betaN[use], cn=2, adjust=adjust, from=0, to=1,
                                        verbose=less(verbose,10))
  fit <- attr(muT, 'modelFit')
  mu[use] <- muT
}
print(table(mu, exclude=NULL))

# Translate genotype calls in fracB space to (AA,AB,BB,...)
calls <- rep(as.character(NA), times=length(mu))
calls[mu ==   0] <- "AA"
calls[mu == 1/2] <- "AB"
calls[mu ==   1] <- "BB"
print(table(calls, exclude=NULL))

# Calculate confidence scores
a <- fit[[1]]$fitValleys$x[1]
b <- fit[[1]]$fitValleys$x[2]
cs[isDiploid] <- rowMins(abs(cbind(betaN[isDiploid]-a, betaN[isDiploid]-b)))
print(table(mu, exclude=NULL))

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Writing genotype calls (via temporary file)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
pathname <- gcPathname
pathnameT <- sprintf("%s.tmp", pathname)
nbrOfUnits <- nbrOfUnits(dfN)
gfN <- AromaUnitGenotypeCallFile$allocate(pathnameT, platform=getPlatform(dfN), chipType=getChipType(dfN), nbrOfRows=nbrOfUnits)
footer <- readFooter(gfN)
footer$method <- "NaiveGenotypeCaller"
writeFooter(gfN, footer)

updateGenotypes(gfN, units=units, calls=calls)

res <- file.rename(pathnameT, pathname)
if (!isFile(pathname)) {
  throw("Failed to rename temporary file: ", pathnameT, " -> ", pathname)
}
if (isFile(pathnameT)) {
  throw("Failed to rename temporary file: ", pathnameT, " -> ", pathname)
}

gfN <- AromaUnitGenotypeCallFile(pathname)

print(gfN)




# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Writing confidence scores (via temporary file)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
pathname <- csPathname
pathnameT <- sprintf("%s.tmp", pathname)
nbrOfUnits <- nbrOfUnits(dfN)
csfN <- AromaUnitSignalBinaryFile$allocate(pathnameT, platform=getPlatform(dfN), chipType=getChipType(dfN), nbrOfRows=nbrOfUnits, types="double", size=4, signed=TRUE)
footer <- readFooter(csfN)
footer$method <- "NaiveGenotypeConfidenceScoreEstimator"
writeFooter(csfN, footer)

csfN[units, 1] <- cs

res <- file.rename(pathnameT, pathname)
if (!isFile(pathname)) {
  throw("Failed to rename temporary file: ", pathnameT, " -> ", pathname)
}
if (isFile(pathnameT)) {
  throw("Failed to rename temporary file: ", pathnameT, " -> ", pathname)
}

cfN <- AromaUnitSignalBinaryFile(pathname)
print(cfN)

The data sets created are set up as follows:

callData/
  TCGA,GBM,BeadStudio,XY,NGC/
    HumanHap550/
      TCGA-02-0001,10A,01D-0184-06,genotypes,confidenceScores.acf
      TCGA-02-0001,10A,01D-0184-06,genotypes.acf

They can be loaded by:

gcN <- AromaUnitGenotypeCallSet$byName(dataSet, tags="NGC", chipType="*")
print(gcN)

This gives:

AromaUnitGenotypeCallSet:Name: TCGA
Tags: GBM,BeadStudio,XY,NGC
Full name: TCGA,GBM,BeadStudio,XY,NGC
Number of files: 1
Names: TCGA-02-0001 [1]
Path (to the first file): callData/TCGA,GBM,BeadStudio,XY,NGC/HumanHap550
Total file size: 1.07 MB
RAM: 0.00MB

csN <- AromaUnitSignalBinarySet$byName(dataSet, tags="NGC", chipType="*", pattern="confidenceScores", paths="callData")
print(csN)

This gives:

AromaUnitSignalBinarySet:
Name: TCGA
Tags: GBM,BeadStudio,XY,NGC
Full name: TCGA,GBM,BeadStudio,XY,NGC
Number of files: 1
Names: TCGA-02-0001 [1]
Path (to the first file): callData/TCGA,GBM,BeadStudio,XY,NGC/HumanHap550
Total file size: 2.14 MB
RAM: 0.00MB

References

[1] H. Bengtsson, P. Neuvial, and T. P. Speed. "TumorBoost: normalization of allele-specific tumor copy numbers from a single pair of tumor-normal genotyping microarrays". Eng. In: BMC bioinformatics 11 (May. 2010), p. 245. ISSN: 1471-2105. DOI: 10.1186/1471-2105-11-245. PMID: 20462408.