aroma.affymetrix 1.7.0
aroma.cn 0.5.0
What's new?
Author: Henrik Bengtsson
Last updated: 2010-06-05
Aroma ACS files are binary files storing probe sequences for a particular chip type. They contain a (cell, probe sequence, strand) map in a tabular format, where the cells are implicit; that is, they are not stored in the file, but instead the index of each row corresponds to the index of the cell.
Note that, contrary to many other chip type annotation files, the probe-sequence annotation never changes (except for correction of errors) and remains the same throughout the lifetime of a chip type. However, in order to differentiate between different versions of ACS files in case they are updated, we still suggest using a tag reflecting when and by whom the file was created.
acs <- AromaCellSequenceFile$byChipType("GenomeWideSNP_6");
cells <- c(100:103, 9848:9851);
seqs <- readSequences(acs, cells=cells);
print(seqs);
[1] "GCCACACTATGTCTTATTGGTTACA" "CCAACCATTCATTATGTAGGTCCGT"
[3] "CCAACCATTCATTGTGTAGGTCCGT" "CCATACCTCGATTGTGGTGGTAGTT"
[5] NA                          "ATCTGCCCAGTGGACCAGCACAGAG"
[7] "ATCTGCCCAATGGACCAGCACAGAG" "CTTCAATGACCAGACTGGACGAATA"
seqs <- readSequenceMatrix(acs, cells=cells, positions=10:15); print(seqs);
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] "T" "G" "T" "C" "T" "T"
[2,] "C" "A" "T" "T" "A" "T"
[3,] "C" "A" "T" "T" "G" "T"
[4,] "G" "A" "T" "T" "G" "T"
[5,] NA NA NA NA NA NA
[6,] "G" "T" "G" "G" "A" "C"
[7,] "A" "T" "G" "G" "A" "C"
[8,] "C" "C" "A" "G" "A" "C"
attr(,"map")
A C G T
00 01 02 03 04
After downloading and extracting the Affymetrix probe-tab file for the chip type, place it in the correct annotationData/ directory. This way it will be found automatically.
chipType <- "Mapping50K_Hind240";
ptb <- AffymetrixProbeTabFile$byChipType(chipType);
print(ptb);
AffymetrixProbeTabFile:
Name: Mapping50K_Hind240
Tags:
Full name: Mapping50K_Hind240
Pathname: annotationData/chipTypes/Mapping50K_Hind240/Mapping50K_Hind240.probe_tab
RAM: 0.01 MB
Number of data rows: NA
Columns [NA]:
Number of text lines: NA
# Allocate ACS of same chip type and probe dimensions as the CDF
cdf <- AffymetrixCdfFile$byChipType(chipType);
acs <- AromaCellSequenceFile$allocateFromCdf(cdf, tags="HB20091208");
print(acs);
AromaCellSequenceFile:
Name: Mapping50K_Hind240
Tags: HB20091208
Full name: Mapping50K_Hind240,HB20091208
Pathname: annotationData/chipTypes/Mapping50K_Hind240/Mapping50K_Hind240,HB20091208.acs
File size: 63.48 MB (66560262 bytes)
RAM: 0.00 MB
Number of data rows: 2560000
File format: v1
Dimensions: 2560000x26
Column classes: raw, raw, raw, raw, raw, raw, raw, raw, raw, raw, raw, raw, raw, raw, raw, raw, raw, raw, raw, raw, raw, raw, raw, raw, raw, raw
Number of bytes per column: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
Footer: 20091209 20:47:56 PSTAffymetrixMapping50K_Hind240
Chip type: Mapping50K_Hind240
Platform: Affymetrix
# Import from the probe-tab file (which contains only PMs). This will also update the file footer.
importFrom(acs, ptb, verbose=-10);
# For chips with MMs, infer them from the PM sequences (will give an error if no MMs)
inferMmFromPm(acs, cdf=cdf, verbose=-10);
Consider a custom chip type 'MitoP-2r520651' for which we have the probe sequences in the following tab-delimited file:
chipType <- "MitoP-2r520651";
cdf <- AffymetrixCdfFile$byChipType(chipType);
db <- TabularTextFile("MitoP-2r520651_probeSequences.txt", path=getPath(cdf));
print(db);
TabularTextFile:
Name: MitoP-2r520651_probeSequences
Tags:
Full name: MitoP-2r520651_probeSequences
Pathname: annotationData/chipTypes/MitoP-2r520651/MitoP-2r520651_probeSequences.txt
File size: 414.74 MB (434886759 bytes)
RAM: 0.01 MB
Number of data rows: NA
Columns [7]: 'Probe Set Name', 'Probe X', 'Probe Y', 'Probe Interrogation Position', 'Probe Sequence', 'Target Strandedness', 'Probe Type'
Number of text lines: NA
Just to give the details of this example file:
print(readLines(db, n=4))
[1] "Probe Set Name\tProbe X\tProbe Y\tProbe Interrogation Position\tProbe Sequence\tTarget Strandedness\tProbe Type"
[2] "ENSE00000931001\t6\t0\t95\tTTATTGGCGTGTCTACTGATCCAGC\tAntisense\tMM"
[3] "ENSE00000931001\t7\t0\t95\tTTATTGGCGTGTTTACTGATCCAGC\tAntisense\tPM"
[4] "ENSE00001310320\t8\t0\t23\tTTTGATAGGTGTCTGAATTCCAGTT\tAntisense\tMM"
# In order to assign sequences to the ACS file, we need the (x,y) probe coordinate
# in addition to the sequence. We also need the strand information (if available).
# We could read all columns, but for large files it is more efficient to read only the
# columns of interest.
colClassPattern <- c("^Probe (X|Y)"="integer", "^(Probe Sequence|Target Strandedness)$"="character");
df <- readDataFrame(db, colClassPattern=colClassPattern);
print(df[1:3,]);
Probe X Probe Y Probe Sequence Target Strandedness
1 6 0 TTATTGGCGTGTCTACTGATCCAGC Antisense
2 7 0 TTATTGGCGTGTTTACTGATCCAGC Antisense
3 8 0 TTTGATAGGTGTCTGAATTCCAGTT Antisense
# The ACS format can only hold 25-mers. However, the input data file
# contains a few exceptions to this:
lens <- nchar(df[["Probe Sequence"]]);
print(table(lens));
19 21 23 25
72 208 5856 6534912
# So, we'll only keep sequences that are 25-mers
df <- df[(lens == 25),];
# Translate (x,y) to cell indices
# More: help("2. Cell coordinates and cell indices", package="affxparser").
x <- df[["Probe X"]];
y <- df[["Probe Y"]];
cells <- nbrOfColumns(cdf) * y + x + 1L;
str(cells);
num [1:6534912] 7 8 9 10 19 20 21 22 23 24 ...
# Extract sequences and target strand
seqs <- df[["Probe Sequence"]];
strands <- df[["Target Strandedness"]];
# Not needed anymore
rm(df, lens);
# Allocate empty ACS file
acs <- AromaCellSequenceFile$allocateFromCdf(cdf, tags="HB20091208");
print(acs);
# Assign sequences and target strands to ACS file*
updateSequences(acs, cells=cells, seqs=seqs, verbose=-10);
updateTargetStrands(acs, cells=cells, strands=strands, verbose=-10);
# Update the file footer
footer <- readFooter(acs);
footer$srcFile <- list(filename=getFilename(db), checksum=getChecksum(db));
footer$createdBy <- list(name="Henrik Bengtsson", email="foo@stat.berkeley.edu");
writeFooter(acs, footer);
# Done
print(acs)
AromaCellSequenceFile:
Name: MitoP-2r520651
Tags: HB20091208
Full name: MitoP-2r520651,HB20091208
Pathname: annotationData/chipTypes/MitoP-2r520651/MitoP-2r520651,HB20091208.acs
File size: 162.50 MB (170394079 bytes)
RAM: 0.00 MB
File format: v1
Dimensions: 6553600x26
Column classes: raw, raw, raw, raw, raw, raw, raw, raw, raw, raw, raw, raw, raw, raw, raw, raw, raw, raw, raw, raw, raw, raw, raw, raw, raw, raw
Number of bytes per column: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
Footer: 20091209 21:43:55 PSTAffymetrixMitoP-2r520651MitoP-2r520651_probeSequences.txt94e1610d112228e0d51efceac19de17d
Henrik Beng...@stat.berkeley.edu
Chip type: MitoP-2r520651
Platform: Affymetrix
* If you run out of memory while assigning the sequences, you can do it in chunks, which is more memory efficient, e.g.
dummy <- lapplyInChunks(seq(along=cells), function(rr) {
updateSequences(acs, cells=cells[rr], seqs=seqs[rr], verbose=-10);
}, chunkSize=500e3)