1 # POD at __END__, let the code begin...
3 package Bio
::SeqIO
::fastq
;
6 use Bio
::Seq
::SeqFactory
;
8 use base
qw(Bio::SeqIO);
30 $self->SUPER::_initialize
(@args);
31 my ($variant, $validate, $header) = $self->_rearrange([qw(VARIANT
33 QUALITY_HEADER)], @args);
34 $variant ||= 'sanger';
35 $self->variant($variant);
36 $self->_init_tables($variant);
37 $validate = defined $validate ?
$validate : 1;
38 $self->validate($validate);
39 $header && $self->quality_header($header);
41 if( ! defined $self->sequence_factory ) {
42 $self->sequence_factory(Bio
::Seq
::SeqFactory
->new(
43 -verbose
=> $self->verbose(),
44 -type
=> 'Bio::Seq::Quality')
51 while (defined(my $data = $self->next_dataset)) {
52 # Are FASTQ sequences w/o any sequence valid? Removing for now
54 my $seq = $self->sequence_factory->create(%$data);
66 # speed this up by directly accessing the filehandle and in-lining the
67 # _readline stuff vs. making the repeated method calls. Tradeoff is speed
70 # we can probably normalize line endings using PerlIO::eol or
74 my $line = $self->{lastline
} || <$fh>;
77 while (defined $line) {
78 $line =~ s/\015\012/\012/;
80 if ($mode eq '-seq' && $line =~ m{^@([^\n]+)$}xmso) {
81 $data->{-descriptor
} = $1;
83 if ($data->{-descriptor
} =~ /^\s*(\S+)\s*(.*)/) {
84 ($id,$fulldesc) = ($1, $2);
86 $self->throw("Can't parse fastq header");
89 $data->{-desc
} = $fulldesc;
90 $data->{-namespace
} = $self->variant;
91 } elsif ($mode eq '-seq' && $line =~ m{^\+([^\n]*)}xmso) {
93 $self->throw("No description line parsed") unless $data->{-descriptor
};
94 if ($desc && $data->{-descriptor
} ne $desc) {
95 $self->throw("Quality descriptor [$desc] doesn't match seq ".
96 "descriptor ".$data->{-descriptor
}.", line: $." );
98 $mode = '-raw_quality';
100 if ($mode eq '-raw_quality' && defined($data->{-raw_quality
}) &&
101 (length($data->{-raw_quality
}) >= length($data->{-seq
}))) {
102 $self->{lastline
} = $line;
107 delete $self->{lastline
};
110 $data->{$mode} .= $line
113 if (!defined $line) {
114 delete $self->{lastline
};
120 if (!$data->{-seq
} || !defined($data->{-raw_quality
})) {
121 $self->throw("Missing sequence and/or quality data; line: $.");
124 # simple quality control tests
125 if (length $data->{-seq
} != length $data->{-raw_quality
}) {
126 $self->throw("Quality string [".$data->{-raw_quality
}."] of length [".
127 length($data->{-raw_quality
})."]\ndoesn't match length of sequence ".
128 $data->{-seq
}."\n[".length($data->{-seq
})."], line: $.");
131 $data->{-qual
} = [map {
132 if ($self->{_validate_qual
} && !exists($self->{chr2qual
}->{$_})) {
133 $self->throw("Unknown symbol with ASCII value ".ord($_)." outside ".
137 $self->variant eq 'solexa' ?
138 $self->{sol2phred
}->{$self->{chr2qual
}->{$_}}:
139 $self->{chr2qual
}->{$_};
140 } unpack("A1" x
length($data->{-raw_quality
}), $data->{-raw_quality
})];
144 # This should be creating fastq output only. Bio::SeqIO::fasta and
145 # Bio::SeqIO::qual should be used for that output
148 my ($self,@seq) = @_;
149 my $var = $self->variant;
150 foreach my $seq (@seq) {
151 unless ($seq->isa("Bio::Seq::Quality")){
152 $self->warn("You can't write FASTQ without supplying a Bio::Seq::".
153 "Quality object! ".ref($seq)."\n");
156 my $str = $seq->seq || '';
157 my @qual = @
{$seq->qual};
159 # this should be the origin of the sequence (illumina, solexa, sanger)
160 my $ns= $seq->namespace;
162 my $top = $seq->display_id();
163 if (my $desc = $seq->desc()) {
169 ($ns eq 'solexa' && $var eq 'solexa') ?
$self->{phred_fp2chr
} :
170 ($var eq 'solexa') ?
$self->{phred_int2chr
} :
175 $q = sprintf("%.0f", $q) if ($var ne 'solexa' && $ns eq 'solexa');
176 if (exists $qual_map->{$q}) {
177 $qual .= $qual_map->{$q};
180 # fuzzy mapping, for edited qual scores
181 my $qr = sprintf("%.0f",$q);
182 my $bounds = sprintf("%.1f-%.1f",$qr-0.5, $qr+0.5);
183 if (exists $self->{fuzzy_qual2chr
}->{$bounds}) {
184 $qual .= $self->{fuzzy_qual2chr
}->{$bounds};
187 my $rep = ($q <= $self->{qual_start
}) ?
188 $qual_map->{$self->{qual_start
}} : $qual_map->{$self->{qual_end
}};
194 if ($self->{_validate_qual
} && %bad_qual) {
195 $self->warn("Data loss for $var: following values not found\n".
196 join(',',sort {$a <=> $b} keys %bad_qual))
198 $self->_print("\@",$top,"\n",$str,"\n") or return;
199 $self->_print("+",($self->{_quality_header
} ?
$top : ''),"\n",$qual,"\n") or return;
205 my ($self,@seq) = @_;
206 return $self->write_seq(@seq);
210 my ($self,@seq) = @_;
211 if (!exists($self->{fasta_proxy
})) {
212 $self->{fasta_proxy
} = Bio
::SeqIO
->new(-format
=> 'fasta', -fh
=> $self->_fh);
214 return $self->{fasta_proxy
}->write_seq(@seq);
218 my ($self,@seq) = @_;
219 if (!exists($self->{qual_proxy
})) {
220 $self->{qual_proxy
} = Bio
::SeqIO
->new(-format
=> 'qual', -fh
=> $self->_fh);
222 return $self->{qual_proxy
}->write_seq(@seq);
225 # variant() method inherited from Bio::Root::IO
228 my ($self, $var) = @_;
229 # cache encode/decode values for quicker accession
230 ($self->{qual_start
}, $self->{qual_end
}, $self->{qual_offset
}) =
231 @
{ $variant{$var} }{qw(qual_start qual_end offset)};
232 if ($var eq 'solexa') {
233 for my $q ($self->{qual_start
} .. $self->{qual_end
}) {
234 my $char = chr($q + $self->{qual_offset
});
235 $self->{chr2qual
}->{$char} = $q;
236 $self->{qual2chr
}->{$q} = $char;
237 my $s2p = 10 * log(1 + 10 ** ($q / 10.0)) / log(10);
239 # solexa <=> solexa mapping speedup (retain floating pt precision)
240 $self->{phred_fp2chr
}->{$s2p} = $char;
241 $self->{sol2phred
}->{$q} = $s2p;
243 # this is for mapping values fuzzily (fallback)
244 $self->{fuzzy_qual2chr
}->{sprintf("%.1f-%.1f",$q - 0.5, $q + 0.5)} = $char;
246 next if $q < 0; # skip loop; PHRED scores greater than 0
247 my $p2s = sprintf("%.0f",($q <= 1) ?
-5 : 10 * log(-1 + 10 ** ($q / 10.0)) / log(10));
248 # sanger/illumina PHRED <=> Solexa char mapping speedup
249 $self->{phred_int2chr
}->{$q} = chr($p2s + $self->{qual_offset
});
252 for my $c ($self->{qual_start
}..$self->{qual_end
}) {
254 my $char = chr($c + $self->{qual_offset
});
255 $self->{chr2qual
}->{$char} = $c;
256 $self->{qual2chr
}->{$c} = $char;
257 # this is for mapping values not found with above
258 $self->{fuzzy_qual2chr
}->{sprintf("%.1f-%.1f",$c - 0.5, $c + 0.5)} = $char;
264 my ($self, $val) = @_;
266 $self->{_validate_qual
} = $val;
268 return $self->{_validate_qual
};
272 my ($self, $val) = @_;
274 $self->{_quality_header
} = $val;
276 return $self->{_quality_header
} || 0;
283 # BioPerl module for Bio::SeqIO::fastq
285 # Please direct questions and support issues to <bioperl-l@bioperl.org>
287 # Cared for Chris Fields
289 # Completely refactored from the original FASTQ parser
290 # by Tony Cox <avc@sanger.ac.uk>
292 # Copyright Chris Fields
294 # You may distribute this module under the same terms as perl itself
298 # October 29, 2001 incept data (Tony Cox)
299 # June 20, 2009 updates for Illumina variant FASTQ formats for Solexa and later
300 # Aug 26, 2009 fixed bugs and added tests for fastq.t
302 # POD documentation - main docs before the code
306 Bio::SeqIO::fastq - fastq sequence input/output stream
310 ################## pertains to FASTQ parsing only ##################
312 # grabs the FASTQ parser, specifies the Illumina variant
313 my $in = Bio::SeqIO->new(-format => 'fastq-illumina',
314 -file => 'mydata.fq');
316 # simple 'fastq' format defaults to 'sanger' variant
317 my $out = Bio::SeqIO->new(-format => 'fastq',
318 -file => '>mydata.fq');
320 # $seq is a Bio::Seq::Quality object
321 while (my $seq = $in->next_seq) {
322 $out->write_seq($seq); # convert Illumina 1.3 to Sanger format
325 # for 4x faster parsing, one can do something like this for raw data
326 use Bio::Seq::Quality;
328 # $data is a hash reference containing all arguments to be passed to
329 # the Bio::Seq::Quality constructor
330 while (my $data = $in->next_dataset) {
331 # process $data, such as trim, etc
332 my $seq = Bio::Seq::Quality->new(%$data);
334 # for now, write_seq only accepts Bio::Seq::Quality, but may be modified
335 # to allow raw hash references for speed
336 $out->write_seq($data);
341 This object can transform Bio::Seq and Bio::Seq::Quality objects to and from
342 FASTQ flat file databases.
344 FASTQ is a file format used frequently at the Sanger Centre and in next-gen
345 sequencing to bundle a FASTA sequence and its quality data. A typical FASTQ
346 entry takes the form:
349 GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT.....
351 !''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65.....
355 @ = descriptor, followed by one or more sequence lines
356 + = optional descriptor (if present, must match first one), followed by one or
359 When writing FASTQ output the redundant descriptor following the '+' is by
360 default left off to save disk space. If needed, one can set the quality_header()
361 flag in order for this to be printed.
363 =head2 FASTQ and Bio::Seq::Quality mapping
365 FASTQ files have sequence and quality data on single line or multiple lines, and
366 the quality values are single-byte encoded. Data are mapped very simply to
367 Bio::Seq::Quality instances:
369 Data Bio::Seq::Quality method
370 ------------------------------------------------------------------------
371 first non-whitespace chars in descriptor id^
372 descriptor line desc^
375 FASTQ variant namespace
377 ^ first nonwhitespace chars are id(), everything else after (to end of line)
379 * Converted to PHRED quality scores where applicable ('solexa')
381 =head2 FASTQ variants
383 This parser supports all variants of FASTQ, including Illumina v 1.0 and 1.3:
386 -----------------------------------------------------------
388 solexa Solexa, Inc. (2004), aka Illumina 1.0
389 illumina Illumina 1.3
391 The variant can be specified by passing by either passing the additional
392 -variant parameter to the constructor:
394 my $in = Bio::SeqIO->new(-format => 'fastq',
395 -variant => 'solexa',
396 -file => 'mysol.fq');
398 or by passing the format and variant together (Bio::SeqIO will now handle
399 this and convert it accordingly to the proper argument):
401 my $in = Bio::SeqIO->new(-format => 'fastq-solexa',
402 -file => 'mysol.fq');
404 Variants can be converted back and forth from one another; however, due to
405 the difference in scaling for solexa quality reads, converting from 'illumina'
406 or 'sanger' FASTQ to solexa is not recommended.
412 User feedback is an integral part of the evolution of this and other
413 Bioperl modules. Send your comments and suggestions preferably to one
414 of the Bioperl mailing lists. Your participation is much appreciated.
416 bioperl-l@bioperl.org - General discussion
417 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
421 Please direct usage questions or support issues to the mailing list:
423 I<bioperl-l@bioperl.org>
425 rather than to the module maintainer directly. Many experienced and
426 reponsive experts will be able look at the problem and quickly
427 address it. Please include a thorough description of the problem
428 with code and data examples if at all possible.
430 =head2 Reporting Bugs
432 Report bugs to the Bioperl bug tracking system to help us keep track
433 the bugs and their resolution. Bug reports can be submitted via the
436 https://github.com/bioperl/bioperl-live/issues
438 =head1 AUTHORS - Chris Fields (taken over from Tony Cox)
440 Email: cjfields at bioperl dot org
444 The rest of the documentation details each of the object
445 methods. Internal methods are usually preceded with a _
447 =head1 Bio::SeqIO interface methods
452 Usage : $seq = $stream->next_seq()
453 Function : returns the next sequence in the stream
454 Returns : Bio::Seq::Quality object
461 Usage : $stream->write_seq(@seq)
462 Function : writes the $seq object into the stream
463 Returns : 1 for success and 0 for error
464 Args : Bio::Seq::Quality
465 Note : This now conforms to SeqIO spec (module output is same format as
472 Usage : $format = $obj->variant();
473 Function: Get and set method for the quality sequence variant. This is
474 important for indicating the encoding/decoding to be used for
477 Current values accepted are:
478 'sanger' (original FASTQ)
479 ASCII encoding from 33-126, PHRED quality score from 0 to 93
480 'solexa' (aka illumina1.0)
481 ASCII encoding from 59-104, SOLEXA quality score from -5 to 40
482 'illumina' (aka illumina1.3)
483 ASCII encoding from 64-104, PHRED quality score from 0 to 40
485 (Derived from the MAQ website):
486 For 'solexa', scores are converted to PHRED qual scores using:
487 $Q = 10 * log(1 + 10 ** (ord($sq) - 64) / 10.0)) / log(10)
491 Args : new value, string
493 =head1 Plugin-specific methods
498 Usage : $obj->next_dataset
499 Function : returns a hash reference containing the parsed data
500 Returns : hash reference
507 Usage : $stream->write_fastq(@seq)
508 Function: writes the $seq object into the stream
509 Returns : 1 for success and 0 for error
510 Args : Bio::Seq::Quality object
511 Status : Deprecated (delegates to write_seq)
516 Usage : $stream->write_fasta(@seq)
517 Function: writes the $seq object into the stream
518 Returns : 1 for success and 0 for error
519 Args : Bio::Seq object
520 Note : This method does not currently delegate to Bio::SeqIO::fasta
521 (maybe it should?). Not sure whether we should keep this as a
528 Usage : $stream->write_qual(@seq)
529 Function: writes the $seq object into the stream
530 Returns : 1 for success and 0 for error
531 Args : Bio::Seq::Quality object
532 Note : This method does not currently delegate to Bio::SeqIO::qual
533 (maybe it should?). Not sure whether we should keep this as a
540 Usage : $obj->validate(0)
541 Function : flag for format/qual range validation - default is 1, validate
544 Status : Stable (may be moved to interface)
546 =head2 quality_header
548 Title : quality_header
549 Usage : $obj->quality_header
550 Function : flag for printing quality header - default is 0, no header
553 Status : Unstable (name may change dep. on feedback)