maint: restructure to use Dist::Zilla
[bioperl-live.git] / lib / Bio / SeqIO / fastq.pm
blobf17d4e05ce7f0d48c65f71709135c251343ea45f
1 # POD at __END__, let the code begin...
3 package Bio::SeqIO::fastq;
4 use strict;
6 use Bio::Seq::SeqFactory;
8 use base qw(Bio::SeqIO);
10 our %variant = (
11 sanger => {
12 'offset' => 33,
13 'qual_start' => 0,
14 'qual_end' => 93
16 solexa => {
17 'offset' => 64,
18 'qual_start' => -5,
19 'qual_end' => 62
21 illumina => {
22 'offset' => 64,
23 'qual_start' => 0,
24 'qual_end' => 62
28 sub _initialize {
29 my($self,@args) = @_;
30 $self->SUPER::_initialize(@args);
31 my ($variant, $validate, $header) = $self->_rearrange([qw(VARIANT
32 VALIDATE
33 QUALITY_HEADER)], @args);
34 $variant ||= 'sanger';
35 $self->variant($variant);
36 $self->_init_tables($variant);
37 $validate = defined $validate ? $validate : 1;
38 $self->validate($validate);
39 $header && $self->quality_header($header);
41 if( ! defined $self->sequence_factory ) {
42 $self->sequence_factory(Bio::Seq::SeqFactory->new(
43 -verbose => $self->verbose(),
44 -type => 'Bio::Seq::Quality')
49 sub next_seq {
50 my( $self ) = @_;
51 while (defined(my $data = $self->next_dataset)) {
52 # Are FASTQ sequences w/o any sequence valid? Removing for now
53 # -cjfields 6.22.09
54 my $seq = $self->sequence_factory->create(%$data);
55 return $seq;
57 return;
60 # pure perl version
61 sub next_dataset {
62 my $self = shift;
63 local $/ = "\n";
64 my $data;
65 my $mode = '-seq';
66 # speed this up by directly accessing the filehandle and in-lining the
67 # _readline stuff vs. making the repeated method calls. Tradeoff is speed
68 # over repeated code.
70 # we can probably normalize line endings using PerlIO::eol or
71 # Encode::Newlines
73 my $fh = $self->_fh;
74 my $line = $self->{lastline} || <$fh>;
76 FASTQ:
77 while (defined $line) {
78 $line =~ s/\015\012/\012/;
79 $line =~ tr/\015/\n/;
80 if ($mode eq '-seq' && $line =~ m{^@([^\n]+)$}xmso) {
81 $data->{-descriptor} = $1;
82 my ($id,$fulldesc);
83 if ($data->{-descriptor} =~ /^\s*(\S+)\s*(.*)/) {
84 ($id,$fulldesc) = ($1, $2);
85 } else {
86 $self->throw("Can't parse fastq header");
88 $data->{-id} = $id;
89 $data->{-desc} = $fulldesc;
90 $data->{-namespace} = $self->variant;
91 } elsif ($mode eq '-seq' && $line =~ m{^\+([^\n]*)}xmso) {
92 my $desc = $1;
93 $self->throw("No description line parsed") unless $data->{-descriptor};
94 if ($desc && $data->{-descriptor} ne $desc) {
95 $self->throw("Quality descriptor [$desc] doesn't match seq ".
96 "descriptor ".$data->{-descriptor}.", line: $." );
98 $mode = '-raw_quality';
99 } else {
100 if ($mode eq '-raw_quality' && defined($data->{-raw_quality}) &&
101 (length($data->{-raw_quality}) >= length($data->{-seq}))) {
102 $self->{lastline} = $line;
103 last FASTQ
105 chomp $line;
106 if ($line =~ /^$/) {
107 delete $self->{lastline};
108 last FASTQ;
110 $data->{$mode} .= $line
112 $line = <$fh>;
113 if (!defined $line) {
114 delete $self->{lastline};
115 last FASTQ;
119 return unless $data;
120 if (!$data->{-seq} || !defined($data->{-raw_quality})) {
121 $self->throw("Missing sequence and/or quality data; line: $.");
124 # simple quality control tests
125 if (length $data->{-seq} != length $data->{-raw_quality}) {
126 $self->throw("Quality string [".$data->{-raw_quality}."] of length [".
127 length($data->{-raw_quality})."]\ndoesn't match length of sequence ".
128 $data->{-seq}."\n[".length($data->{-seq})."], line: $.");
131 $data->{-qual} = [map {
132 if ($self->{_validate_qual} && !exists($self->{chr2qual}->{$_})) {
133 $self->throw("Unknown symbol with ASCII value ".ord($_)." outside ".
134 "of quality range")
135 # TODO: fallback?
137 $self->variant eq 'solexa' ?
138 $self->{sol2phred}->{$self->{chr2qual}->{$_}}:
139 $self->{chr2qual}->{$_};
140 } unpack("A1" x length($data->{-raw_quality}), $data->{-raw_quality})];
141 return $data;
144 # This should be creating fastq output only. Bio::SeqIO::fasta and
145 # Bio::SeqIO::qual should be used for that output
147 sub write_seq {
148 my ($self,@seq) = @_;
149 my $var = $self->variant;
150 foreach my $seq (@seq) {
151 unless ($seq->isa("Bio::Seq::Quality")){
152 $self->warn("You can't write FASTQ without supplying a Bio::Seq::".
153 "Quality object! ".ref($seq)."\n");
154 next;
156 my $str = $seq->seq || '';
157 my @qual = @{$seq->qual};
159 # this should be the origin of the sequence (illumina, solexa, sanger)
160 my $ns= $seq->namespace;
162 my $top = $seq->display_id();
163 if (my $desc = $seq->desc()) {
164 $desc =~ s/\n//g;
165 $top .= " $desc";
167 my $qual = '';
168 my $qual_map =
169 ($ns eq 'solexa' && $var eq 'solexa') ? $self->{phred_fp2chr} :
170 ($var eq 'solexa') ? $self->{phred_int2chr} :
171 $self->{qual2chr};
173 my %bad_qual;
174 for my $q (@qual) {
175 $q = sprintf("%.0f", $q) if ($var ne 'solexa' && $ns eq 'solexa');
176 if (exists $qual_map->{$q}) {
177 $qual .= $qual_map->{$q};
178 next;
179 } else {
180 # fuzzy mapping, for edited qual scores
181 my $qr = sprintf("%.0f",$q);
182 my $bounds = sprintf("%.1f-%.1f",$qr-0.5, $qr+0.5);
183 if (exists $self->{fuzzy_qual2chr}->{$bounds}) {
184 $qual .= $self->{fuzzy_qual2chr}->{$bounds};
185 next;
186 } else {
187 my $rep = ($q <= $self->{qual_start}) ?
188 $qual_map->{$self->{qual_start}} : $qual_map->{$self->{qual_end}};
189 $qual .= $rep;
190 $bad_qual{$q}++;
194 if ($self->{_validate_qual} && %bad_qual) {
195 $self->warn("Data loss for $var: following values not found\n".
196 join(',',sort {$a <=> $b} keys %bad_qual))
198 $self->_print("\@",$top,"\n",$str,"\n") or return;
199 $self->_print("+",($self->{_quality_header} ? $top : ''),"\n",$qual,"\n") or return;
201 return 1;
204 sub write_fastq {
205 my ($self,@seq) = @_;
206 return $self->write_seq(@seq);
209 sub write_fasta {
210 my ($self,@seq) = @_;
211 if (!exists($self->{fasta_proxy})) {
212 $self->{fasta_proxy} = Bio::SeqIO->new(-format => 'fasta', -fh => $self->_fh);
214 return $self->{fasta_proxy}->write_seq(@seq);
217 sub write_qual {
218 my ($self,@seq) = @_;
219 if (!exists($self->{qual_proxy})) {
220 $self->{qual_proxy} = Bio::SeqIO->new(-format => 'qual', -fh => $self->_fh);
222 return $self->{qual_proxy}->write_seq(@seq);
225 # variant() method inherited from Bio::Root::IO
227 sub _init_tables {
228 my ($self, $var) = @_;
229 # cache encode/decode values for quicker accession
230 ($self->{qual_start}, $self->{qual_end}, $self->{qual_offset}) =
231 @{ $variant{$var} }{qw(qual_start qual_end offset)};
232 if ($var eq 'solexa') {
233 for my $q ($self->{qual_start} .. $self->{qual_end}) {
234 my $char = chr($q + $self->{qual_offset});
235 $self->{chr2qual}->{$char} = $q;
236 $self->{qual2chr}->{$q} = $char;
237 my $s2p = 10 * log(1 + 10 ** ($q / 10.0)) / log(10);
239 # solexa <=> solexa mapping speedup (retain floating pt precision)
240 $self->{phred_fp2chr}->{$s2p} = $char;
241 $self->{sol2phred}->{$q} = $s2p;
243 # this is for mapping values fuzzily (fallback)
244 $self->{fuzzy_qual2chr}->{sprintf("%.1f-%.1f",$q - 0.5, $q + 0.5)} = $char;
246 next if $q < 0; # skip loop; PHRED scores greater than 0
247 my $p2s = sprintf("%.0f",($q <= 1) ? -5 : 10 * log(-1 + 10 ** ($q / 10.0)) / log(10));
248 # sanger/illumina PHRED <=> Solexa char mapping speedup
249 $self->{phred_int2chr}->{$q} = chr($p2s + $self->{qual_offset});
251 } else {
252 for my $c ($self->{qual_start}..$self->{qual_end}) {
253 # PHRED mapping
254 my $char = chr($c + $self->{qual_offset});
255 $self->{chr2qual}->{$char} = $c;
256 $self->{qual2chr}->{$c} = $char;
257 # this is for mapping values not found with above
258 $self->{fuzzy_qual2chr}->{sprintf("%.1f-%.1f",$c - 0.5, $c + 0.5)} = $char;
263 sub validate {
264 my ($self, $val) = @_;
265 if (defined $val) {
266 $self->{_validate_qual} = $val;
268 return $self->{_validate_qual};
271 sub quality_header{
272 my ($self, $val) = @_;
273 if (defined $val) {
274 $self->{_quality_header} = $val;
276 return $self->{_quality_header} || 0;
281 __END__
283 # BioPerl module for Bio::SeqIO::fastq
285 # Please direct questions and support issues to <bioperl-l@bioperl.org>
287 # Cared for Chris Fields
289 # Completely refactored from the original FASTQ parser
290 # by Tony Cox <avc@sanger.ac.uk>
292 # Copyright Chris Fields
294 # You may distribute this module under the same terms as perl itself
296 # _history
298 # October 29, 2001 incept data (Tony Cox)
299 # June 20, 2009 updates for Illumina variant FASTQ formats for Solexa and later
300 # Aug 26, 2009 fixed bugs and added tests for fastq.t
302 # POD documentation - main docs before the code
304 =head1 NAME
306 Bio::SeqIO::fastq - fastq sequence input/output stream
308 =head1 SYNOPSIS
310 ################## pertains to FASTQ parsing only ##################
312 # grabs the FASTQ parser, specifies the Illumina variant
313 my $in = Bio::SeqIO->new(-format => 'fastq-illumina',
314 -file => 'mydata.fq');
316 # simple 'fastq' format defaults to 'sanger' variant
317 my $out = Bio::SeqIO->new(-format => 'fastq',
318 -file => '>mydata.fq');
320 # $seq is a Bio::Seq::Quality object
321 while (my $seq = $in->next_seq) {
322 $out->write_seq($seq); # convert Illumina 1.3 to Sanger format
325 # for 4x faster parsing, one can do something like this for raw data
326 use Bio::Seq::Quality;
328 # $data is a hash reference containing all arguments to be passed to
329 # the Bio::Seq::Quality constructor
330 while (my $data = $in->next_dataset) {
331 # process $data, such as trim, etc
332 my $seq = Bio::Seq::Quality->new(%$data);
334 # for now, write_seq only accepts Bio::Seq::Quality, but may be modified
335 # to allow raw hash references for speed
336 $out->write_seq($data);
339 =head1 DESCRIPTION
341 This object can transform Bio::Seq and Bio::Seq::Quality objects to and from
342 FASTQ flat file databases.
344 FASTQ is a file format used frequently at the Sanger Centre and in next-gen
345 sequencing to bundle a FASTA sequence and its quality data. A typical FASTQ
346 entry takes the form:
348 @HCDPQ1D0501
349 GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT.....
350 +HCDPQ1D0501
351 !''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65.....
353 where:
355 @ = descriptor, followed by one or more sequence lines
356 + = optional descriptor (if present, must match first one), followed by one or
357 more qual lines
359 When writing FASTQ output the redundant descriptor following the '+' is by
360 default left off to save disk space. If needed, one can set the quality_header()
361 flag in order for this to be printed.
363 =head2 FASTQ and Bio::Seq::Quality mapping
365 FASTQ files have sequence and quality data on single line or multiple lines, and
366 the quality values are single-byte encoded. Data are mapped very simply to
367 Bio::Seq::Quality instances:
369 Data Bio::Seq::Quality method
370 ------------------------------------------------------------------------
371 first non-whitespace chars in descriptor id^
372 descriptor line desc^
373 sequence lines seq
374 quality qual*
375 FASTQ variant namespace
377 ^ first nonwhitespace chars are id(), everything else after (to end of line)
378 is in desc()
379 * Converted to PHRED quality scores where applicable ('solexa')
381 =head2 FASTQ variants
383 This parser supports all variants of FASTQ, including Illumina v 1.0 and 1.3:
385 variant note
386 -----------------------------------------------------------
387 sanger original
388 solexa Solexa, Inc. (2004), aka Illumina 1.0
389 illumina Illumina 1.3
391 The variant can be specified by passing by either passing the additional
392 -variant parameter to the constructor:
394 my $in = Bio::SeqIO->new(-format => 'fastq',
395 -variant => 'solexa',
396 -file => 'mysol.fq');
398 or by passing the format and variant together (Bio::SeqIO will now handle
399 this and convert it accordingly to the proper argument):
401 my $in = Bio::SeqIO->new(-format => 'fastq-solexa',
402 -file => 'mysol.fq');
404 Variants can be converted back and forth from one another; however, due to
405 the difference in scaling for solexa quality reads, converting from 'illumina'
406 or 'sanger' FASTQ to solexa is not recommended.
408 =head1 FEEDBACK
410 =head2 Mailing Lists
412 User feedback is an integral part of the evolution of this and other
413 Bioperl modules. Send your comments and suggestions preferably to one
414 of the Bioperl mailing lists. Your participation is much appreciated.
416 bioperl-l@bioperl.org - General discussion
417 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
419 =head2 Support
421 Please direct usage questions or support issues to the mailing list:
423 I<bioperl-l@bioperl.org>
425 rather than to the module maintainer directly. Many experienced and
426 reponsive experts will be able look at the problem and quickly
427 address it. Please include a thorough description of the problem
428 with code and data examples if at all possible.
430 =head2 Reporting Bugs
432 Report bugs to the Bioperl bug tracking system to help us keep track
433 the bugs and their resolution. Bug reports can be submitted via the
434 web:
436 https://github.com/bioperl/bioperl-live/issues
438 =head1 AUTHORS - Chris Fields (taken over from Tony Cox)
440 Email: cjfields at bioperl dot org
442 =head1 APPENDIX
444 The rest of the documentation details each of the object
445 methods. Internal methods are usually preceded with a _
447 =head1 Bio::SeqIO interface methods
449 =head2 next_seq
451 Title : next_seq
452 Usage : $seq = $stream->next_seq()
453 Function : returns the next sequence in the stream
454 Returns : Bio::Seq::Quality object
455 Args : NONE
456 Status : Stable
458 =head2 write_seq
460 Title : write_seq
461 Usage : $stream->write_seq(@seq)
462 Function : writes the $seq object into the stream
463 Returns : 1 for success and 0 for error
464 Args : Bio::Seq::Quality
465 Note : This now conforms to SeqIO spec (module output is same format as
466 next_seq)
467 Status : Stable
469 =head2 variant
471 Title : variant
472 Usage : $format = $obj->variant();
473 Function: Get and set method for the quality sequence variant. This is
474 important for indicating the encoding/decoding to be used for
475 quality data.
477 Current values accepted are:
478 'sanger' (original FASTQ)
479 ASCII encoding from 33-126, PHRED quality score from 0 to 93
480 'solexa' (aka illumina1.0)
481 ASCII encoding from 59-104, SOLEXA quality score from -5 to 40
482 'illumina' (aka illumina1.3)
483 ASCII encoding from 64-104, PHRED quality score from 0 to 40
485 (Derived from the MAQ website):
486 For 'solexa', scores are converted to PHRED qual scores using:
487 $Q = 10 * log(1 + 10 ** (ord($sq) - 64) / 10.0)) / log(10)
490 Returns : string
491 Args : new value, string
493 =head1 Plugin-specific methods
495 =head2 next_dataset
497 Title : next_dataset
498 Usage : $obj->next_dataset
499 Function : returns a hash reference containing the parsed data
500 Returns : hash reference
501 Args : none
502 Status : Stable
504 =head2 write_fastq
506 Title : write_fastq
507 Usage : $stream->write_fastq(@seq)
508 Function: writes the $seq object into the stream
509 Returns : 1 for success and 0 for error
510 Args : Bio::Seq::Quality object
511 Status : Deprecated (delegates to write_seq)
513 =head2 write_fasta
515 Title : write_fasta
516 Usage : $stream->write_fasta(@seq)
517 Function: writes the $seq object into the stream
518 Returns : 1 for success and 0 for error
519 Args : Bio::Seq object
520 Note : This method does not currently delegate to Bio::SeqIO::fasta
521 (maybe it should?). Not sure whether we should keep this as a
522 convenience method.
523 Status : Unstable
525 =head2 write_qual
527 Title : write_qual
528 Usage : $stream->write_qual(@seq)
529 Function: writes the $seq object into the stream
530 Returns : 1 for success and 0 for error
531 Args : Bio::Seq::Quality object
532 Note : This method does not currently delegate to Bio::SeqIO::qual
533 (maybe it should?). Not sure whether we should keep this as a
534 convenience method.
535 Status : Unstable
537 =head2 validate
539 Title : validate
540 Usage : $obj->validate(0)
541 Function : flag for format/qual range validation - default is 1, validate
542 Returns : Bool (0/1)
543 Args : Bool (0/1)
544 Status : Stable (may be moved to interface)
546 =head2 quality_header
548 Title : quality_header
549 Usage : $obj->quality_header
550 Function : flag for printing quality header - default is 0, no header
551 Returns : Bool (0/1)
552 Args : Bool (0/1)
553 Status : Unstable (name may change dep. on feedback)
555 =cut