sync with trunk to r15684
[bioperl-live.git] / Bio / SeqIO / fastq.pm
blobf4e09434dd171d8936ab1ebb715039c2fef545a8
1 # BioPerl module for Bio::SeqIO::fastq
3 # Please direct questions and support issues to <bioperl-l@bioperl.org>
5 # Cared for by Tony Cox <avc@sanger.ac.uk>
7 # Copyright Tony Cox
9 # You may distribute this module under the same terms as perl itself
10 # _history
11 # October 29, 2001 incept data
13 # POD documentation - main docs before the code
15 =head1 NAME
17 Bio::SeqIO::fastq - fastq sequence input/output stream
19 =head1 SYNOPSIS
21 Do not use this module directly. Use it via the Bio::SeqIO class.
23 =head1 DESCRIPTION
25 This object can transform Bio::Seq and Bio::Seq::Quality
26 objects to and from fastq flat file databases.
28 Fastq is a file format used frequently at the Sanger Centre to bundle
29 a fasta sequence and its quality data. A typical fastaq entry takes
30 the from:
32 @HCDPQ1D0501
33 GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT.....
34 +HCDPQ1D0501
35 !''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65.....
37 Fastq files have sequence and quality data on a single line and the
38 quality values are single-byte encoded. To retrieve the decimal values
39 for qualities you need to subtract 33 (or Octal 41) from each byte and
40 then convert to a '2 digit + 1 space' integer.
42 =head1 FEEDBACK
44 =head2 Mailing Lists
46 User feedback is an integral part of the evolution of this and other
47 Bioperl modules. Send your comments and suggestions preferably to one
48 of the Bioperl mailing lists. Your participation is much appreciated.
50 bioperl-l@bioperl.org - General discussion
51 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
53 =head2 Support
55 Please direct usage questions or support issues to the mailing list:
57 L<bioperl-l@bioperl.org>
59 rather than to the module maintainer directly. Many experienced and
60 reponsive experts will be able look at the problem and quickly
61 address it. Please include a thorough description of the problem
62 with code and data examples if at all possible.
64 =head2 Reporting Bugs
66 Report bugs to the Bioperl bug tracking system to help us keep track
67 the bugs and their resolution. Bug reports can be submitted via the
68 web:
70 http://bugzilla.open-bio.org/
72 =head1 AUTHORS - Tony Cox
74 Email: avc@sanger.ac.uk
77 =head1 APPENDIX
79 The rest of the documentation details each of the object
80 methods. Internal methods are usually preceded with a _
82 =cut
84 # Let the code begin...
86 package Bio::SeqIO::fastq;
87 use strict;
89 use Bio::Seq::SeqFactory;
91 use base qw(Bio::SeqIO);
93 sub _initialize {
94 my($self,@args) = @_;
95 $self->SUPER::_initialize(@args);
96 if( ! defined $self->sequence_factory ) {
97 $self->sequence_factory(Bio::Seq::SeqFactory->new(-verbose => $self->verbose(), -type => 'Bio::Seq::Quality'));
102 =head2 next_seq
104 Title : next_seq
105 Usage : $seq = $stream->next_seq()
106 Function: returns the next sequence in the stream
107 Returns : Bio::Seq::Quality object
108 Args : NONE
110 =cut
112 sub next_seq {
113 my( $self ) = @_;
114 my $seq;
115 my $alphabet;
116 local $/ = "\n";
117 my $seqdata;
118 my @datatype = qw(seqdesc seq qualdesc qual);
119 while (@datatype) {
120 return unless my $line = $self->_readline; # bail if any data is incomplete
121 chomp $line;
122 my $type = shift @datatype;
123 if ($type eq 'seqdesc' || $type eq 'qualdesc') {
124 $self->throw("$line doesn't match fastq descriptor line type") unless
125 $line =~ m{^[\+@](.*)$};
126 $line = $1;
128 $seqdata->{$type} = $line;
130 $self->warn("Seq/Qual descriptions don't match; using sequence description\n")
131 unless $seqdata->{qualdesc} eq '' || $seqdata->{seqdesc} eq $seqdata->{qualdesc};
132 my ($id,$fulldesc) = $seqdata->{seqdesc} =~ /^\s*(\S+)\s*(.*)/
133 or $self->throw("Can't parse fastq header");
134 if ($id eq '') {$id=$fulldesc;} # FIX incase no space between \@ and name
135 $seqdata->{seq} =~ s/\s//g; # Remove whitespace
136 $seqdata->{qual} =~ s/\s//g;
138 if(length($seqdata->{seq}) != length($seqdata->{qual})){
139 $self->warn("Fastq sequence/quality data length mismatch error\n",
140 "Sequence: ",$seqdata->{seqdesc},", seq length: ",length($seqdata->{seq}), " Qual length: ", length($seqdata->{qual}), " \n",
141 $seqdata->{seq},"\n",$seqdata->{qual},"\n");
144 my $seq_data_qual = $seqdata->{qual};
145 my @qual = unpack("A1" x length($seq_data_qual), $seq_data_qual);
146 # my @qual = split('', $seqdata->{qual}); # Speed up above
148 my $qual;
149 foreach (@qual) {$qual .= (unpack("C",$_) - 33) ." "};
151 # for empty sequences we need to know the mol.type
152 $alphabet = $self->alphabet();
153 if(length($seqdata->{seq}) == 0) {
154 if(! defined($alphabet)) {
155 # let's default to dna
156 $alphabet = "dna";
158 } else {
159 # we don't need it really, so disable
160 $alphabet = undef;
163 # create the Quality object
164 $seq = $self->sequence_factory->create(
165 -qual => $qual,
166 -seq => $seqdata->{seq},
167 -id => $id,
168 -primary_id => $id,
169 -desc => $fulldesc,
170 -alphabet => $alphabet
173 # if there wasn't one before, set the guessed type
174 $self->alphabet($seq->alphabet());
176 return $seq;
179 =head2 write_seq
181 Title : write_seq
182 Usage : $stream->write_seq(@seq)
183 Function: writes the $seq object into the stream
184 Returns : 1 for success and 0 for error
185 Args : Bio::Seq::Quality or Bio::Seq object
188 =cut
190 sub write_seq {
191 my ($self,@seq) = @_;
192 foreach my $seq (@seq) {
193 my $str = $seq->seq;
194 my $top = $seq->display_id();
195 if ($seq->can('desc') and my $desc = $seq->desc()) {
196 $desc =~ s/\n//g;
197 $top .= " $desc";
199 if(length($str) > 0) {
200 $str =~ s/(.{1,60})/$1\n/g;
201 } else {
202 $str = "\n";
205 $self->_print (">",$top,"\n",$str) or return;
208 $self->flush if $self->_flush_on_write && defined $self->_fh;
209 return 1;
212 =head2 write_qual
214 Title : write_qual
215 Usage : $stream->write_qual(@seq)
216 Function: writes the $seq object into the stream
217 Returns : 1 for success and 0 for error
218 Args : Bio::Seq::Quality object
221 =cut
223 sub write_qual {
224 my ($self,@seq) = @_;
225 foreach my $seq (@seq) {
226 unless ($seq->isa("Bio::Seq::Quality")){
227 $self->warn("You can write FASTQ without supplying a Bio::Seq::Quality object! ", ref($seq), "\n");
228 next;
230 my @qual = @{$seq->qual};
231 my $top = $seq->display_id();
232 if ($seq->can('desc') and my $desc = $seq->desc()) {
233 $desc =~ s/\n//g;
234 $top .= " $desc";
236 my $qual = "" ;
237 if(scalar(@qual) > 0) {
238 my $max = 60;
239 for (my $q = 0;$q<scalar(@qual);$q++){
240 $qual .= $qual[$q] . " ";
241 if(length($qual) > $max){
242 $qual .= "\n";
243 $max += 60;
246 } else {
247 $qual = "\n";
250 $self->_print (">",$top,"\n",$qual,"\n") or return;
252 return 1;
255 =head2 write_fastq
257 Title : write_fastq
258 Usage : $stream->write_fastq(@seq)
259 Function: writes the $seq object into the stream
260 Returns : 1 for success and 0 for error
261 Args : Bio::Seq::Quality object
264 =cut
266 sub write_fastq {
267 my ($self,@seq) = @_;
268 foreach my $seq (@seq) {
269 unless ($seq->isa("Bio::Seq::Quality")){
270 $self->warn("You can't write FASTQ without supplying a Bio::Seq::Quality object! ", ref($seq), "\n");
271 next;
273 my $str = $seq->seq;
274 my @qual = @{$seq->qual};
275 my $top = $seq->display_id();
276 if ($seq->can('desc') and my $desc = $seq->desc()) {
277 $desc =~ s/\n//g;
278 $top .= " $desc";
280 if(length($str) == 0) {
281 $str = "\n";
283 my $qual = "" ;
284 if(scalar(@qual) > 0) {
285 for (my $q = 0;$q<scalar(@qual);$q++){
286 $qual .= chr($qual[$q] + 33);
288 } else {
289 $qual = "\n";
292 $self->_print ("\@",$top,"\n",$str,"\n") or return;
293 $self->_print ("+",$top,"\n",$qual,"\n") or return;
295 return 1;