spiced_seq() may need LWP
[bioperl-live.git] / Bio / SeqIO / scf.pm
blob49059c7fd44dae45d6b140249d31985247350852
2 # Copyright (c) 1997-2001 bioperl, Chad Matsalla. All Rights Reserved.
3 # This module is free software; you can redistribute it and/or
4 # modify it under the same terms as Perl itself.
6 # Copyright Chad Matsalla
8 # You may distribute this module under the same terms as perl itself
10 # POD documentation - main docs before the code
12 =head1 NAME
14 Bio::SeqIO::scf - .scf file input/output stream
16 =head1 SYNOPSIS
18 Do not use this module directly. Use it via the Bio::SeqIO class, see
19 L<Bio::SeqIO> for more information.
21 =head1 DESCRIPTION
23 This object can transform .scf files to and from Bio::Seq::SequenceTrace
24 objects. Mechanisms are present to retrieve trace data from scf
25 files.
27 =head1 FEEDBACK
29 =head2 Mailing Lists
31 User feedback is an integral part of the evolution of this and other
32 Bioperl modules. Send your comments and suggestions preferably to one
33 of the Bioperl mailing lists. Your participation is much appreciated.
35 bioperl-l@bioperl.org - General discussion
36 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
38 =head2 Support
40 Please direct usage questions or support issues to the mailing list:
42 I<bioperl-l@bioperl.org>
44 rather than to the module maintainer directly. Many experienced and
45 reponsive experts will be able look at the problem and quickly
46 address it. Please include a thorough description of the problem
47 with code and data examples if at all possible.
49 =head2 Reporting Bugs
51 Report bugs to the Bioperl bug tracking system to help us keep track
52 the bugs and their resolution. Bug reports can be submitted via
53 the web:
55 https://github.com/bioperl/bioperl-live/issues
57 =head1 AUTHOR Chad Matsalla
59 Chad Matsalla
60 bioinformatics@dieselwurks.com
62 =head1 CONTRIBUTORS
64 Jason Stajich, jason@bioperl.org
65 Tony Cox, avc@sanger.ac.uk
66 Heikki Lehvaslaiho, heikki-at-bioperl-dot-org
67 Nancy Hansen, nhansen at mail.nih.gov
69 =head1 APPENDIX
71 The rest of the documentation details each of the object
72 methods. Internal methods are usually preceded with a _
74 =cut
76 # Let the code begin...
78 package Bio::SeqIO::scf;
79 use vars qw($DEFAULT_QUALITY);
80 use strict;
81 use Bio::Seq::SeqFactory;
82 use Bio::Seq::SequenceTrace;
83 use Bio::Annotation::Comment;
84 use Dumpvalue;
86 my $dumper = Dumpvalue->new();
87 $dumper->veryCompact(1);
89 BEGIN {
90 $DEFAULT_QUALITY= 10;
93 use base qw(Bio::SeqIO);
95 sub _initialize {
96 my($self,@args) = @_;
97 $self->SUPER::_initialize(@args);
98 if( ! defined $self->sequence_factory ) {
99 $self->sequence_factory(Bio::Seq::SeqFactory->new
100 (-verbose => $self->verbose(),
101 -type => 'Bio::Seq::Quality'));
103 binmode $self->_fh; # for the Win32/Mac crowds
106 =head2 next_seq()
108 Title : next_seq()
109 Usage : $scf = $stream->next_seq()
110 Function: returns the next scf sequence in the stream
111 Returns : a Bio::Seq::SequenceTrace object
112 Args : NONE
113 Notes : Fills the interface specification for SeqIO.
114 The SCF specification does not provide for having more then
115 one sequence in a given scf. So once the filehandle has been open
116 and passed to SeqIO do not expect to run this function more then
117 once on a given scf unless you embraced and extended the SCF
118 standard. SCF comments are accessible through the Bio::SeqI
119 interface method annotation().
121 =cut
124 sub next_seq {
125 my ($self) = @_;
126 my ($seq, $seqc, $fh, $buffer, $offset, $length, $read_bytes, @read,
127 %names);
128 # set up a filehandle to read in the scf
129 return if $self->{_readfile};
130 $fh = $self->_fh();
131 unless ($fh) { # simulate the <> function
132 if ( !fileno(ARGV) or eof(ARGV) ) {
133 return unless my $ARGV = shift;
134 open(ARGV,$ARGV) or
135 $self->throw("Could not open $ARGV for SCF stream reading $!");
137 $fh = \*ARGV;
139 return unless read $fh, $buffer, 128; # no exception; probably end of file
140 # now, the master data structure will be the creator
141 my $creator;
142 # he first thing to do is parse the header. This is common
143 # among all versions of scf.
144 # the rest of the the information is different between the
145 # the different versions of scf.
147 $creator->{header} = $self->_get_header($buffer);
148 if ($creator->{header}->{'version'} lt "3.00") {
149 $self->debug("scf.pm is working with a version 2 scf.\n");
150 # first gather the trace information
151 $length = $creator->{header}->{'samples'} *
152 $creator->{header}->{sample_size}*4;
153 $buffer = $self->read_from_buffer($fh, $buffer, $length,
154 $creator->{header}->{samples_offset});
155 # @read = unpack "n$length",$buffer;
156 # these traces need to be split
157 # returns a reference to a hash
158 $creator->{traces} = $self->_parse_v2_traces(
159 $buffer,$creator->{header}->{sample_size});
160 # now go and get the base information
161 $offset = $creator->{header}->{bases_offset};
162 $length = ($creator->{header}->{bases} * 12);
163 seek $fh,$offset,0;
164 $buffer = $self->read_from_buffer($fh,$buffer,$length,$creator->{header}->{bases_offset});
165 # now distill the information into its fractions.
166 # the old way : $self->_set_v2_bases($buffer);
167 # ref to an array, ref to a hash, string
168 ($creator->{peak_indices},
169 $creator->{qualities},
170 $creator->{sequence},
171 $creator->{accuracies}) = $self->_parse_v2_bases($buffer);
173 } else {
174 $self->debug("scf.pm is working with a version 3+ scf.\n");
175 my $transformed_read;
176 my $current_read_position = $creator->{header}->{sample_offset};
177 $length = $creator->{header}->{'samples'}*
178 $creator->{header}->{sample_size};
179 # $dumper->dumpValue($creator->{header});
180 foreach (qw(a c g t)) {
181 $buffer = $self->read_from_buffer($fh,$buffer,$length,$current_read_position);
182 my $byte = "n";
183 if ($creator->{header}->{sample_size} == 1) {
184 $byte = "c";
186 @read = unpack "${byte}${length}",$buffer;
187 # this little spurt of nonsense is because
188 # the trace values are given in the binary
189 # file as unsigned shorts but they really
190 # are signed deltas. 30000 is an arbitrary number
191 # (will there be any traces with a given
192 # point greater then 30000? I hope not.
193 # once the read is read, it must be changed
194 # from relative
195 foreach (@read) {
196 if ($_ > 30000) {
197 $_ -= 65536;
200 $transformed_read = $self->_delta(\@read,"backward");
201 # For 8-bit data we need to emulate a signed/unsigned
202 # cast that is implicit in the C implementations.....
203 if ($creator->{header}->{sample_size} == 1) {
204 foreach (@{$transformed_read}) {
205 $_ += 256 if ($_ < 0);
208 $current_read_position += $length;
209 $creator->{'traces'}->{$_} = join(' ',@{$transformed_read});
212 # now go and get the peak index information
213 $offset = $creator->{header}->{bases_offset};
214 $length = ($creator->{header}->{bases} * 4);
215 $buffer = $self->read_from_buffer($fh,$buffer,$length,$offset);
216 $creator->{peak_indices} = $self->_get_v3_peak_indices($buffer);
217 $offset += $length;
218 # now go and get the accuracy information
219 $buffer = $self->read_from_buffer($fh,$buffer,$length,$offset);
220 $creator->{accuracies} = $self->_get_v3_base_accuracies($buffer);
221 # OK, now go and get the base information.
222 $offset += $length;
223 $length = $creator->{header}->{bases};
224 $buffer = $self->read_from_buffer($fh,$buffer,$length,$offset);
225 $creator->{'sequence'} = unpack("a$length",$buffer);
226 # now, finally, extract the calls from the accuracy information.
227 $creator->{qualities} = $self->_get_v3_quality(
228 $creator->{'sequence'},$creator->{accuracies});
230 # now go and get the comment information
231 $offset = $creator->{header}->{comments_offset};
232 seek $fh,$offset,0;
233 $length = $creator->{header}->{comment_size};
234 $buffer = $self->read_from_buffer($fh,$buffer,$length);
235 $creator->{comments} = $self->_get_comments($buffer);
236 my @name_comments = grep {$_->tagname() eq 'NAME'}
237 $creator->{comments}->get_Annotations('comment');
238 my $name_comment;
239 if (@name_comments){
240 $name_comment = $name_comments[0]->as_text();
241 $name_comment =~ s/^Comment:\s+//;
244 my $swq = Bio::Seq::Quality->new(
245 -seq => $creator->{'sequence'},
246 -qual => $creator->{'qualities'},
247 -id => $name_comment
249 my $returner = Bio::Seq::SequenceTrace->new(
250 -swq => $swq,
251 -trace_a => $creator->{'traces'}->{'a'},
252 -trace_t => $creator->{'traces'}->{'t'},
253 -trace_g => $creator->{'traces'}->{'g'},
254 -trace_c => $creator->{'traces'}->{'c'},
255 -accuracy_a => $creator->{'accuracies'}->{'a'},
256 -accuracy_t => $creator->{'accuracies'}->{'t'},
257 -accuracy_g => $creator->{'accuracies'}->{'g'},
258 -accuracy_c => $creator->{'accuracies'}->{'c'},
259 -peak_indices => $creator->{'peak_indices'}
262 $returner->annotation($creator->{'comments'}); # add SCF comments
263 $self->{'_readfile'} = 1;
264 return $returner;
268 =head2 _get_v3_quality()
270 Title : _get_v3_quality()
271 Usage : $self->_get_v3_quality()
272 Function: Set the base qualities from version3 scf
273 Returns : Nothing. Alters $self.
274 Args : None.
275 Notes :
277 =cut
280 sub _get_v3_quality {
281 my ($self,$sequence,$accuracies) = @_;
282 my @bases = split//,$sequence;
283 my (@qualities,$currbase,$currqual,$counter);
284 for ($counter=0; $counter <= $#bases ; $counter++) {
285 $currbase = lc($bases[$counter]);
286 if ($currbase eq "a") { $currqual = $accuracies->{'a'}->[$counter]; }
287 elsif ($currbase eq "c") { $currqual = $accuracies->{'c'}->[$counter]; }
288 elsif ($currbase eq "g") { $currqual = $accuracies->{'g'}->[$counter]; }
289 elsif ($currbase eq "t") { $currqual = $accuracies->{'t'}->[$counter]; }
290 else { $currqual = "unknown"; }
291 push @qualities,$currqual;
293 return \@qualities;
296 =head2 _get_v3_peak_indices($buffer)
298 Title : _get_v3_peak_indices($buffer)
299 Usage : $self->_get_v3_peak_indices($buffer);
300 Function: Unpacks the base accuracies for version3 scf
301 Returns : Nothing. Alters $self
302 Args : A scalar containing binary data.
303 Notes :
305 =cut
307 sub _get_v3_peak_indices {
308 my ($self,$buffer) = @_;
309 my $length = length($buffer);
310 my @read = unpack "N$length",$buffer;
311 return join(' ',@read);
314 =head2 _get_v3_base_accuracies($buffer)
316 Title : _get_v3_base_accuracies($buffer)
317 Usage : $self->_get_v3_base_accuracies($buffer)
318 Function: Set the base accuracies for version 3 scf's
319 Returns : Nothing. Alters $self.
320 Args : A scalar containing binary data.
321 Notes :
323 =cut
326 sub _get_v3_base_accuracies {
327 my ($self,$buffer) = @_;
328 my $length = length($buffer);
329 my $qlength = $length/4;
330 my $offset = 0;
331 my (@qualities,@sorter,$counter,$round,$last_base,$accuracies,$currbase);
332 foreach $currbase (qw(a c g t)) {
333 my @read;
334 $last_base = $offset + $qlength;
335 for (;$offset < $last_base; $offset += $qlength) {
336 # a bioperler (perhaps me?) changed the unpack string to include 'n' rather than 'C'
337 # on 040322 I think that 'C' is correct. please email chad if you would like to accuse me of being incorrect
338 @read = unpack "C$qlength", substr($buffer,$offset,$qlength);
339 $accuracies->{$currbase} = \@read;
342 return $accuracies;
346 =head2 _get_comments($buffer)
348 Title : _get_comments($buffer)
349 Usage : $self->_get_comments($buffer);
350 Function: Gather the comments section from the scf and parse it into its
351 components.
352 Returns : a Bio::Annotation::Collection object
353 Args : The buffer. It is expected that the buffer contains a binary
354 string for the comments section of an scf file according to
355 the scf file specifications.
356 Notes :
358 =cut
360 sub _get_comments {
361 my ($self,$buffer) = @_;
362 my $comments = Bio::Annotation::Collection->new();
363 my $size = length($buffer);
364 my $comments_retrieved = unpack "a$size",$buffer;
365 $comments_retrieved =~ s/\0//;
366 my @comments_split = split/\n/,$comments_retrieved;
367 if (@comments_split) {
368 foreach (@comments_split) {
369 /(\w+)=(.*)/;
370 if ($1 && $2) {
371 my ($tagname, $text) = ($1, $2);
372 my $comment_obj = Bio::Annotation::Comment->new(
373 -text => $text,
374 -tagname => $tagname);
376 $comments->add_Annotation('comment', $comment_obj);
380 $self->{'comments'} = $comments;
381 return $comments;
384 =head2 _get_header()
386 Title : _get_header($buffer)
387 Usage : $self->_get_header($buffer);
388 Function: Gather the header section from the scf and parse it into its
389 components.
390 Returns : Reference to a hash containing the header components.
391 Args : The buffer. It is expected that the buffer contains a binary
392 string for the header section of an scf file according to the
393 scf file specifications.
394 Notes : None.
396 =cut
398 sub _get_header {
399 my ($self,$buffer) = @_;
400 my $header;
401 ($header->{'scf'},
402 $header->{'samples'},
403 $header->{'sample_offset'},
404 $header->{'bases'},
405 $header->{'bases_left_clip'},
406 $header->{'bases_right_clip'},
407 $header->{'bases_offset'},
408 $header->{'comment_size'},
409 $header->{'comments_offset'},
410 $header->{'version'},
411 $header->{'sample_size'},
412 $header->{'code_set'},
413 @{$header->{'header_spare'}} ) = unpack "a4 NNNNNNNN a4 NN N20", $buffer;
415 $self->{'header'} = $header;
416 return $header;
419 =head2 _parse_v2_bases($buffer)
421 Title : _parse_v2_bases($buffer)
422 Usage : $self->_parse_v2_bases($buffer);
423 Function: Gather the bases section from the scf and parse it into its
424 components.
425 Returns :
426 Args : The buffer. It is expected that the buffer contains a binary
427 string for the bases section of an scf file according to the
428 scf file specifications.
429 Notes : None.
431 =cut
433 sub _parse_v2_bases {
434 my ($self,$buffer) = @_;
435 my $length = length($buffer);
436 my ($offset2,$currbuff,$currbase,$currqual,$sequence,@qualities,@indices);
437 my (@read,$harvester,$accuracies);
438 for ($offset2=0;$offset2<$length;$offset2+=12) {
439 @read = unpack "N C C C C a C3", substr($buffer,$offset2,$length);
440 push @indices,$read[0];
441 $currbase = lc($read[5]);
442 if ($currbase eq "a") { $currqual = $read[1]; }
443 elsif ($currbase eq "c") { $currqual = $read[2]; }
444 elsif ($currbase eq "g") { $currqual = $read[3]; }
445 elsif ($currbase eq "t") { $currqual = $read[4]; }
446 else { $currqual = "UNKNOWN"; }
447 push @{$accuracies->{"a"}},$read[1];
448 push @{$accuracies->{"c"}},$read[2];
449 push @{$accuracies->{"g"}},$read[3];
450 push @{$accuracies->{"t"}},$read[4];
452 $sequence .= $currbase;
453 push @qualities,$currqual;
455 return (\@indices,\@qualities,$sequence,$accuracies)
458 =head2 _parse_v2_traces(\@traces_array)
460 Title : _pares_v2_traces(\@traces_array)
461 Usage : $self->_parse_v2_traces(\@traces_array);
462 Function: Parses an scf Version2 trace array into its base components.
463 Returns : Nothing. Modifies $self.
464 Args : A reference to an array of the unpacked traces section of an
465 scf version2 file.
467 =cut
469 sub _parse_v2_traces {
470 my ($self,$buffer,$sample_size) = @_;
471 my $byte;
472 if ($sample_size == 1) { $byte = "c"; }
473 else { $byte = "n"; }
474 my $length = CORE::length($buffer);
475 my @read = unpack "${byte}${length}",$buffer;
476 # this will be an array to the reference holding the array
477 my $traces;
478 my $array = 0;
479 for (my $offset2 = 0; $offset2< scalar(@read); $offset2+=4) {
480 push @{$traces->{'a'}},$read[$offset2];
481 push @{$traces->{'c'}},$read[$offset2+1];
482 push @{$traces->{'g'}},$read[$offset2+3];
483 push @{$traces->{'t'}},$read[$offset2+2];
485 return $traces;
489 sub get_trace_deprecated_use_the_sequencetrace_object_instead {
490 # my ($self,$base_channel,$traces) = @_;
491 # $base_channel =~ tr/a-z/A-Z/;
492 # if ($base_channel !~ /A|T|G|C/) {
493 # $self->throw("You tried to ask for a base channel that wasn't A,T,G, or C. Ask for one of those next time.");
494 ##} elsif ($base_channel) {
495 # my @temp = split(' ',$self->{'traces'}->{$base_channel});
496 #return \@temp;
500 sub _deprecated_get_peak_indices_deprecated_use_the_sequencetrace_object_instead {
501 my ($self) = shift;
502 my @temp = split(' ',$self->{'parsed'}->{'peak_indices'});
503 return \@temp;
507 =head2 get_header()
509 Title : get_header()
510 Usage : %header = %{$obj->get_header()};
511 Function: Return the header for this scf.
512 Returns : A reference to a hash containing the header for this scf.
513 Args : None.
514 Notes :
516 =cut
518 sub get_header {
519 my ($self) = shift;
520 return $self->{'header'};
523 =head2 get_comments()
525 Title : get_comments()
526 Usage : %comments = %{$obj->get_comments()};
527 Function: Return the comments for this scf.
528 Returns : A Bio::Annotation::Collection object
529 Args : None.
530 Notes :
532 =cut
534 sub get_comments {
535 my ($self) = shift;
536 return $self->{'comments'};
539 sub _dump_traces_outgoing_deprecated_use_the_sequencetrace_object {
540 my ($self,$transformed) = @_;
541 my (@sA,@sT,@sG,@sC);
542 if ($transformed) {
543 @sA = @{$self->{'text'}->{'t_samples_a'}};
544 @sC = @{$self->{'text'}->{'t_samples_c'}};
545 @sG = @{$self->{'text'}->{'t_samples_g'}};
546 @sT = @{$self->{'text'}->{'t_samples_t'}};
548 else {
549 @sA = @{$self->{'text'}->{'samples_a'}};
550 @sC = @{$self->{'text'}->{'samples_c'}};
551 @sG = @{$self->{'text'}->{'samples_g'}};
552 @sT = @{$self->{'text'}->{'samples_t'}};
554 print ("Count\ta\tc\tg\tt\n");
555 for (my $curr=0; $curr < scalar(@sG); $curr++) {
556 print("$curr\t$sA[$curr]\t$sC[$curr]\t$sG[$curr]\t$sT[$curr]\n");
558 return;
561 sub _dump_traces_incoming_deprecated_use_the_sequencetrace_object {
562 # my ($self) = @_;
563 # my (@sA,@sT,@sG,@sC);
564 # @sA = @{$self->{'traces'}->{'A'}};
565 # @sC = @{$self->{'traces'}->{'C'}};
566 # @sG = @{$self->{'traces'}->{'G'}};
567 # @sT = @{$self->{'traces'}->{'T'}};
568 # @sA = @{$self->get_trace('A')};
569 # @sC = @{$self->get_trace('C')};
570 # @sG = @{$self->get_trace('G')};
571 # @sT = @{$self->get_trace('t')};
572 # print ("Count\ta\tc\tg\tt\n");
573 # for (my $curr=0; $curr < scalar(@sG); $curr++) {
574 # print("$curr\t$sA[$curr]\t$sC[$curr]\t$sG[$curr]\t$sT[$curr]\n");
576 #return;
579 =head2 write_seq
581 Title : write_seq(-target => $swq, <comments>)
582 Usage : $obj->write_seq(
583 -target => $swq,
584 -version => 2,
585 -CONV => "Bioperl-Chads Mighty SCF writer.");
586 Function: Write out an scf.
587 Returns : Nothing.
588 Args : Requires: a reference to a Bio::Seq::Quality object to form the
589 basis for the scf.
590 if -version is provided, it should be "2" or "3". A SCF of that
591 version will be written.
592 Any other arguments are assumed to be comments and are put into
593 the comments section of the scf. Read the specifications for scf
594 to decide what might be good to put in here.
596 Notes :
597 For best results, use a SequenceTrace object.
598 The things that you need to write an scf:
599 a) sequence
600 b) quality
601 c) peak indices
602 d) traces
603 - You _can_ write an scf with just a and b by passing in a
604 Bio::Seq::Quality object- false traces will be synthesized
605 for you.
607 =cut
609 sub write_seq {
610 my ($self,%args) = @_;
611 my %comments;
612 my ($label,$arg);
613 my ($swq) = $self->_rearrange([qw(TARGET)], %args);
614 my $writer_fodder;
615 if (ref($swq) =~ /Bio::Seq::SequenceTrace|Bio::Seq::Quality/) {
616 if (ref($swq) eq "Bio::Seq::Quality") {
617 # this means that the object *has no trace data*
618 # we might as well synthesize some now, ok?
619 $swq = Bio::Seq::SequenceTrace->new(
620 -swq => $swq
624 else {
625 $self->throw("You must pass a Bio::Seq::Quality or a Bio::Seq::SequenceTrace object to write_seq as a parameter named \"target\"");
627 # all of the rest of the arguments are comments for the scf
628 foreach $arg (sort keys %args) {
629 next if ($arg =~ /target/i);
630 ($label = $arg) =~ s/^\-//;
631 $writer_fodder->{comments}->{$label} = $args{$arg};
633 if (!$comments{'NAME'}) { $comments{'NAME'} = $swq->id(); }
634 # HA! Bwahahahaha.
635 $writer_fodder->{comments}->{'CONV'} = "Bioperl-Chads Mighty SCF writer." unless defined $comments{'CONV'};
636 # now deal with the version of scf they want to write
637 if ($writer_fodder->{comments}->{version}) {
638 if ($writer_fodder->{comments}->{version} != 2 && $writer_fodder->{comments}->{version} != 3) {
639 $self->warn("This module can only write version 2.0 or 3.0 scf's. Writing a version 2.0 scf by default.");
640 $writer_fodder->{header}->{version} = "2.00";
642 elsif ($writer_fodder->{comments}->{'version'} > 2) {
643 $writer_fodder->{header}->{'version'} = "3.00";
645 else {
646 $writer_fodder->{header}->{version} = "2";
649 else {
650 $writer_fodder->{header}->{'version'} = "3.00";
652 # set a few things in the header
653 $writer_fodder->{'header'}->{'magic'} = ".scf";
654 $writer_fodder->{'header'}->{'sample_size'} = "2";
655 $writer_fodder->{'header'}->{'bases'} = length($swq->seq());
656 $writer_fodder->{'header'}->{'bases_left_clip'} = "0";
657 $writer_fodder->{'header'}->{'bases_right_clip'} = "0";
658 $writer_fodder->{'header'}->{'sample_size'} = "2";
659 $writer_fodder->{'header'}->{'code_set'} = "9";
660 @{$writer_fodder->{'header'}->{'spare'}} = qw(0 0 0 0 0 0 0 0 0 0
661 0 0 0 0 0 0 0 0 0 0);
662 $writer_fodder->{'header'}->{'samples_offset'} = "128";
663 $writer_fodder->{'header'}->{'samples'} = $swq->trace_length();
664 # create the binary for the comments and file it in writer_fodder
665 $writer_fodder->{comments} = $self->_get_binary_comments(
666 $writer_fodder->{comments});
667 # create the binary and the strings for the traces, bases,
668 # offsets (if necessary), and accuracies (if necessary)
669 $writer_fodder->{traces} = $self->_get_binary_traces(
670 $writer_fodder->{'header'}->{'version'},
671 $swq,$writer_fodder->{'header'}->{'sample_size'});
672 my ($b_base_offsets,$b_base_accuracies,$samples_size,$bases_size);
674 # version 2
676 if ($writer_fodder->{'header'}->{'version'} == 2) {
677 $writer_fodder->{bases} = $self->_get_binary_bases(
679 $swq,
680 $writer_fodder->{'header'}->{'sample_size'});
681 $samples_size = CORE::length($writer_fodder->{traces}->{'binary'});
682 $bases_size = CORE::length($writer_fodder->{bases}->{binary});
683 $writer_fodder->{'header'}->{'bases_offset'} = 128 + $samples_size;
684 $writer_fodder->{'header'}->{'comments_offset'} = 128 +
685 $samples_size + $bases_size;
686 $writer_fodder->{'header'}->{'comments_size'} =
687 length($writer_fodder->{'comments'}->{binary});
688 $writer_fodder->{'header'}->{'private_size'} = "0";
689 $writer_fodder->{'header'}->{'private_offset'} = 128 +
690 $samples_size + $bases_size +
691 $writer_fodder->{'header'}->{'comments_size'};
692 $writer_fodder->{'header'}->{'binary'} =
693 $self->_get_binary_header($writer_fodder->{header});
694 $dumper->dumpValue($writer_fodder) if $self->verbose > 0;
695 $self->_print ($writer_fodder->{'header'}->{'binary'})
696 or print("Could not write binary header...\n");
697 $self->_print ($writer_fodder->{'traces'}->{'binary'})
698 or print("Could not write binary traces...\n");
699 $self->_print ($writer_fodder->{'bases'}->{'binary'})
700 or print("Could not write binary base structures...\n");
701 $self->_print ($writer_fodder->{'comments'}->{'binary'})
702 or print("Could not write binary comments...\n");
704 else {
705 ($writer_fodder->{peak_indices},
706 $writer_fodder->{accuracies},
707 $writer_fodder->{bases},
708 $writer_fodder->{reserved} ) =
709 $self->_get_binary_bases(
711 $swq,
712 $writer_fodder->{'header'}->{'sample_size'}
714 $writer_fodder->{'header'}->{'bases_offset'} = 128 +
715 length($writer_fodder->{'traces'}->{'binary'});
716 $writer_fodder->{'header'}->{'comments_size'} =
717 length($writer_fodder->{'comments'}->{'binary'});
718 # this is:
719 # bases_offset + base_offsets + accuracies + called_bases +
720 # reserved
721 $writer_fodder->{'header'}->{'private_size'} = "0";
723 $writer_fodder->{'header'}->{'comments_offset'} =
724 128+length($writer_fodder->{'traces'}->{'binary'})+
725 length($writer_fodder->{'peak_indices'}->{'binary'})+
726 length($writer_fodder->{'accuracies'}->{'binary'})+
727 length($writer_fodder->{'bases'}->{'binary'})+
728 length($writer_fodder->{'reserved'}->{'binary'});
729 $writer_fodder->{'header'}->{'private_offset'} =
730 $writer_fodder->{'header'}->{'comments_offset'} +
731 $writer_fodder->{'header'}->{'comments_size'};
732 $writer_fodder->{'header'}->{'spare'}->[1] =
733 $writer_fodder->{'header'}->{'comments_offset'} +
734 length($writer_fodder->{'comments'}->{'binary'});
735 $writer_fodder->{header}->{binary} =
736 $self->_get_binary_header($writer_fodder->{header});
737 $self->_print ($writer_fodder->{'header'}->{'binary'})
738 or print("Couldn't write header\n");
739 $self->_print ($writer_fodder->{'traces'}->{'binary'})
740 or print("Couldn't write samples\n");
741 $self->_print ($writer_fodder->{'peak_indices'}->{'binary'})
742 or print("Couldn't write peak offsets\n");
743 $self->_print ($writer_fodder->{'accuracies'}->{'binary'})
744 or print("Couldn't write accuracies\n");
745 $self->_print ($writer_fodder->{'bases'}->{'binary'})
746 or print("Couldn't write called_bases\n");
747 $self->_print ($writer_fodder->{'reserved'}->{'binary'})
748 or print("Couldn't write reserved\n");
749 $self->_print ($writer_fodder->{'comments'}->{'binary'})
750 or print ("Couldn't write comments\n");
753 # kinda unnecessary, given the close() below, but maybe that'll go
754 # away someday.
755 $self->flush if $self->_flush_on_write && defined $self->_fh;
757 $self->close();
758 return 1;
765 =head2 _get_binary_header()
767 Title : _get_binary_header();
768 Usage : $self->_get_binary_header();
769 Function: Provide the binary string that will be used as the header for
770 a scfv2 document.
771 Returns : A binary string.
772 Args : None. Uses the entries in the $self->{'header'} hash. These
773 are set on construction of the object (hopefully correctly!).
774 Notes :
776 =cut
778 sub _get_binary_header {
779 my ($self,$header) = @_;
780 my $binary = pack "a4 NNNNNNNN a4 NN N20",
782 $header->{'magic'},
783 $header->{'samples'},
784 $header->{'samples_offset'},
785 $header->{'bases'},
786 $header->{'bases_left_clip'},
787 $header->{'bases_right_clip'},
788 $header->{'bases_offset'},
789 $header->{'comments_size'},
790 $header->{'comments_offset'},
791 $header->{'version'},
792 $header->{'sample_size'},
793 $header->{'code_set'},
794 @{$header->{'spare'}}
796 return $binary;
799 =head2 _get_binary_traces($version,$ref)
801 Title : _set_binary_tracesbases($version,$ref)
802 Usage : $self->_set_binary_tracesbases($version,$ref);
803 Function: Constructs the trace and base strings for all scfs
804 Returns : Nothing. Alters self.
805 Args : $version - "2" or "3"
806 $sequence - a scalar containing arbitrary sequence data
807 $ref - a reference to either a SequenceTraces or a
808 SequenceWithQuality object.
809 Notes : This is a really complicated thing.
811 =cut
813 sub _get_binary_traces {
814 my ($self,$version,$ref,$sample_size) = @_;
815 # ref _should_ be a Bio::Seq::SequenceTrace, but might be a
816 # Bio::Seq::Quality
817 my $returner;
818 my $sequence = $ref->seq();
819 my $sequence_length = length($sequence);
820 # first of all, do we need to synthesize the trace?
821 # if so, call synthesize_base
822 my ($traceobj,@traces,$current);
823 if ( ref($ref) eq "Bio::Seq::Quality" ) {
824 $traceobj = Bio::Seq::Quality->new(
825 -target => $ref
827 $traceobj->_synthesize_traces();
829 else {
830 $traceobj = $ref;
831 if ($version eq "2") {
832 my $trace_length = $traceobj->trace_length();
833 for ($current = 1; $current <= $trace_length; $current++) {
834 foreach (qw(a c g t)) {
835 push @traces,$traceobj->trace_value_at($_,$current);
839 elsif ($version == 3) {
840 foreach my $current_trace (qw(a c g t)) {
841 my @trace = @{$traceobj->trace($current_trace)};
842 foreach (@trace) {
843 if ($_ > 30000) {
844 $_ -= 65536;
847 my $transformed = $self->_delta(\@trace,"forward");
848 if($sample_size == 1){
849 foreach (@{$transformed}) {
850 $_ += 256 if ($_ < 0);
853 push @traces,@{$transformed};
857 $returner->{version} = $version;
858 $returner->{string} = \@traces;
859 my $length_of_traces = scalar(@traces);
860 my $byte;
861 if ($sample_size == 1) { $byte = "c"; } else { $byte = "n"; }
862 # an unsigned integer should be I, but this is too long
864 $returner->{binary} = pack "n${length_of_traces}",@traces;
865 $returner->{length} = CORE::length($returner->{binary});
866 return $returner;
870 sub _get_binary_bases {
871 my ($self,$version,$trace,$sample_size) = @_;
872 my $byte;
873 if ($sample_size == 1) { $byte = "c"; } else { $byte = "n"; }
874 my ($returner,@current_row,$current_base,$string,$binary);
875 my $length = $trace->length();
876 if ($version == 2) {
877 $returner->{'version'} = "2";
878 for (my $current_base =1; $current_base <= $length; $current_base++) {
879 my @current_row;
880 push @current_row,$trace->peak_index_at($current_base);
881 push @current_row,$trace->accuracy_at("a",$current_base);
882 push @current_row,$trace->accuracy_at("c",$current_base);
883 push @current_row,$trace->accuracy_at("g",$current_base);
884 push @current_row,$trace->accuracy_at("t",$current_base);
885 push @current_row,$trace->baseat($current_base);
886 push @current_row,0,0,0;
887 push @{$returner->{string}},@current_row;
888 $returner->{binary} .= pack "N C C C C a C3",@current_row;
890 return $returner;
892 else {
893 $returner->{'version'} = "3.00";
894 $returner->{peak_indices}->{string} = $trace->peak_indices();
895 my $length = scalar(@{$returner->{peak_indices}->{string}});
896 $returner->{peak_indices}->{binary} =
897 pack "N$length",@{$returner->{peak_indices}->{string}};
898 $returner->{peak_indices}->{length} =
899 CORE::length($returner->{peak_indices}->{binary});
900 my @accuracies;
901 foreach my $base (qw(a c g t)) {
902 $returner->{accuracies}->{$base} = $trace->accuracies($base);
903 push @accuracies,@{$trace->accuracies($base)};
905 $returner->{sequence} = $trace->seq();
906 $length = scalar(@accuracies);
907 # this really is "c" for samplesize == 2
908 $returner->{accuracies}->{binary} = pack "C${length}",@accuracies;
909 $returner->{accuracies}->{length} =
910 CORE::length($returner->{accuracies}->{binary});
911 $length = $trace->seq_obj()->length();
912 for (my $count=0; $count< $length; $count++) {
913 push @{$returner->{reserved}->{string}},0,0,0;
916 $length = scalar(@{$returner->{reserved}->{string}});
917 # this _must_ be "c"
918 $returner->{'reserved'}->{'binary'} =
919 pack "c$length",@{$returner->{reserved}->{string}};
920 $returner->{'reserved'}->{'length'} =
921 CORE::length($returner->{'reserved'}->{'binary'});
922 # $returner->{'bases'}->{'string'} = $trace->seq();
923 my @bases = split('',$trace->seq());
924 $length = $trace->length();
925 $returner->{'bases'}->{'binary'} = $trace->seq();
926 # print("Returning this:\n");
927 # $dumper->dumpValue($returner);
928 return ($returner->{peak_indices},
929 $returner->{accuracies},
930 $returner->{bases},
931 $returner->{reserved});
936 =head2 _make_trace_string($version)
938 Title : _make_trace_string($version)
939 Usage : $self->_make_trace_string($version)
940 Function: Merges trace data for the four bases to produce an scf
941 trace string. _requires_ $version
942 Returns : Nothing. Alters $self.
943 Args : $version - a version number. "2" or "3"
944 Notes :
946 =cut
948 sub _make_trace_string {
949 my ($self,$version) = @_;
950 my @traces;
951 my @traces_view;
952 my @as = @{$self->{'text'}->{'samples_a'}};
953 my @cs = @{$self->{'text'}->{'samples_c'}};
954 my @gs = @{$self->{'text'}->{'samples_g'}};
955 my @ts = @{$self->{'text'}->{'samples_t'}};
956 if ($version == 2) {
957 for (my $curr=0; $curr < scalar(@as); $curr++) {
958 $as[$curr] = $DEFAULT_QUALITY unless defined $as[$curr];
959 $cs[$curr] = $DEFAULT_QUALITY unless defined $cs[$curr];
960 $gs[$curr] = $DEFAULT_QUALITY unless defined $gs[$curr];
961 $ts[$curr] = $DEFAULT_QUALITY unless defined $ts[$curr];
962 push @traces,($as[$curr],$cs[$curr],$gs[$curr],$ts[$curr]);
965 elsif ($version == 3) {
966 @traces = (@as,@cs,@gs,@ts);
968 else {
969 $self->throw("No idea what version required to make traces here. You gave #$version# Bailing.");
971 my $length = scalar(@traces);
972 $self->{'text'}->{'samples_all'} = \@traces;
976 =head2 _get_binary_comments(\@comments)
978 Title : _get_binary_comments(\@comments)
979 Usage : $self->_get_binary_comments(\@comments);
980 Function: Provide a binary string that will be the comments section of
981 the scf file. See the scf specifications for detailed
982 specifications for the comments section of an scf file. Hint:
983 CODE=something\nBODE=something\n\0
984 Returns :
985 Args : A reference to an array containing comments.
986 Notes : None.
988 =cut
990 sub _get_binary_comments {
991 my ($self,$rcomments) = @_;
992 my $returner;
993 my $comments_string = '';
994 my %comments = %$rcomments;
995 foreach my $key (sort keys %comments) {
996 $comments{$key} ||= '';
997 $comments_string .= "$key=$comments{$key}\n";
999 $comments_string .= "\n\0";
1000 my $length = CORE::length($comments_string);
1001 $returner->{length} = $length;
1002 $returner->{string} = $comments_string;
1003 $returner->{binary} = pack "A$length",$comments_string;
1004 return $returner;
1007 #=head2 _fill_missing_data($swq)
1009 # Title : _fill_missing_data($swq)
1010 # Usage : $self->_fill_missing_data($swq);
1011 # Function: If the $swq with quality has no qualities, set all qualities
1012 # to 0.
1013 # If the $swq has no sequence, set the sequence to N's.
1014 # Returns : Nothing. Modifies the Bio::Seq::Quality that was passed as an
1015 # argument.
1016 # Args : A reference to a Bio::Seq::Quality
1017 # Notes : None.
1019 #=cut
1022 #sub _fill_missing_data {
1023 # my ($self,$swq) = @_;
1024 # my $qual_obj = $swq->qual_obj();
1025 # my $seq_obj = $swq->seq_obj();
1026 # if ($qual_obj->length() == 0 && $seq_obj->length() != 0) {
1027 # my $fake_qualities = ("$DEFAULT_QUALITY ")x$seq_obj->length();
1028 # $swq->qual($fake_qualities);
1030 # if ($seq_obj->length() == 0 && $qual_obj->length != 0) {
1031 # my $sequence = ("N")x$qual_obj->length();
1032 # $swq->seq($sequence);
1036 =head2 _delta(\@trace_data,$direction)
1038 Title : _delta(\@trace_data,$direction)
1039 Usage : $self->_delta(\@trace_data,$direction);
1040 Function:
1041 Returns : A reference to an array containing modified trace values.
1042 Args : A reference to an array containing trace data and a string
1043 indicating the direction of conversion. ("forward" or
1044 "backward").
1045 Notes : This code is taken from the specification for SCF3.2.
1046 http://www.mrc-lmb.cam.ac.uk/pubseq/manual/formats_unix_4.html
1048 =cut
1051 sub _delta {
1052 my ($self,$rsamples,$direction) = @_;
1053 my @samples = @$rsamples;
1054 # /* If job == DELTA_IT:
1055 # * change a series of sample points to a series of delta delta values:
1056 # * ie change them in two steps:
1057 # * first: delta = current_value - previous_value
1058 # * then: delta_delta = delta - previous_delta
1059 # * else
1060 # * do the reverse
1061 # */
1062 # int i;
1063 # uint_2 p_delta, p_sample;
1065 my ($i,$num_samples,$p_delta,$p_sample,@samples_converted,$p_sample1,$p_sample2);
1066 my $SLOW_BUT_CLEAR = 0;
1067 $num_samples = scalar(@samples);
1068 # c-programmers are funny people with their single-letter variables
1070 if ( $direction eq "forward" ) {
1071 if($SLOW_BUT_CLEAR){
1072 $p_delta = 0;
1073 for ($i=0; $i < $num_samples; $i++) {
1074 $p_sample = $samples[$i];
1075 $samples[$i] = $samples[$i] - $p_delta;
1076 $p_delta = $p_sample;
1078 $p_delta = 0;
1079 for ($i=0; $i < $num_samples; $i++) {
1080 $p_sample = $samples[$i];
1081 $samples[$i] = $samples[$i] - $p_delta;
1082 $p_delta = $p_sample;
1084 } else {
1085 for ($i = $num_samples-1; $i > 1; $i--){
1086 $samples[$i] = $samples[$i] - 2*$samples[$i-1] + $samples[$i-2];
1088 $samples[1] = $samples[1] - 2*$samples[0];
1091 elsif ($direction eq "backward") {
1092 if($SLOW_BUT_CLEAR){
1093 $p_sample = 0;
1094 for ($i=0; $i < $num_samples; $i++) {
1095 $samples[$i] = $samples[$i] + $p_sample;
1096 $p_sample = $samples[$i];
1098 $p_sample = 0;
1099 for ($i=0; $i < $num_samples; $i++) {
1100 $samples[$i] = $samples[$i] + $p_sample;
1101 $p_sample = $samples[$i];
1103 } else {
1104 $p_sample1 = $p_sample2 = 0;
1105 for ($i = 0; $i < $num_samples; $i++){
1106 $p_sample1 = $p_sample1 + $samples[$i];
1107 $samples[$i] = $p_sample1 + $p_sample2;
1108 $p_sample2 = $samples[$i];
1113 else {
1114 $self->warn("Bad direction. Use \"forward\" or \"backward\".");
1116 return \@samples;
1119 =head2 _unpack_magik($buffer)
1121 Title : _unpack_magik($buffer)
1122 Usage : $self->_unpack_magik($buffer)
1123 Function: What unpack specification should be used? Try them all.
1124 Returns : Nothing.
1125 Args : A buffer containing arbitrary binary data.
1126 Notes : Eliminate the ambiguity and the guesswork. Used in the
1127 adaptation of _delta(), mostly.
1129 =cut
1131 sub _unpack_magik {
1132 my ($self,$buffer) = @_;
1133 my $length = length($buffer);
1134 my (@read,$counter);
1135 foreach (qw(c C s S i I l L n N v V)) {
1136 @read = unpack "$_$length", $buffer;
1137 for ($counter=0; $counter < 20; $counter++) {
1138 print("$read[$counter]\n");
1143 =head2 read_from_buffer($filehandle,$buffer,$length)
1145 Title : read_from_buffer($filehandle,$buffer,$length)
1146 Usage : $self->read_from_buffer($filehandle,$buffer,$length);
1147 Function: Read from the buffer.
1148 Returns : $buffer, containing a read of $length
1149 Args : a filehandle, a buffer, and a read length
1150 Notes : I just got tired of typing
1151 "unless (length($buffer) == $length)" so I put it here.
1153 =cut
1155 sub read_from_buffer {
1156 my ($self,$fh,$buffer,$length,$start_position) = @_;
1157 # print("Reading from a buffer!!! length($length) ");
1158 if ($start_position) {
1159 # print(" startposition($start_position)(".sprintf("%X", $start_position).")\n");
1161 # print("\n");
1162 if ($start_position) {
1163 # print("seeking to this position in the file: (".$start_position.")\n");
1164 seek ($fh,$start_position,0);
1165 # print("done. here is where I am now: (".tell($fh).")\n");
1167 else {
1168 # print("You did not specify a start position. Going from this position (the current position) (".tell($fh).")\n");
1170 read $fh, $buffer, $length;
1171 unless (length($buffer) == $length) {
1172 $self->warn("The read was incomplete! Trying harder.");
1173 my $missing_length = $length - length($buffer);
1174 my $buffer2;
1175 read $fh,$buffer2,$missing_length;
1176 $buffer .= $buffer2;
1177 if (length($buffer) != $length) {
1178 $self->throw("Unexpected end of file while reading from SCF file. I should have read $length but instead got ".length($buffer)."! Current file position is ".tell($fh).".");
1182 return $buffer;
1185 =head2 _dump_keys()
1187 Title : _dump_keys()
1188 Usage : &_dump_keys($a_reference_to_some_hash)
1189 Function: Dump out the keys in a hash.
1190 Returns : Nothing.
1191 Args : A reference to a hash.
1192 Notes : A debugging method.
1194 =cut
1196 sub _dump_keys {
1197 my $rhash = shift;
1198 if ($rhash !~ /HASH/) {
1199 print("_dump_keys: that was not a hash.\nIt was #$rhash# which was this reference:".ref($rhash)."\n");
1200 return;
1202 print("_dump_keys: The keys for $rhash are:\n");
1203 foreach (sort keys %$rhash) {
1204 print("$_\n");
1208 =head2 _dump_base_accuracies()
1210 Title : _dump_base_accuracies()
1211 Usage : $self->_dump_base_accuracies();
1212 Function: Dump out the v3 base accuracies in an easy to read format.
1213 Returns : Nothing.
1214 Args : None.
1215 Notes : A debugging method.
1217 =cut
1219 sub _dump_base_accuracies {
1220 my $self = shift;
1221 print("Dumping base accuracies! for v3\n");
1222 print("There are this many elements in a,c,g,t:\n");
1223 print(scalar(@{$self->{'text'}->{'v3_base_accuracy_a'}}).",".scalar(@{$self->{'text'}->{'v3_base_accuracy_c'}}).",".scalar(@{$self->{'text'}->{'v3_base_accuracy_g'}}).",".scalar(@{$self->{'text'}->{'v3_base_accuracy_t'}})."\n");
1224 my $number_traces = scalar(@{$self->{'text'}->{'v3_base_accuracy_a'}});
1225 for (my $counter=0; $counter < $number_traces; $counter++ ) {
1226 print("$counter\t");
1227 print $self->{'text'}->{'v3_base_accuracy_a'}->[$counter]."\t";
1228 print $self->{'text'}->{'v3_base_accuracy_c'}->[$counter]."\t";
1229 print $self->{'text'}->{'v3_base_accuracy_g'}->[$counter]."\t";
1230 print $self->{'text'}->{'v3_base_accuracy_t'}->[$counter]."\t";
1231 print("\n");
1235 =head2 _dump_peak_indices_incoming()
1237 Title : _dump_peak_indices_incoming()
1238 Usage : $self->_dump_peak_indices_incoming();
1239 Function: Dump out the v3 peak indices in an easy to read format.
1240 Returns : Nothing.
1241 Args : None.
1242 Notes : A debugging method.
1244 =cut
1246 sub _dump_peak_indices_incoming {
1247 my $self = shift;
1248 print("Dump peak indices incoming!\n");
1249 my $length = $self->{'bases'};
1250 print("The length is $length\n");
1251 for (my $count=0; $count < $length; $count++) {
1252 print("$count\t$self->{parsed}->{peak_indices}->[$count]\n");
1256 =head2 _dump_base_accuracies_incoming()
1258 Title : _dump_base_accuracies_incoming()
1259 Usage : $self->_dump_base_accuracies_incoming();
1260 Function: Dump out the v3 base accuracies in an easy to read format.
1261 Returns : Nothing.
1262 Args : None.
1263 Notes : A debugging method.
1265 =cut
1267 sub _dump_base_accuracies_incoming {
1268 my $self = shift;
1269 print("Dumping base accuracies! for v3\n");
1270 # print("There are this many elements in a,c,g,t:\n");
1271 # print(scalar(@{$self->{'parsed'}->{'v3_base_accuracy_a'}}).",".scalar(@{$self->{'text'}->{'v3_base_accuracy_c'}}).",".scalar(@{$self->{'text'}->{'v3_base_accuracy_g'}}).",".scalar(@{$self->{'text'}->{'v3_base_accuracy_t'}})."\n");
1272 my $number_traces = $self->{'bases'};
1273 for (my $counter=0; $counter < $number_traces; $counter++ ) {
1274 print("$counter\t");
1275 foreach (qw(A T G C)) {
1276 print $self->{'parsed'}->{'base_accuracies'}->{$_}->[$counter]."\t";
1278 print("\n");
1283 =head2 _dump_comments()
1285 Title : _dump_comments()
1286 Usage : $self->_dump_comments();
1287 Function: Debug dump the comments section from the scf.
1288 Returns : Nothing.
1289 Args : Nothing.
1290 Notes : None.
1292 =cut
1294 sub _dump_comments {
1295 my ($self) = @_;
1296 warn ("SCF comments:\n");
1297 foreach my $k (keys %{$self->{'comments'}}) {
1298 warn ("\t {$k} ==> ", $self->{'comments'}->{$k}, "\n");
1305 __END__