3 # BioPerl module for Bio::SeqIO::gcg
5 # Cared for by Ewan Birney <birney@ebi.ac.uk>
6 # and Lincoln Stein <lstein@cshl.org>
8 # Copyright Ewan Birney & Lincoln Stein
10 # You may distribute this module under the same terms as perl itself
13 # October 18, 1999 Largely rewritten by Lincoln Stein
15 # POD documentation - main docs before the code
19 Bio::SeqIO::gcg - GCG sequence input/output stream
23 Do not use this module directly. Use it via the Bio::SeqIO class.
27 This object can transform Bio::Seq objects to and from GCG flat
34 User feedback is an integral part of the evolution of this and other
35 Bioperl modules. Send your comments and suggestions preferably to one
36 of the Bioperl mailing lists. Your participation is much appreciated.
38 bioperl-l@bioperl.org - General discussion
39 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
43 Report bugs to the Bioperl bug tracking system to help us keep track
44 the bugs and their resolution. Bug reports can be submitted via the web:
46 http://bugzilla.open-bio.org/
48 =head1 AUTHORS - Ewan Birney & Lincoln Stein
50 Email: E<lt>birney@ebi.ac.ukE<gt>
51 E<lt>lstein@cshl.orgE<gt>
55 Jason Stajich, jason@bioperl.org
59 The rest of the documentation details each of the object
60 methods. Internal methods are usually preceded with a _
64 # Let the code begin...
66 package Bio
::SeqIO
::gcg
;
69 use Bio
::Seq
::SeqFactory
;
71 use base
qw(Bio::SeqIO);
75 $self->SUPER::_initialize
(@args);
76 if( ! defined $self->sequence_factory ) {
77 $self->sequence_factory(Bio
::Seq
::SeqFactory
->new
78 (-verbose
=> $self->verbose(),
79 -type
=> 'Bio::Seq::RichSeq'));
86 Usage : $seq = $stream->next_seq()
87 Function: returns the next sequence in the stream
88 Returns : Bio::Seq object
94 my ($self,@args) = @_;
95 my($id,$type,$desc,$line,$chksum,$sequence,$date,$len);
97 while( defined($_ = $self->_readline()) ) {
99 ## Get the descriptive info (anything before the line with '..')
100 unless( /\.\.$/ ) { $desc.= $_; }
101 ## Pull ID, Checksum & Type from the line containing '..'
102 /\.\.$/ && do { $line = $_; chomp;
103 if(/Check\:\s(\d+)\s/) { $chksum = $1; }
104 if(/Type:\s(\w)\s/) { $type = $1; }
107 if(/Length:\s+(\d+)\s+(\S.+\S)\s+Type/ )
108 { $len = $1; $date = $2;}
112 return if ( !defined $_);
113 chomp($desc); # remove last "\n"
115 while( defined($_ = $self->_readline()) ) {
117 ## This is where we grab the sequence info.
120 $self->throw("Looks like start of another sequence. See documentation. ");
123 next if($_ eq "\n"); ## skip whitespace lines in formatted seq
124 s/[\d\s\t]//g; ## remove anything that is not alphabet char: preserve anything that is not explicitly specified for removal (Stefan Kirov)
125 # $_ = uc($_); ## uppercase sequence: NO. Keep the case. HL
128 ##If we parsed out a checksum, we might as well test it
130 if(defined $chksum) {
131 unless(_validate_checksum
(uc($sequence),$chksum)) {
132 $self->throw("Checksum failure on parsed sequence.");
136 ## Remove whitespace from identifier because the constructor
137 ## will throw a warning otherwise...
138 if(defined $id) { $id =~ s/\s+//g;}
140 ## Turn our parsed "Type: N" or "Type: P" (if found) into the appropriate
141 ## keyword that the constructor expects...
143 if($type eq "N") { $type = "dna"; }
144 if($type eq "P") { $type = "prot"; }
147 return $self->sequence_factory->create(-seq
=> $sequence,
158 Usage : $stream->write_seq(@seq)
159 Function: writes the formatted $seq object into the stream
160 Returns : 1 for success and 0 for error
161 Args : array of Bio::PrimarySeqI object
167 my ($self,@seq) = @_;
169 $self->throw("Did not provide a valid Bio::PrimarySeqI object")
170 unless defined $seq && ref($seq) && $seq->isa('Bio::PrimarySeqI');
172 $self->warn("No whitespace allowed in GCG ID [". $seq->display_id. "]")
173 if $seq->display_id =~ /\s/;
176 my $comment = $seq->desc || '';
178 my $type = ( $seq->alphabet() =~ /[dr]na/i ) ?
'N' : 'P';
181 if( $seq->can('get_dates') ) {
182 ($timestamp) = $seq->get_dates;
184 $timestamp = localtime(time);
186 my($sum,$offset,$len,$i,$j,$cnt,@out);
189 ## Set the offset if we have any non-standard numbering going on
192 $sum = $self->GCG_checksum($seq);
194 #Output the sequence header info
195 push(@out,"$comment\n");
196 push(@out,"$id Length: $len $timestamp Type: $type Check: $sum ..\n\n");
200 for($j = 0 ; $j < $len ; ) {
202 $out[$i] = sprintf("%8d ",($j+$offset)); #numbering
204 $out[$i] .= sprintf("%s",substr($str,$j,10));
206 if( $j < $len && $j % 50 != 0 ) {
208 }elsif($j % 50 == 0 ) {
209 $out[$i++] .= "\n\n";
217 return unless $self->_print(@out);
220 $self->flush if $self->_flush_on_write && defined $self->_fh;
227 Usage : $cksum = $gcgio->GCG_checksum($seq);
228 Function : returns a gcg checksum for the sequence specified
230 This method can also be called as a class method.
232 Returns : a GCG checksum string
233 Argument : a Bio::PrimarySeqI implementing object
238 my ($self,$seqobj) = @_;
243 my $seq = $seqobj->seq();
246 foreach $char ( split(/[\.\-]*/, $seq)) {
248 $checksum += ($index * (unpack("c",$char) || 0) );
254 return ($checksum % 10000);
257 =head2 _validate_checksum
259 Title : _validate_checksum
260 Usage : n/a - internal method
261 Function: if parsed gcg sequence contains a checksum field
262 : we compare it to a value computed here on the parsed
263 : sequence. A checksum mismatch would indicate some
264 : type of parsing failure occured.
266 Returns : 1 for success, 0 for failure
267 Args : string containing parsed seq, value of parsed cheksum
272 sub _validate_checksum
{
273 my($seq,$parsed_sum) = @_;
274 my($i,$len,$computed_sum,$cnt);
278 #Generate the GCG Checksum value
280 for($i=0; $i<$len ;$i++) {
282 $computed_sum += $cnt * ord(substr($seq,$i,1));
283 ($cnt == 57) && ($cnt=0);
285 $computed_sum %= 10000;
287 ## Compare and decide if success or failure
289 if($parsed_sum == $computed_sum) {