[bug 2714]
[bioperl-live.git] / Bio / SeqIO / gcg.pm
blob53e166fd731312237d73cd0af82949a74db24495
1 # $Id$
3 # BioPerl module for Bio::SeqIO::gcg
5 # Cared for by Ewan Birney <birney@ebi.ac.uk>
6 # and Lincoln Stein <lstein@cshl.org>
8 # Copyright Ewan Birney & Lincoln Stein
10 # You may distribute this module under the same terms as perl itself
12 # _history
13 # October 18, 1999 Largely rewritten by Lincoln Stein
15 # POD documentation - main docs before the code
17 =head1 NAME
19 Bio::SeqIO::gcg - GCG sequence input/output stream
21 =head1 SYNOPSIS
23 Do not use this module directly. Use it via the Bio::SeqIO class.
25 =head1 DESCRIPTION
27 This object can transform Bio::Seq objects to and from GCG flat
28 file databases.
30 =head1 FEEDBACK
32 =head2 Mailing Lists
34 User feedback is an integral part of the evolution of this and other
35 Bioperl modules. Send your comments and suggestions preferably to one
36 of the Bioperl mailing lists. Your participation is much appreciated.
38 bioperl-l@bioperl.org - General discussion
39 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
41 =head2 Reporting Bugs
43 Report bugs to the Bioperl bug tracking system to help us keep track
44 the bugs and their resolution. Bug reports can be submitted via the web:
46 http://bugzilla.open-bio.org/
48 =head1 AUTHORS - Ewan Birney & Lincoln Stein
50 Email: E<lt>birney@ebi.ac.ukE<gt>
51 E<lt>lstein@cshl.orgE<gt>
53 =head1 CONTRIBUTORS
55 Jason Stajich, jason@bioperl.org
57 =head1 APPENDIX
59 The rest of the documentation details each of the object
60 methods. Internal methods are usually preceded with a _
62 =cut
64 # Let the code begin...
66 package Bio::SeqIO::gcg;
67 use strict;
69 use Bio::Seq::SeqFactory;
71 use base qw(Bio::SeqIO);
73 sub _initialize {
74 my($self,@args) = @_;
75 $self->SUPER::_initialize(@args);
76 if( ! defined $self->sequence_factory ) {
77 $self->sequence_factory(Bio::Seq::SeqFactory->new
78 (-verbose => $self->verbose(),
79 -type => 'Bio::Seq::RichSeq'));
83 =head2 next_seq
85 Title : next_seq
86 Usage : $seq = $stream->next_seq()
87 Function: returns the next sequence in the stream
88 Returns : Bio::Seq object
89 Args :
91 =cut
93 sub next_seq {
94 my ($self,@args) = @_;
95 my($id,$type,$desc,$line,$chksum,$sequence,$date,$len);
97 while( defined($_ = $self->_readline()) ) {
99 ## Get the descriptive info (anything before the line with '..')
100 unless( /\.\.$/ ) { $desc.= $_; }
101 ## Pull ID, Checksum & Type from the line containing '..'
102 /\.\.$/ && do { $line = $_; chomp;
103 if(/Check\:\s(\d+)\s/) { $chksum = $1; }
104 if(/Type:\s(\w)\s/) { $type = $1; }
105 if(/(\S+)\s+Length/)
106 { $id = $1; }
107 if(/Length:\s+(\d+)\s+(\S.+\S)\s+Type/ )
108 { $len = $1; $date = $2;}
109 last;
112 return if ( !defined $_);
113 chomp($desc); # remove last "\n"
115 while( defined($_ = $self->_readline()) ) {
117 ## This is where we grab the sequence info.
119 if( /\.\.$/ ) {
120 $self->throw("Looks like start of another sequence. See documentation. ");
123 next if($_ eq "\n"); ## skip whitespace lines in formatted seq
124 s/[\d\s\t]//g; ## remove anything that is not alphabet char: preserve anything that is not explicitly specified for removal (Stefan Kirov)
125 # $_ = uc($_); ## uppercase sequence: NO. Keep the case. HL
126 $sequence .= $_;
128 ##If we parsed out a checksum, we might as well test it
130 if(defined $chksum) {
131 unless(_validate_checksum(uc($sequence),$chksum)) {
132 $self->throw("Checksum failure on parsed sequence.");
136 ## Remove whitespace from identifier because the constructor
137 ## will throw a warning otherwise...
138 if(defined $id) { $id =~ s/\s+//g;}
140 ## Turn our parsed "Type: N" or "Type: P" (if found) into the appropriate
141 ## keyword that the constructor expects...
142 if(defined $type) {
143 if($type eq "N") { $type = "dna"; }
144 if($type eq "P") { $type = "prot"; }
147 return $self->sequence_factory->create(-seq => $sequence,
148 -id => $id,
149 -desc => $desc,
150 -type => $type,
151 -dates => [ $date ]
155 =head2 write_seq
157 Title : write_seq
158 Usage : $stream->write_seq(@seq)
159 Function: writes the formatted $seq object into the stream
160 Returns : 1 for success and 0 for error
161 Args : array of Bio::PrimarySeqI object
164 =cut
166 sub write_seq {
167 my ($self,@seq) = @_;
168 for my $seq (@seq) {
169 $self->throw("Did not provide a valid Bio::PrimarySeqI object")
170 unless defined $seq && ref($seq) && $seq->isa('Bio::PrimarySeqI');
172 $self->warn("No whitespace allowed in GCG ID [". $seq->display_id. "]")
173 if $seq->display_id =~ /\s/;
175 my $str = $seq->seq;
176 my $comment = $seq->desc || '';
177 my $id = $seq->id;
178 my $type = ( $seq->alphabet() =~ /[dr]na/i ) ? 'N' : 'P';
179 my $timestamp;
181 if( $seq->can('get_dates') ) {
182 ($timestamp) = $seq->get_dates;
183 } else {
184 $timestamp = localtime(time);
186 my($sum,$offset,$len,$i,$j,$cnt,@out);
188 $len = length($str);
189 ## Set the offset if we have any non-standard numbering going on
190 $offset=1;
191 # checksum
192 $sum = $self->GCG_checksum($seq);
194 #Output the sequence header info
195 push(@out,"$comment\n");
196 push(@out,"$id Length: $len $timestamp Type: $type Check: $sum ..\n\n");
198 #Format the sequence
199 $i = $#out + 1;
200 for($j = 0 ; $j < $len ; ) {
201 if( $j % 50 == 0) {
202 $out[$i] = sprintf("%8d ",($j+$offset)); #numbering
204 $out[$i] .= sprintf("%s",substr($str,$j,10));
205 $j += 10;
206 if( $j < $len && $j % 50 != 0 ) {
207 $out[$i] .= " ";
208 }elsif($j % 50 == 0 ) {
209 $out[$i++] .= "\n\n";
212 local($^W) = 0;
213 if($j % 50 != 0 ) {
214 $out[$i] .= "\n";
216 $out[$i] .= "\n";
217 return unless $self->_print(@out);
220 $self->flush if $self->_flush_on_write && defined $self->_fh;
221 return 1;
224 =head2 GCG_checksum
226 Title : GCG_checksum
227 Usage : $cksum = $gcgio->GCG_checksum($seq);
228 Function : returns a gcg checksum for the sequence specified
230 This method can also be called as a class method.
231 Example :
232 Returns : a GCG checksum string
233 Argument : a Bio::PrimarySeqI implementing object
235 =cut
237 sub GCG_checksum {
238 my ($self,$seqobj) = @_;
239 my $index = 0;
240 my $checksum = 0;
241 my $char;
243 my $seq = $seqobj->seq();
244 $seq =~ tr/a-z/A-Z/;
246 foreach $char ( split(/[\.\-]*/, $seq)) {
247 $index++;
248 $checksum += ($index * (unpack("c",$char) || 0) );
249 if( $index == 57 ) {
250 $index = 0;
254 return ($checksum % 10000);
257 =head2 _validate_checksum
259 Title : _validate_checksum
260 Usage : n/a - internal method
261 Function: if parsed gcg sequence contains a checksum field
262 : we compare it to a value computed here on the parsed
263 : sequence. A checksum mismatch would indicate some
264 : type of parsing failure occured.
266 Returns : 1 for success, 0 for failure
267 Args : string containing parsed seq, value of parsed cheksum
270 =cut
272 sub _validate_checksum {
273 my($seq,$parsed_sum) = @_;
274 my($i,$len,$computed_sum,$cnt);
276 $len = length($seq);
278 #Generate the GCG Checksum value
280 for($i=0; $i<$len ;$i++) {
281 $cnt++;
282 $computed_sum += $cnt * ord(substr($seq,$i,1));
283 ($cnt == 57) && ($cnt=0);
285 $computed_sum %= 10000;
287 ## Compare and decide if success or failure
289 if($parsed_sum == $computed_sum) {
290 return 1;
291 } else { return 0; }