sync with trunk (to r15946)
[bioperl-live.git] / Bio / SeqIO / gcg.pm
blob066b260bc9615ac3477e15be47efbd66040d6752
1 # $Id$
3 # BioPerl module for Bio::SeqIO::gcg
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Ewan Birney <birney@ebi.ac.uk>
8 # and Lincoln Stein <lstein@cshl.org>
10 # Copyright Ewan Birney & Lincoln Stein
12 # You may distribute this module under the same terms as perl itself
14 # _history
15 # October 18, 1999 Largely rewritten by Lincoln Stein
17 # POD documentation - main docs before the code
19 =head1 NAME
21 Bio::SeqIO::gcg - GCG sequence input/output stream
23 =head1 SYNOPSIS
25 Do not use this module directly. Use it via the Bio::SeqIO class.
27 =head1 DESCRIPTION
29 This object can transform Bio::Seq objects to and from GCG flat
30 file databases.
32 =head1 FEEDBACK
34 =head2 Mailing Lists
36 User feedback is an integral part of the evolution of this and other
37 Bioperl modules. Send your comments and suggestions preferably to one
38 of the Bioperl mailing lists. Your participation is much appreciated.
40 bioperl-l@bioperl.org - General discussion
41 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
43 =head2 Support
45 Please direct usage questions or support issues to the mailing list:
47 L<bioperl-l@bioperl.org>
49 rather than to the module maintainer directly. Many experienced and
50 reponsive experts will be able look at the problem and quickly
51 address it. Please include a thorough description of the problem
52 with code and data examples if at all possible.
54 =head2 Reporting Bugs
56 Report bugs to the Bioperl bug tracking system to help us keep track
57 the bugs and their resolution. Bug reports can be submitted via the web:
59 http://bugzilla.open-bio.org/
61 =head1 AUTHORS - Ewan Birney & Lincoln Stein
63 Email: E<lt>birney@ebi.ac.ukE<gt>
64 E<lt>lstein@cshl.orgE<gt>
66 =head1 CONTRIBUTORS
68 Jason Stajich, jason@bioperl.org
70 =head1 APPENDIX
72 The rest of the documentation details each of the object
73 methods. Internal methods are usually preceded with a _
75 =cut
77 # Let the code begin...
79 package Bio::SeqIO::gcg;
80 use strict;
82 use Bio::Seq::SeqFactory;
84 use base qw(Bio::SeqIO);
86 sub _initialize {
87 my($self,@args) = @_;
88 $self->SUPER::_initialize(@args);
89 if( ! defined $self->sequence_factory ) {
90 $self->sequence_factory(Bio::Seq::SeqFactory->new
91 (-verbose => $self->verbose(),
92 -type => 'Bio::Seq::RichSeq'));
96 =head2 next_seq
98 Title : next_seq
99 Usage : $seq = $stream->next_seq()
100 Function: returns the next sequence in the stream
101 Returns : Bio::Seq object
102 Args :
104 =cut
106 sub next_seq {
107 my ($self,@args) = @_;
108 my($id,$type,$desc,$line,$chksum,$sequence,$date,$len);
110 while( defined($_ = $self->_readline()) ) {
112 ## Get the descriptive info (anything before the line with '..')
113 unless( /\.\.$/ ) { $desc.= $_; }
114 ## Pull ID, Checksum & Type from the line containing '..'
115 /\.\.$/ && do { $line = $_; chomp;
116 if(/Check\:\s(\d+)\s/) { $chksum = $1; }
117 if(/Type:\s(\w)\s/) { $type = $1; }
118 if(/(\S+)\s+Length/)
119 { $id = $1; }
120 if(/Length:\s+(\d+)\s+(\S.+\S)\s+Type/ )
121 { $len = $1; $date = $2;}
122 last;
125 return if ( !defined $_);
126 chomp($desc); # remove last "\n"
128 while( defined($_ = $self->_readline()) ) {
130 ## This is where we grab the sequence info.
132 if( /\.\.$/ ) {
133 $self->throw("Looks like start of another sequence. See documentation. ");
136 next if($_ eq "\n"); ## skip whitespace lines in formatted seq
137 s/[\d\s\t]//g; ## remove anything that is not alphabet char: preserve anything that is not explicitly specified for removal (Stefan Kirov)
138 # $_ = uc($_); ## uppercase sequence: NO. Keep the case. HL
139 $sequence .= $_;
141 ##If we parsed out a checksum, we might as well test it
143 if(defined $chksum) {
144 unless(_validate_checksum(uc($sequence),$chksum)) {
145 $self->throw("Checksum failure on parsed sequence.");
149 ## Remove whitespace from identifier because the constructor
150 ## will throw a warning otherwise...
151 if(defined $id) { $id =~ s/\s+//g;}
153 ## Turn our parsed "Type: N" or "Type: P" (if found) into the appropriate
154 ## keyword that the constructor expects...
155 if(defined $type) {
156 if($type eq "N") { $type = "dna"; }
157 if($type eq "P") { $type = "prot"; }
160 return $self->sequence_factory->create(-seq => $sequence,
161 -id => $id,
162 -desc => $desc,
163 -type => $type,
164 -dates => [ $date ]
168 =head2 write_seq
170 Title : write_seq
171 Usage : $stream->write_seq(@seq)
172 Function: writes the formatted $seq object into the stream
173 Returns : 1 for success and 0 for error
174 Args : array of Bio::PrimarySeqI object
177 =cut
179 sub write_seq {
180 my ($self,@seq) = @_;
181 for my $seq (@seq) {
182 $self->throw("Did not provide a valid Bio::PrimarySeqI object")
183 unless defined $seq && ref($seq) && $seq->isa('Bio::PrimarySeqI');
185 $self->warn("No whitespace allowed in GCG ID [". $seq->display_id. "]")
186 if $seq->display_id =~ /\s/;
188 my $str = $seq->seq;
189 my $comment = $seq->desc || '';
190 my $id = $seq->id;
191 my $type = ( $seq->alphabet() =~ /[dr]na/i ) ? 'N' : 'P';
192 my $timestamp;
194 if( $seq->can('get_dates') ) {
195 ($timestamp) = $seq->get_dates;
196 } else {
197 $timestamp = localtime(time);
199 my($sum,$offset,$len,$i,$j,$cnt,@out);
201 $len = length($str);
202 ## Set the offset if we have any non-standard numbering going on
203 $offset=1;
204 # checksum
205 $sum = $self->GCG_checksum($seq);
207 #Output the sequence header info
208 push(@out,"$comment\n");
209 push(@out,"$id Length: $len $timestamp Type: $type Check: $sum ..\n\n");
211 #Format the sequence
212 $i = $#out + 1;
213 for($j = 0 ; $j < $len ; ) {
214 if( $j % 50 == 0) {
215 $out[$i] = sprintf("%8d ",($j+$offset)); #numbering
217 $out[$i] .= sprintf("%s",substr($str,$j,10));
218 $j += 10;
219 if( $j < $len && $j % 50 != 0 ) {
220 $out[$i] .= " ";
221 }elsif($j % 50 == 0 ) {
222 $out[$i++] .= "\n\n";
225 local($^W) = 0;
226 if($j % 50 != 0 ) {
227 $out[$i] .= "\n";
229 $out[$i] .= "\n";
230 return unless $self->_print(@out);
233 $self->flush if $self->_flush_on_write && defined $self->_fh;
234 return 1;
237 =head2 GCG_checksum
239 Title : GCG_checksum
240 Usage : $cksum = $gcgio->GCG_checksum($seq);
241 Function : returns a gcg checksum for the sequence specified
243 This method can also be called as a class method.
244 Example :
245 Returns : a GCG checksum string
246 Argument : a Bio::PrimarySeqI implementing object
248 =cut
250 sub GCG_checksum {
251 my ($self,$seqobj) = @_;
252 my $index = 0;
253 my $checksum = 0;
254 my $char;
256 my $seq = $seqobj->seq();
257 $seq =~ tr/a-z/A-Z/;
259 foreach $char ( split(/[\.\-]*/, $seq)) {
260 $index++;
261 $checksum += ($index * (unpack("c",$char) || 0) );
262 if( $index == 57 ) {
263 $index = 0;
267 return ($checksum % 10000);
270 =head2 _validate_checksum
272 Title : _validate_checksum
273 Usage : n/a - internal method
274 Function: if parsed gcg sequence contains a checksum field
275 : we compare it to a value computed here on the parsed
276 : sequence. A checksum mismatch would indicate some
277 : type of parsing failure occured.
279 Returns : 1 for success, 0 for failure
280 Args : string containing parsed seq, value of parsed cheksum
283 =cut
285 sub _validate_checksum {
286 my($seq,$parsed_sum) = @_;
287 my($i,$len,$computed_sum,$cnt);
289 $len = length($seq);
291 #Generate the GCG Checksum value
293 for($i=0; $i<$len ;$i++) {
294 $cnt++;
295 $computed_sum += $cnt * ord(substr($seq,$i,1));
296 ($cnt == 57) && ($cnt=0);
298 $computed_sum %= 10000;
300 ## Compare and decide if success or failure
302 if($parsed_sum == $computed_sum) {
303 return 1;
304 } else { return 0; }