t/AlignIO/AlignIO.t: fix number of tests in plan (fixup c523e6bed866)
[bioperl-live.git] / Bio / SeqIO / gcg.pm
blobc95059aa1470c496bbcb7d452a3597846597ec9c
2 # BioPerl module for Bio::SeqIO::gcg
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Ewan Birney <birney@ebi.ac.uk>
7 # and Lincoln Stein <lstein@cshl.org>
9 # Copyright Ewan Birney & Lincoln Stein
11 # You may distribute this module under the same terms as perl itself
13 # _history
14 # October 18, 1999 Largely rewritten by Lincoln Stein
16 # POD documentation - main docs before the code
18 =head1 NAME
20 Bio::SeqIO::gcg - GCG sequence input/output stream
22 =head1 SYNOPSIS
24 Do not use this module directly. Use it via the Bio::SeqIO class.
26 =head1 DESCRIPTION
28 This object can transform Bio::Seq objects to and from GCG flat
29 file databases.
31 =head1 FEEDBACK
33 =head2 Mailing Lists
35 User feedback is an integral part of the evolution of this and other
36 Bioperl modules. Send your comments and suggestions preferably to one
37 of the Bioperl mailing lists. Your participation is much appreciated.
39 bioperl-l@bioperl.org - General discussion
40 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
42 =head2 Support
44 Please direct usage questions or support issues to the mailing list:
46 I<bioperl-l@bioperl.org>
48 rather than to the module maintainer directly. Many experienced and
49 reponsive experts will be able look at the problem and quickly
50 address it. Please include a thorough description of the problem
51 with code and data examples if at all possible.
53 =head2 Reporting Bugs
55 Report bugs to the Bioperl bug tracking system to help us keep track
56 the bugs and their resolution. Bug reports can be submitted via the web:
58 https://github.com/bioperl/bioperl-live/issues
60 =head1 AUTHORS - Ewan Birney & Lincoln Stein
62 Email: E<lt>birney@ebi.ac.ukE<gt>
63 E<lt>lstein@cshl.orgE<gt>
65 =head1 CONTRIBUTORS
67 Jason Stajich, jason@bioperl.org
69 =head1 APPENDIX
71 The rest of the documentation details each of the object
72 methods. Internal methods are usually preceded with a _
74 =cut
76 # Let the code begin...
78 package Bio::SeqIO::gcg;
79 use strict;
81 use Bio::Seq::SeqFactory;
83 use base qw(Bio::SeqIO);
85 sub _initialize {
86 my($self,@args) = @_;
87 $self->SUPER::_initialize(@args);
88 if( ! defined $self->sequence_factory ) {
89 $self->sequence_factory(Bio::Seq::SeqFactory->new
90 (-verbose => $self->verbose(),
91 -type => 'Bio::Seq::RichSeq'));
95 =head2 next_seq
97 Title : next_seq
98 Usage : $seq = $stream->next_seq()
99 Function: returns the next sequence in the stream
100 Returns : Bio::Seq object
101 Args :
103 =cut
105 sub next_seq {
106 my ($self,@args) = @_;
107 my($id,$type,$desc,$line,$chksum,$sequence,$date,$len);
109 while( defined($_ = $self->_readline()) ) {
111 ## Get the descriptive info (anything before the line with '..')
112 unless( /\.\.$/ ) { $desc.= $_; }
113 ## Pull ID, Checksum & Type from the line containing '..'
114 /\.\.$/ && do { $line = $_; chomp;
115 if(/Check\:\s(\d+)\s/) { $chksum = $1; }
116 if(/Type:\s(\w)\s/) { $type = $1; }
117 if(/(\S+)\s+Length/)
118 { $id = $1; }
119 if(/Length:\s+(\d+)\s+(\S.+\S)\s+Type/ )
120 { $len = $1; $date = $2;}
121 last;
124 return if ( !defined $_);
125 chomp($desc); # remove last "\n"
127 while( defined($_ = $self->_readline()) ) {
129 ## This is where we grab the sequence info.
131 if( /\.\.$/ ) {
132 $self->throw("Looks like start of another sequence. See documentation. ");
135 next if($_ eq "\n"); ## skip whitespace lines in formatted seq
136 s/[\d\s\t]//g; ## remove anything that is not alphabet char: preserve anything that is not explicitly specified for removal (Stefan Kirov)
137 # $_ = uc($_); ## uppercase sequence: NO. Keep the case. HL
138 $sequence .= $_;
140 ##If we parsed out a checksum, we might as well test it
142 if(defined $chksum) {
143 unless(_validate_checksum(uc($sequence),$chksum)) {
144 $self->throw("Checksum failure on parsed sequence.");
148 ## Remove whitespace from identifier because the constructor
149 ## will throw a warning otherwise...
150 if(defined $id) { $id =~ s/\s+//g;}
152 ## Turn our parsed "Type: N" or "Type: P" (if found) into the appropriate
153 ## keyword that the constructor expects...
154 if(defined $type) {
155 if($type eq "N") { $type = "dna"; }
156 if($type eq "P") { $type = "prot"; }
159 return $self->sequence_factory->create(-seq => $sequence,
160 -id => $id,
161 -desc => $desc,
162 -type => $type,
163 -dates => [ $date ]
167 =head2 write_seq
169 Title : write_seq
170 Usage : $stream->write_seq(@seq)
171 Function: writes the formatted $seq object into the stream
172 Returns : 1 for success and 0 for error
173 Args : array of Bio::PrimarySeqI object
176 =cut
178 sub write_seq {
179 my ($self,@seq) = @_;
180 for my $seq (@seq) {
181 $self->throw("Did not provide a valid Bio::PrimarySeqI object")
182 unless defined $seq && ref($seq) && $seq->isa('Bio::PrimarySeqI');
184 $self->warn("No whitespace allowed in GCG ID [". $seq->display_id. "]")
185 if $seq->display_id =~ /\s/;
187 my $str = $seq->seq;
188 my $comment = $seq->desc || '';
189 my $id = $seq->id;
190 my $type = ( $seq->alphabet() =~ /[dr]na/i ) ? 'N' : 'P';
191 my $timestamp;
193 if( $seq->can('get_dates') ) {
194 ($timestamp) = $seq->get_dates;
195 } else {
196 $timestamp = localtime(time);
198 my($sum,$offset,$len,$i,$j,$cnt,@out);
200 $len = length($str);
201 ## Set the offset if we have any non-standard numbering going on
202 $offset=1;
203 # checksum
204 $sum = $self->GCG_checksum($seq);
206 #Output the sequence header info
207 push(@out,"$comment\n");
208 push(@out,"$id Length: $len $timestamp Type: $type Check: $sum ..\n\n");
210 #Format the sequence
211 $i = $#out + 1;
212 for($j = 0 ; $j < $len ; ) {
213 if( $j % 50 == 0) {
214 $out[$i] = sprintf("%8d ",($j+$offset)); #numbering
216 $out[$i] .= sprintf("%s",substr($str,$j,10));
217 $j += 10;
218 if( $j < $len && $j % 50 != 0 ) {
219 $out[$i] .= " ";
220 }elsif($j % 50 == 0 ) {
221 $out[$i++] .= "\n\n";
224 local($^W) = 0;
225 if($j % 50 != 0 ) {
226 $out[$i] .= "\n";
228 $out[$i] .= "\n";
229 return unless $self->_print(@out);
232 $self->flush if $self->_flush_on_write && defined $self->_fh;
233 return 1;
236 =head2 GCG_checksum
238 Title : GCG_checksum
239 Usage : $cksum = $gcgio->GCG_checksum($seq);
240 Function : returns a gcg checksum for the sequence specified
242 This method can also be called as a class method.
243 Example :
244 Returns : a GCG checksum string
245 Argument : a Bio::PrimarySeqI implementing object
247 =cut
249 sub GCG_checksum {
250 my ($self,$seqobj) = @_;
251 my $index = 0;
252 my $checksum = 0;
253 my $char;
255 my $seq = $seqobj->seq();
256 $seq =~ tr/a-z/A-Z/;
258 foreach $char ( split(/[\.\-]*/, $seq)) {
259 $index++;
260 $checksum += ($index * (unpack("c",$char) || 0) );
261 if( $index == 57 ) {
262 $index = 0;
266 return ($checksum % 10000);
269 =head2 _validate_checksum
271 Title : _validate_checksum
272 Usage : n/a - internal method
273 Function: if parsed gcg sequence contains a checksum field
274 : we compare it to a value computed here on the parsed
275 : sequence. A checksum mismatch would indicate some
276 : type of parsing failure occurred.
278 Returns : 1 for success, 0 for failure
279 Args : string containing parsed seq, value of parsed checksum
282 =cut
284 sub _validate_checksum {
285 my($seq,$parsed_sum) = @_;
286 my($i,$len,$computed_sum,$cnt);
288 $len = length($seq);
290 #Generate the GCG Checksum value
292 for($i=0; $i<$len ;$i++) {
293 $cnt++;
294 $computed_sum += $cnt * ord(substr($seq,$i,1));
295 ($cnt == 57) && ($cnt=0);
297 $computed_sum %= 10000;
299 ## Compare and decide if success or failure
301 if($parsed_sum == $computed_sum) {
302 return 1;
303 } else { return 0; }