tag fourth (and hopefully last) alpha
[bioperl-live.git] / branch-1-6 / Bio / AlignIO / msf.pm
blob1cb446076561d23a0a5aa895592c8415abdf3db2
1 # $Id$
3 # BioPerl module for Bio::AlignIO::msf
4 # based on the Bio::SeqIO::msf module
5 # by Ewan Birney <birney@ebi.ac.uk>
6 # and Lincoln Stein <lstein@cshl.org>
8 # and the SimpleAlign.pm module of Ewan Birney
10 # Copyright Peter Schattner
12 # You may distribute this module under the same terms as perl itself
13 # _history
14 # September 5, 2000
15 # POD documentation - main docs before the code
17 =head1 NAME
19 Bio::AlignIO::msf - msf sequence input/output stream
21 =head1 SYNOPSIS
23 Do not use this module directly. Use it via the L<Bio::AlignIO> class.
25 =head1 DESCRIPTION
27 This object can transform L<Bio::Align::AlignI> objects to and from msf
28 flat file databases.
30 =head1 FEEDBACK
32 =head2 Support
34 Please direct usage questions or support issues to the mailing list:
36 I<bioperl-l@bioperl.org>
38 rather than to the module maintainer directly. Many experienced and
39 responsive experts will be able to look at the problem and quickly
40 address it. Please include a thorough description of the problem
41 with code and data examples if at all possible.
43 =head2 Reporting Bugs
45 Report bugs to the Bioperl bug tracking system to help us keep track
46 of the bugs and their resolution. Bug reports can be submitted via the
47 web:
49 http://bugzilla.open-bio.org/
51 =head1 AUTHORS - Peter Schattner
53 Email: schattner@alum.mit.edu
56 =head1 APPENDIX
58 The rest of the documentation details each of the object
59 methods. Internal methods are usually preceded with a _
61 =cut
63 # Let the code begin...
65 package Bio::AlignIO::msf;
66 use vars qw(%valid_type);
67 use strict;
69 use Bio::SeqIO::gcg; # for GCG_checksum()
70 use Bio::SimpleAlign;
72 use base qw(Bio::AlignIO);
BEGIN {
    # Maps a sequence alphabet name to the one-letter code that write_aln
    # puts in the "Type:" field of the MSF header line.
    %valid_type = (
        'dna'     => 'N',
        'rna'     => 'N',
        'protein' => 'P',
    );
}
78 =head2 next_aln
80 Title : next_aln
81 Usage : $aln = $stream->next_aln()
82 Function: returns the next alignment in the stream. Tries to read *all* MSF
83 files: it reads all non-whitespace characters in the alignment
84 area. For MSFs with weird gaps (eg ~~~) map them by using
85 $aln->map_chars('~','-')
86 Returns : Bio::Align::AlignI object
87 Args : NONE
89 =cut
sub next_aln {
    my $self = shift;

    # Sequence text collected so far, keyed by the name used in the header.
    # @names records the header order, since hash order is not preserved.
    my ( %hash, @names );

    my $aln = Bio::SimpleAlign->new( -source => 'gcg' );

    # Header section: register every "Name:" entry until the "//" separator.
    # The loop tests defined() rather than truth so that a line which is
    # false as a string (for example a final "0" without a newline) does not
    # end parsing early.  NOTE(review): this relies on _readline returning
    # undef at end of input - confirm against Bio::Root::IO.
    while ( defined( my $entry = $self->_readline ) ) {
        last if $entry =~ m{//};    # move to alignment section
        if ( $entry =~ /Name:\s+(\S+)/ ) {
            my $name = $1;
            $hash{$name} = "";      # start with an empty sequence
            push @names, $name;     # we need it ordered!
        }

        # otherwise - skip
    }

    # Alignment section: append each line's residues to its sequence.
    while ( defined( my $entry = $self->_readline ) ) {

        # Skip lines that start with whitespace followed by digits
        # (presumably the position ruler above each block).
        next if $entry =~ /^\s+(\d+)/;

        if ( $entry =~ /^\s*(\S+)\s+(.*)$/ ) {
            my ( $name, $str ) = ( $1, $2 );
            if ( !exists $hash{$name} ) {
                $self->throw("$name exists as an alignment line but not in the header. Not confident of what is going on!");
            }
            $str =~ s/\s//g;        # drop the spacing between 10-residue groups
            $str =~ s/~/-/g;        # '~' gaps are stored as '-'
            $hash{$name} .= $str;
        }
    }

    # No "Name:" entries were seen, so there is no alignment to return.
    return if scalar(@names) < 1;

    # now got this as a name - sequence hash. Let's make some sequences!
    foreach my $name (@names) {
        my ( $seqname, $start, $end );

        if ( $name =~ m{(\S+)/(\d+)-(\d+)} ) {

            # The name carries its own coordinates: id/start-end.
            ( $seqname, $start, $end ) = ( $1, $2, $3 );
        }
        else {
            $seqname = $name;
            $start   = 1;

            # Count only the characters kept by this filter (gap characters
            # such as '-' are removed) to derive the end coordinate.
            my $residues = $hash{$name};
            $residues =~ s/[^0-9A-Za-z$Bio::LocatableSeq::OTHER_SYMBOLS]//g;
            $end = length($residues);
        }

        my $seq = Bio::LocatableSeq->new(
            -seq   => $hash{$name},
            -id    => $seqname,
            -start => $start,
            -end   => $end,
        );

        $aln->add_seq($seq);
    }

    return $aln if $aln->num_sequences;
    return;
}
161 =head2 write_aln
163 Title : write_aln
164 Usage : $stream->write_aln(@aln)
165 Function: writes the $aln object into the stream in MSF format
166 Sequence type of the alignment is determined by the first sequence.
167 Returns : 1 for success and 0 for error
168 Args : Bio::Align::AlignI object
171 =cut
sub write_aln {
    my ( $self, @aln ) = @_;

    foreach my $aln (@aln) {
        if ( !$aln || !$aln->isa('Bio::Align::AlignI') ) {
            $self->warn("Must provide a Bio::Align::AlignI object when calling write_aln");
            next;
        }

        # Per-alignment state.  These must be reset for every alignment:
        # if they were shared across the loop, a second alignment would
        # inherit the first one's names and sequences and its block loop
        # would start from the first alignment's final offset.
        my $count = 0;    # offset of the next block of columns to print
        my %hash;         # display name => sequence string
        my @arr;          # display names in output order

        my $date    = localtime(time);
        my $type    = $valid_type{ $aln->get_seq_by_pos(1)->alphabet };
        my $maxname = $aln->maxdisplayname_length();
        my $length  = $aln->length();
        my $name    = $aln->id();
        if ( !defined $name ) {
            $name = "Align";
        }

        # Header line of the MSF file.
        $self->_print(
            sprintf(
                "\n%s MSF: %d Type: %s %s Check: 00 ..\n\n",
                $name, $aln->num_sequences, $type, $date
            )
        );

        # Column widths grow with the longest display name, with a floor
        # of 20 characters for the name column.
        my $seqCountFormat = "%" . ( $maxname > 20 ? $maxname + 2 : 22 ) . "s%-27d%27d\n";
        my $seqNameFormat  = "%-" . ( $maxname > 20 ? $maxname : 20 ) . "s ";

        # One "Name:" line per sequence; remember the sequence text for the
        # alignment blocks printed below.
        foreach my $seq ( $aln->each_seq() ) {
            $name = $aln->displayname( $seq->get_nse() );
            my $miss = $maxname - length($name);
            $miss += 2;
            my $pad = " " x $miss;

            $self->_print(
                sprintf(
                    " Name: %s%sLen: %d Check: %d Weight: 1.00\n",
                    $name, $pad,
                    length( $seq->seq() ),
                    Bio::SeqIO::gcg->GCG_checksum($seq)
                )
            );

            $hash{$name} = $seq->seq();
            push @arr, $name;
        }

        # ok - heavy handed, but there you go.
        $self->_print("\n//\n\n\n");

        # Alignment body: blocks of up to 50 columns, each row split into
        # groups of 10 characters.
        my $tempcount;
        while ( $count < $length ) {

            # there is another block to go!
            $self->_print( sprintf( $seqCountFormat, ' ', $count + 1, $count + 50 ) );

            foreach my $rowname (@arr) {
                $self->_print( sprintf( $seqNameFormat, $rowname ) );

                $tempcount = $count;
                my $index = 0;
                while ( ( $tempcount + 10 < $length ) && ( $index < 5 ) ) {
                    $self->_print( sprintf( "%s ", substr( $hash{$rowname}, $tempcount, 10 ) ) );
                    $tempcount += 10;
                    $index++;
                }

                # ok, could be the very last guy ;)
                if ( $index < 5 ) {

                    # space to print: emit whatever is left of the row
                    $self->_print( sprintf( "%s ", substr( $hash{$rowname}, $tempcount ) ) );
                    $tempcount += 10;
                }
                $self->_print("\n");
            }
            $self->_print("\n\n");
            $count = $tempcount;
        }
    }

    $self->flush if $self->_flush_on_write && defined $self->_fh;
    return 1;
}