match '*' and '?' as well
[bioperl-live.git] / Bio / AlignIO / msf.pm
blobaf7c4804cb3eb6d808aa63480895c7ac86c2f13f
1 # $Id$
3 # BioPerl module for Bio::AlignIO::msf
4 # based on the Bio::SeqIO::msf module
5 # by Ewan Birney <birney@ebi.ac.uk>
6 # and Lincoln Stein <lstein@cshl.org>
8 # and the SimpleAlign.pm module of Ewan Birney
10 # Copyright Peter Schattner
12 # You may distribute this module under the same terms as perl itself
13 # _history
14 # September 5, 2000
15 # POD documentation - main docs before the code
17 =head1 NAME
19 Bio::AlignIO::msf - msf sequence input/output stream
21 =head1 SYNOPSIS
23 Do not use this module directly. Use it via the L<Bio::AlignIO> class.
25 =head1 DESCRIPTION
27 This object can transform L<Bio::Align::AlignI> objects to and from msf
28 flat file databases.
30 =head1 FEEDBACK
32 =head2 Reporting Bugs
34 Report bugs to the Bioperl bug tracking system to help us keep track
35 of the bugs and their resolution. Bug reports can be submitted via the
36 web:
38 http://bugzilla.open-bio.org/
40 =head1 AUTHORS - Peter Schattner
42 Email: schattner@alum.mit.edu
45 =head1 APPENDIX
47 The rest of the documentation details each of the object
48 methods. Internal methods are usually preceded with a _
50 =cut
52 # Let the code begin...
54 package Bio::AlignIO::msf;
55 use vars qw(%valid_type);
56 use strict;
58 use Bio::SeqIO::gcg; # for GCG_checksum()
59 use Bio::SimpleAlign;
61 use base qw(Bio::AlignIO);
BEGIN {
    # Lookup table consulted by write_aln: sequence alphabet name => the
    # one-letter code printed in the "Type:" field of the MSF header line.
    %valid_type = (
        'dna'     => 'N',
        'rna'     => 'N',
        'protein' => 'P',
    );
}
=head2 next_aln

 Title   : next_aln
 Usage   : $aln = $stream->next_aln()
 Function: Returns the next alignment in the stream.  The header is
           scanned for "Name:" entries up to the "//" separator, and
           every remaining line of the stream is then read as alignment
           data.  All non-whitespace characters of an alignment row are
           kept; '~' gap characters are converted to '-'.
 Returns : the alignment object created with Bio::SimpleAlign->new, or
           undef if no "Name:" entries were found in the header
 Throws  : if an alignment row names a sequence that is not declared in
           the header
 Args    : NONE

=cut

sub next_aln {
    my $self = shift;

    my %seq_for;    # sequence name => concatenated alignment string
    my @names;      # sequence names in header order

    my $aln = Bio::SimpleAlign->new(-source => 'gcg' );

    # Header section: collect the declared sequence names until the '//'
    # separator.  defined() keeps a line consisting of just "0" from
    # being mistaken for end of input.
    while ( defined( my $entry = $self->_readline ) ) {
        last if $entry =~ m{//};    # move to alignment section
        if ( $entry =~ /Name:\s+(\S+)/ ) {
            my $name = $1;
            $seq_for{$name} = "";
            push @names, $name;     # we need it ordered!
        }
        # otherwise - skip
    }

    # Alignment section.
    while ( defined( my $entry = $self->_readline ) ) {
        # Skip coordinate rulers, i.e. lines holding nothing but one or two
        # integers.  Matching only on a leading number would also discard
        # indented rows whose sequence name begins with a digit.
        next if $entry =~ /^\s*\d+(?:\s+\d+)?\s*$/;

        if ( $entry =~ /^\s*(\S+)\s+(.*)$/ ) {
            my ( $name, $str ) = ( $1, $2 );
            if ( !exists $seq_for{$name} ) {
                $self->throw("$name exists as an alignment line but not in the header. Not confident of what is going on!");
            }
            $str =~ s/\s//g;    # drop the spacing between residue groups
            $str =~ s/~/-/g;    # normalise '~' gaps to '-'
            $seq_for{$name} .= $str;
        }
    }

    # Nothing declared in the header: no alignment to return.  The explicit
    # undef keeps the historical return value in every calling context.
    if ( scalar(@names) < 1 ) {
        return undef;
    }

    # now got this as a name - sequence hash. Let's make some sequences!
    foreach my $name (@names) {
        my ( $seqname, $start, $end );
        if ( $name =~ m{(\S+)/(\d+)-(\d+)} ) {
            # "id/start-end" style name carries its own coordinates
            ( $seqname, $start, $end ) = ( $1, $2, $3 );
        }
        else {
            $seqname = $name;
            $start   = 1;
            # end coordinate = number of characters left after stripping
            # everything that is not alphanumeric or an allowed symbol
            my $residues = $seq_for{$name};
            $residues =~ s/[^0-9A-Za-z$Bio::LocatableSeq::OTHER_SYMBOLS]//g;
            $end = length($residues);
        }

        my $seq = Bio::LocatableSeq->new(
            -seq   => $seq_for{$name},
            -id    => $seqname,
            -start => $start,
            -end   => $end,
        );
        $aln->add_seq($seq);
    }

    return $aln;
}
=head2 write_aln

 Title   : write_aln
 Usage   : $stream->write_aln(@aln)
 Function: Writes each alignment in @aln to the stream in MSF format.
           The sequence type of an alignment is determined by its first
           sequence.  Arguments that are not Bio::Align::AlignI objects
           are reported with a warning and skipped.
 Returns : 1
 Args    : one or more Bio::Align::AlignI objects

=cut

sub write_aln {
    my ($self,@aln) = @_;

    foreach my $aln (@aln) {
        if( ! $aln || ! $aln->isa('Bio::Align::AlignI') ) {
            $self->warn("Must provide a Bio::Align::AlignI object when calling write_aln");
            next;
        }

        # All bookkeeping is per alignment.  Keeping it outside this loop
        # would leak the previous alignment's names and column offset into
        # the next one when several alignments are written in one call.
        my $count = 0;    # first column (0-based) of the current block
        my %hash;         # display name => aligned sequence string
        my @arr;          # display names in output order

        my $date    = localtime(time);
        my $type    = $valid_type{$aln->get_seq_by_pos(1)->alphabet};
        my $maxname = $aln->maxdisplayname_length();
        my $length  = $aln->length();
        my $name    = $aln->id();
        if( !defined $name ) {
            $name = "Align";
        }

        $self->_print (sprintf("\n%s MSF: %d Type: %s %s Check: 00 ..\n\n",
                               $name, $aln->no_sequences, $type, $date));

        # Column widths follow the longest display name, with a minimum of 20.
        my $seqCountFormat = "%".($maxname > 20 ? $maxname + 2: 22)."s%-27d%27d\n";
        my $seqNameFormat = "%-".($maxname > 20 ? $maxname : 20)."s ";

        # Header: one "Name:" line per sequence.
        foreach my $seq ( $aln->each_seq() ) {
            my $seq_name = $aln->displayname($seq->get_nse());
            my $pad      = " " x ( $maxname - length($seq_name) + 2 );

            $self->_print (sprintf(" Name: %s%sLen: %d Check: %d Weight: 1.00\n",$seq_name,$pad,length($seq->seq()), Bio::SeqIO::gcg->GCG_checksum($seq)));

            $hash{$seq_name} = $seq->seq();
            push(@arr,$seq_name);
        }

        $self->_print ("\n//\n\n\n");

        # Body: blocks of up to 50 columns, each row split into groups of 10.
        while( $count < $length ) {
            # NOTE(review): the right-hand label is always $count+50, even
            # when the final block is shorter - confirm whether it should be
            # capped at $length.
            $self->_print (sprintf($seqCountFormat,' ',$count+1,$count+50));

            foreach my $row_name ( @arr ) {
                $self->_print (sprintf($seqNameFormat,$row_name));

                my $tempcount = $count;
                my $index     = 0;
                # full groups of 10 that lie strictly before the alignment end
                while( ($tempcount + 10 < $length) && ($index < 5) ) {
                    $self->_print (sprintf("%s ",substr($hash{$row_name},
                                                        $tempcount,10)));
                    $tempcount += 10;
                    $index++;
                }

                # fewer than 5 groups printed: this is the last block, so
                # emit whatever remains of the sequence
                if( $index < 5) {
                    $self->_print (sprintf("%s ",substr($hash{$row_name},$tempcount)));
                }
                $self->_print ("\n");
            }
            $self->_print ("\n\n");

            # A row either fills all 5 groups (advancing exactly 50 columns)
            # or reaches the alignment end, so stepping by 50 is equivalent
            # to following the row cursor and cannot loop forever when there
            # are no rows.
            $count += 50;
        }
    }
    $self->flush if $self->_flush_on_write && defined $self->_fh;
    return 1;
}