t/AlignIO/AlignIO.t: fix number of tests in plan (fixup c523e6bed866)
[bioperl-live.git] / Bio / AlignIO / msf.pm
blob17d0069605c775cc3ef250ff1cd54d0f5abdaa0f
2 # BioPerl module for Bio::AlignIO::msf
3 # based on the Bio::SeqIO::msf module
4 # by Ewan Birney <birney@ebi.ac.uk>
5 # and Lincoln Stein <lstein@cshl.org>
7 # and the SimpleAlign.pm module of Ewan Birney
9 # Copyright Peter Schattner
11 # You may distribute this module under the same terms as perl itself
12 # _history
13 # September 5, 2000
14 # POD documentation - main docs before the code
16 =head1 NAME
18 Bio::AlignIO::msf - msf sequence input/output stream
20 =head1 SYNOPSIS
22 Do not use this module directly. Use it via the L<Bio::AlignIO> class.
24 =head1 DESCRIPTION
26 This object can transform L<Bio::Align::AlignI> objects to and from msf
27 flat file databases.
29 =head1 FEEDBACK
31 =head2 Support
33 Please direct usage questions or support issues to the mailing list:
35 I<bioperl-l@bioperl.org>
37 rather than to the module maintainer directly. Many experienced and
38 responsive experts will be able to look at the problem and quickly
39 address it. Please include a thorough description of the problem
40 with code and data examples if at all possible.
42 =head2 Reporting Bugs
44 Report bugs to the Bioperl bug tracking system to help us keep track
45 of the bugs and their resolution. Bug reports can be submitted via the
46 web:
48 https://github.com/bioperl/bioperl-live/issues
50 =head1 AUTHORS - Peter Schattner
52 Email: schattner@alum.mit.edu
55 =head1 APPENDIX
57 The rest of the documentation details each of the object
58 methods. Internal methods are usually preceded with a _
60 =cut
62 # Let the code begin...
64 package Bio::AlignIO::msf;
65 use vars qw(%valid_type);
66 use strict;
68 use Bio::SeqIO::gcg; # for GCG_checksum()
69 use Bio::SimpleAlign;
71 use base qw(Bio::AlignIO);
BEGIN {
    # Lookup from a sequence alphabet name to the one-letter code that
    # write_aln() prints after "Type:" in the MSF header line.
    %valid_type = (
        'dna'     => 'N',
        'rna'     => 'N',
        'protein' => 'P',
    );
}
77 =head2 next_aln
79 Title : next_aln
80 Usage : $aln = $stream->next_aln()
81 Function: returns the next alignment in the stream. Tries to read *all* MSF
82 data: it reads all non-whitespace characters in the alignment
83 area. For MSFs with weird gaps (eg ~~~) map them by using
84 $aln->map_chars('~','-')
85 Returns : Bio::Align::AlignI object
86 Args : NONE
88 =cut
# next_aln: parse one MSF alignment from the input stream.
#
# The header is scanned up to the "//" divider, recording every "Name:" entry
# in file order.  The rest of the stream is then read as the alignment
# section: each "name residues..." line is appended to the string kept for
# that name, with whitespace removed and '~' converted to '-'.
#
# Args    : none
# Returns : a Bio::SimpleAlign object; a bare return (undef / empty list) when
#           the header held no "Name:" entries or no sequence was added
# Throws  : through $self->throw when an alignment line uses a name that did
#           not appear in the header
sub next_aln {
    my $self = shift;

    my %seq_for;    # name => accumulated alignment string
    my @names;      # header names, kept in file order

    my $aln = Bio::SimpleAlign->new(-source => 'gcg' );

    # Header section.  Loop on definedness rather than truth so that a final
    # line consisting of "0" without a newline does not end the read early.
    while ( defined( my $entry = $self->_readline ) ) {
        last if $entry =~ m{//};    # move to alignment section
        if ( $entry =~ /Name:\s+(\S+)/ ) {
            my $name = $1;
            $seq_for{$name} = "";
            push( @names, $name );    # we need it ordered!
        }
        # otherwise - skip
    }

    # Alignment section.
    while ( defined( my $entry = $self->_readline ) ) {
        # Skip lines that start with whitespace followed by digits
        # (presumably the position-number lines above each block).
        next if ( $entry =~ /^\s+(\d+)/ );

        if ( $entry =~ /^\s*(\S+)\s+(.*)$/ ) {
            my ( $name, $chunk ) = ( $1, $2 );
            if ( !exists $seq_for{$name} ) {
                $self->throw("$name exists as an alignment line but not in the header. Not confident of what is going on!");
            }
            $chunk =~ s/\s//g;
            $chunk =~ s/~/-/g;
            $seq_for{$name} .= $chunk;
        }
    }

    return if @names < 1;

    # Now we have a name => sequence hash; turn each entry into a sequence.
    for my $name (@names) {
        my ( $seqname, $start, $end );

        if ( $name =~ m{(\S+)/(\d+)-(\d+)} ) {
            # Name carries explicit coordinates: "id/start-end".
            ( $seqname, $start, $end ) = ( $1, $2, $3 );
        }
        else {
            $seqname = $name;
            $start   = 1;

            # End coordinate: count only characters that are alphanumeric or
            # listed in $Bio::LocatableSeq::OTHER_SYMBOLS.
            my $residues = $seq_for{$name};
            $residues =~ s/[^0-9A-Za-z$Bio::LocatableSeq::OTHER_SYMBOLS]//g;
            $end = length($residues);
        }

        my $seq = Bio::LocatableSeq->new(
            '-seq'        => $seq_for{$name},
            '-display_id' => $seqname,
            '-start'      => $start,
            '-end'        => $end,
            '-alphabet'   => $self->alphabet,
        );

        $aln->add_seq($seq);
    }

    return $aln if $aln->num_sequences;
    return;
}
157 =head2 write_aln
159 Title : write_aln
160 Usage : $stream->write_aln(@aln)
161 Function: writes the $aln object into the stream in MSF format
162 Sequence type of the alignment is determined by the first sequence.
163 Returns : 1 for success and 0 for error
164 Args : Bio::Align::AlignI object
167 =cut
# write_aln: write one or more alignments to the stream in MSF format.
#
# For each alignment this prints a header line (name, sequence count, type
# code and date), one " Name:" line per sequence, the "//" divider, and then
# the sequences in blocks of up to five 10-character chunks per row.  The
# type code is looked up in %valid_type from the alphabet of the first
# sequence.
#
# All working state (names, sequence strings, column counter) is created
# fresh for every alignment, so passing several alignments in one call does
# not let names or offsets from one leak into the next.
#
# Args    : one or more Bio::Align::AlignI objects; any argument that is
#           false or not a Bio::Align::AlignI is reported with $self->warn
#           and skipped
# Returns : 1
#
# NOTE(review): the spacing inside the format strings below is reproduced
# exactly as found; confirm against an MSF reference if exact column
# positions matter to downstream readers.
sub write_aln {
    my ( $self, @aln ) = @_;

    foreach my $aln (@aln) {
        if ( !$aln || !$aln->isa('Bio::Align::AlignI') ) {
            $self->warn("Must provide a Bio::Align::AlignI object when calling write_aln");
            next;
        }

        # Per-alignment state.
        my %seq_for;      # display name => sequence string
        my @names;        # display names in output order
        my $count = 0;    # number of columns already written

        my $date    = localtime(time);
        my $type    = $valid_type{ $aln->get_seq_by_pos(1)->alphabet };
        my $maxname = $aln->maxdisplayname_length();
        my $length  = $aln->length();

        my $aln_name = $aln->id();
        if ( !defined $aln_name ) {
            $aln_name = "Align";
        }

        $self->_print( sprintf( "\n%s MSF: %d Type: %s %s Check: 00 ..\n\n",
                                $aln_name, $aln->num_sequences, $type, $date ) );

        # Column widths grow with the longest display name, minimum 20.
        my $seqCountFormat = "%".($maxname > 20 ? $maxname + 2: 22)."s%-27d%27d\n";
        my $seqNameFormat  = "%-".($maxname > 20 ? $maxname : 20)."s ";

        foreach my $seq ( $aln->each_seq() ) {
            my $name = $aln->displayname( $seq->get_nse() );

            # Pad so the "Len:" column lines up across all names.
            my $pad = " " x ( $maxname - length($name) + 2 );

            $self->_print( sprintf( " Name: %s%sLen: %d Check: %d Weight: 1.00\n",
                                    $name, $pad, length $seq->seq(),
                                    Bio::SeqIO::gcg->GCG_checksum($seq) ) );

            $seq_for{$name} = $seq->seq();
            push( @names, $name );
        }

        $self->_print("\n//\n\n\n");

        while ( $count < $length ) {
            # There is another block to go.
            $self->_print( sprintf( $seqCountFormat, ' ', $count + 1, $count + 50 ) );

            my $pos = $count;    # column reached by the row just written
            foreach my $name (@names) {
                $self->_print( sprintf( $seqNameFormat, $name ) );

                $pos = $count;
                my $chunks = 0;

                # Full 10-character chunks, at most five per row.
                while ( ( $pos + 10 < $length ) && ( $chunks < 5 ) ) {
                    $self->_print( sprintf( "%s ", substr( $seq_for{$name}, $pos, 10 ) ) );
                    $pos += 10;
                    $chunks++;
                }

                # Fewer than five chunks written: this row ends the
                # alignment, so print whatever remains of the sequence.
                if ( $chunks < 5 ) {
                    $self->_print( sprintf( "%s ", substr( $seq_for{$name}, $pos ) ) );
                    $pos += 10;
                }
                $self->_print("\n");
            }
            $self->_print("\n\n");

            # No rows were written (no sequences), so the position cannot
            # advance; stop instead of looping forever.
            last if $pos == $count;
            $count = $pos;
        }
    }

    $self->flush if $self->_flush_on_write && defined $self->_fh;
    return 1;
}