nexml.t: Added a missing XML::Twig requirement.
[bioperl-live.git] / Bio / AlignIO / emboss.pm
blob655d69e9e78dad050b8fa0200c73c726fc3bf5b3
2 # BioPerl module for Bio::AlignIO::emboss
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason@bioperl.org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::AlignIO::emboss - Parse EMBOSS alignment output (from applications water and needle)
18 =head1 SYNOPSIS
20 # do not use the object directly
21 use Bio::AlignIO;
22 # read in an alignment from the EMBOSS program water
23 my $in = Bio::AlignIO->new(-format => 'emboss',
24 -file => 'seq.water');
25 while( my $aln = $in->next_aln ) {
26 # do something with the alignment
29 =head1 DESCRIPTION
31 This object handles parsing and writing pairwise sequence alignments
32 from the EMBOSS suite.
34 =head1 FEEDBACK
36 =head2 Mailing Lists
38 User feedback is an integral part of the evolution of this and other
39 Bioperl modules. Send your comments and suggestions preferably to
40 the Bioperl mailing list. Your participation is much appreciated.
42 bioperl-l@bioperl.org - General discussion
43 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
45 =head2 Support
47 Please direct usage questions or support issues to the mailing list:
49 I<bioperl-l@bioperl.org>
51 rather than to the module maintainer directly. Many experienced and
52 reponsive experts will be able look at the problem and quickly
53 address it. Please include a thorough description of the problem
54 with code and data examples if at all possible.
56 =head2 Reporting Bugs
58 Report bugs to the Bioperl bug tracking system to help us keep track
59 of the bugs and their resolution. Bug reports can be submitted via the
60 web:
62 https://github.com/bioperl/bioperl-live/issues
64 =head1 AUTHOR - Jason Stajich
66 Email jason@bioperl.org
68 =head1 APPENDIX
70 The rest of the documentation details each of the object methods.
71 Internal methods are usually preceded with a _
73 =cut
76 # Let the code begin...
79 package Bio::AlignIO::emboss;
80 use vars qw($EMBOSSTitleLen $EMBOSSLineLen);
81 use strict;
83 use Bio::LocatableSeq;
85 use base qw(Bio::AlignIO);
87 BEGIN {
88 $EMBOSSTitleLen = 13;
89 $EMBOSSLineLen = 50;
92 sub _initialize {
93 my($self,@args) = @_;
94 $self->SUPER::_initialize(@args);
95 $self->{'_type'} = undef;
98 =head2 next_aln
100 Title : next_aln
101 Usage : $aln = $stream->next_aln()
102 Function: returns the next alignment in the stream.
103 Returns : L<Bio::Align::AlignI> object - returns 0 on end of file
104 or on error
105 Args : NONE
107 =cut
109 sub next_aln {
110 my ($self) = @_;
111 my $seenbegin = 0;
112 my %data = ( 'seq1' => {
113 'start'=> undef,
114 'end'=> undef,
115 'name' => '',
116 'data' => '' },
117 'seq2' => {
118 'start'=> undef,
119 'end'=> undef,
120 'name' => '',
121 'data' => '' },
122 'align' => '',
123 'type' => $self->{'_type'}, # to restore type from
124 # previous aln if possible
126 my %names;
127 while( defined($_ = $self->_readline) ) {
128 next if( /^\#?\s+$/ || /^\#+\s*$/ );
129 if( /^\#(\=|\-)+\s*$/) {
130 last if( $seenbegin);
131 } elsif( /(Local|Global):\s*(\S+)\s+vs\s+(\S+)/ ||
132 /^\#\s+Program:\s+(\S+)/ )
134 my ($name1,$name2) = ($2,$3);
135 if( ! defined $name1 ) { # Handle EMBOSS 2.2.X
136 $data{'type'} = $1;
137 $name1 = $name2 = '';
138 } else {
139 $data{'type'} = $1 eq 'Local' ? 'water' : 'needle';
141 $data{'seq1'}->{'name'} = $name1;
142 $data{'seq2'}->{'name'} = $name2;
144 $self->{'_type'} = $data{'type'};
146 } elsif( /Score:\s+(\S+)/ ) {
147 $data{'score'} = $1;
148 } elsif( /^\#\s+(1|2):\s+(\S+)/ && ! $data{"seq$1"}->{'name'} ) {
149 my $nm = $2;
150 $nm = substr($nm,0,$EMBOSSTitleLen); # emboss has a max seq length
151 if( $names{$nm} ) {
152 $nm .= "-". $names{$nm};
154 $names{$nm}++;
155 $data{"seq$1"}->{'name'} = $nm;
156 } elsif( $data{'seq1'}->{'name'} &&
157 /^\Q$data{'seq1'}->{'name'}/ ) {
158 my $count = 0;
159 $seenbegin = 1;
160 my @current;
161 while( defined ($_) ) {
162 my $align_other = '';
163 my $delayed;
164 if($count == 0 || $count == 2 ) {
165 my @l = split;
166 my ($seq,$align,$start,$end);
167 if( $count == 2 && $data{'seq2'}->{'name'} eq '' ) {
168 # weird boundary condition
169 ($start,$align,$end) = @l;
170 } elsif( @l == 3 ) {
171 $align = '';
172 ($seq,$start,$end) = @l
173 } else {
174 ($seq,$start,$align,$end) = @l;
177 my $seqname = sprintf("seq%d", ($count == 0) ? '1' : '2');
178 $data{$seqname}->{'data'} .= $align;
179 $data{$seqname}->{'start'} ||= $start;
180 $data{$seqname}->{'end'} = $end;
181 $current[$count] = [ $start,$align || ''];
182 } else {
183 s/^\s+//;
184 s/\s+$//;
185 $data{'align'} .= $_;
188 BOTTOM:
189 last if( $count++ == 2);
190 $_ = $self->_readline();
193 if( $data{'type'} eq 'needle' ) {
194 # which ever one is shorter we want to bring it up to
195 # length. Man this stinks.
196 my ($s1,$s2) = ($data{'seq1'}, $data{'seq2'});
198 my $d = length($current[0]->[1]) - length($current[2]->[1]);
199 if( $d < 0 ) { # s1 is smaller, need to add some
200 # compare the starting points for this alignment line
201 if( $current[0]->[0] <= 1 ) {
202 $s1->{'data'} = ('-' x abs($d)) . $s1->{'data'};
203 $data{'align'} = (' 'x abs($d)).$data{'align'};
204 } else {
205 $s1->{'data'} .= '-' x abs($d);
206 $data{'align'} .= ' 'x abs($d);
208 } elsif( $d > 0) { # s2 is smaller, need to add some
209 if( $current[2]->[0] <= 1 ) {
210 $s2->{'data'} = ('-' x abs($d)) . $s2->{'data'};
211 $data{'align'} = (' 'x abs($d)).$data{'align'};
212 } else {
213 $s2->{'data'} .= '-' x abs($d);
214 $data{'align'} .= ' 'x abs($d);
221 return unless $seenbegin;
222 my $aln = Bio::SimpleAlign->new(-verbose => $self->verbose(),
223 -score => $data{'score'},
224 -source => "EMBOSS-".$data{'type'});
226 foreach my $seqname ( qw(seq1 seq2) ) {
227 return unless ( defined $data{$seqname} );
228 $data{$seqname}->{'name'} ||= $seqname;
229 my $seq = Bio::LocatableSeq->new
230 ('-seq' => $data{$seqname}->{'data'},
231 '-display_id' => $data{$seqname}->{'name'},
232 '-start' => $data{$seqname}->{'start'},
233 '-end' => $data{$seqname}->{'end'},
234 '-alphabet' => $self->alphabet,
236 $aln->add_seq($seq);
238 return $aln;
241 =head2 write_aln
243 Title : write_aln
244 Usage : $stream->write_aln(@aln)
245 Function: writes the $aln object into the stream in emboss format
246 Returns : 1 for success and 0 for error
247 Args : L<Bio::Align::AlignI> object
250 =cut
252 sub write_aln {
253 my ($self,@aln) = @_;
255 $self->throw("Sorry: writing emboss output is not currently available! \n");