add some tests for RT ticket, not a true bug but indicates parsing is correct
[bioperl-live.git] / Bio / AlignIO / po.pm
blob2975099354de3638b43cc97cd9e9e7300f4c13c8
1 # $Id: po.pm
3 # BioPerl module for Bio::AlignIO::po
5 # based on the Bio::AlignIO::fasta module
6 # by Peter Schattner (and others?)
8 # and the SimpleAlign.pm module of Ewan Birney
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::AlignIO::po - po MSA Sequence input/output stream
18 =head1 SYNOPSIS
20 Do not use this module directly. Use it via the L<Bio::AlignIO> class.
22 =head1 DESCRIPTION
24 This object can transform L<Bio::SimpleAlign> objects to and from
25 'po' format flat file databases. 'po' format is the native format of
26 the POA alignment program (Lee C, Grasso C, Sharlow MF, 'Multiple
27 sequence alignment using partial order graphs', Bioinformatics (2002),
28 18(3):452-64).
30 =head1 FEEDBACK
32 =head2 Support
34 Please direct usage questions or support issues to the mailing list:
36 I<bioperl-l@bioperl.org>
38 rather than to the module maintainer directly. Many experienced and
39 reponsive experts will be able look at the problem and quickly
40 address it. Please include a thorough description of the problem
41 with code and data examples if at all possible.
43 =head2 Reporting Bugs
45 Report bugs to the Bioperl bug tracking system to help us keep track
46 the bugs and their resolution. Bug reports can be submitted via the
47 web:
49 https://github.com/bioperl/bioperl-live/issues
51 =head1 AUTHORS - Matthew Betts
53 Email: matthew.betts@ii.uib.no
55 =head1 APPENDIX
57 The rest of the documentation details each of the object
58 methods. Internal methods are usually preceded with a _
60 =cut
62 # Let the code begin...
64 package Bio::AlignIO::po;
65 use strict;
67 use Bio::SimpleAlign;
69 use base qw(Bio::AlignIO);
72 =head2 next_aln
74 Title : next_aln
75 Usage : $aln = $stream->next_aln()
76 Function: returns the next alignment in the stream.
77 Returns : L<Bio::Align::AlignI> object - returns undef on end of file
78 or on error
79 Args : NONE
81 =cut
83 sub next_aln {
84 my $self = shift;
86 my $aln;
87 my $entry;
88 my $name;
89 my $seqs;
90 my $seq;
91 my $nodes;
92 my $list;
93 my $node;
94 my @chars;
95 my $s;
96 my $a;
98 $aln = Bio::SimpleAlign->new();
100 # get to the first 'VERSION' line
101 while(defined($entry = $self->_readline)) {
102 if($entry =~ /^VERSION=(\S+)/) {
103 $aln->source($1);
105 if(defined($entry = $self->_readline) and $entry =~ /^NAME=(\S+)/) {
106 $aln->id($1);
109 last;
113 # read in the sequence names and node data, up to the end of
114 # the file or the next 'VERSION' line, whichever comes first
115 $seqs = [];
116 $nodes = [];
117 while(defined($entry = $self->_readline)) {
118 if($entry =~ /^VERSION/) {
119 # start of a new alignment, so...
120 $self->_pushback($entry);
121 last;
123 elsif($entry =~ /^SOURCENAME=(\S+)/) {
124 $name = $1;
126 if($name =~ /(\S+)\/(\d+)-(\d+)/) {
127 $seq = Bio::LocatableSeq->new(
128 '-display_id' => $1,
129 '-start' => $2,
130 '-end' => $3,
131 '-alphabet' => $self->alphabet,
134 } else {
135 $seq = Bio::LocatableSeq->new('-display_id'=> $name,
136 '-alphabet' => $self->alphabet);
139 # store sequences in a list initially, because can't guarantee
140 # that will get them back from SimpleAlign object in the order
141 # they were read, and also can't add them to the SimpleAlign
142 # object here because their sequences are currently unknown
143 push @{$seqs}, {
144 'seq' => $seq,
145 'str' => '',
148 elsif($entry =~ /^SOURCEINFO=(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(.*)/) {
149 $seq->desc($5);
151 elsif($entry =~ /^(\S):(\S+)/) {
152 $node = {
153 'aa' => $1,
154 'L' => [],
155 'S' => [],
156 'A' => [],
157 'status' => 'unvisited',
160 $list = $2;
161 if($list =~ /^([L\d]*)([S\d]*)([A\d]*)/) {
162 push(@{$node->{'L'}}, split(/L/, $1));
163 push(@{$node->{'S'}}, split(/S/, $2));
164 push(@{$node->{'A'}}, split(/A/, $3));
166 (@{$node->{'L'}} > 0) and shift @{$node->{'L'}};
167 (@{$node->{'S'}} > 0) and shift @{$node->{'S'}};
168 (@{$node->{'A'}} > 0) and shift @{$node->{'A'}};
171 push @{$nodes}, $node;
175 # process the nodes
176 foreach $node (@{$nodes}) {
177 ($node->{'status'} ne 'unvisited') and next;
179 @chars = ($aln->gap_char) x @{$seqs}; # char for each seq defaults to a gap
181 # set the character for each sequence represented by this node
182 foreach $s (@{$node->{'S'}}) {
183 $chars[$s] = $node->{'aa'};
185 $node->{'status'} = 'visited';
187 # do the same for each node in the same align ring
188 while(defined($a = $node->{'A'}->[0])) {
189 $node = $nodes->[$a];
190 ($node->{'status'} ne 'unvisited') and last;
192 foreach $s (@{$node->{'S'}}) {
193 $chars[$s] = $node->{'aa'};
196 $node->{'status'} = 'visited';
199 # update the sequences
200 foreach $seq (@{$seqs}) {
201 $seq->{'str'} .= shift @chars;
205 # set the sequences of the bioperl objects
206 # and add them to the alignment
207 foreach $seq (@{$seqs}) {
208 $seq->{'seq'}->seq($seq->{'str'});
209 $aln->add_seq($seq->{'seq'});
212 # has an alignment been read?...
214 return $aln if $aln->num_sequences;
215 return;
218 =head2 write_aln
220 Title : write_aln
221 Usage : $stream->write_aln(@aln)
222 Function: writes the $aln object into the stream in po format
223 Returns : 1 for success and 0 for error
224 Args : L<Bio::Align::AlignI> object
226 =cut
228 sub write_aln {
229 my $self = shift;
230 my @alns = @_;
232 my $aln;
233 my $seqs;
234 my $nodes;
235 my $seq;
236 my $node;
237 my $col;
238 my $ring;
239 my $i;
240 my $char;
242 foreach $aln (@alns) {
243 if(!$aln or !$aln->isa('Bio::Align::AlignI')) {
244 $self->warn("Must provide a Bio::Align::AlignI object when calling write_aln");
245 next;
248 # store the seqs on a list, because po format
249 # refers to them by position on this list
250 $seqs = [];
251 foreach $seq ($aln->each_seq()) {
252 push @{$seqs}, {
253 'seq' => $seq,
254 'n_nodes' => 0,
255 'first' => undef,
256 'previous' => undef,
260 # go through each column in the alignment and construct
261 # the nodes for the equivalent poa alignment ring
262 $nodes = [];
263 for($col = 0; $col < $aln->length; $col++) {
264 $ring = {
265 'nodes' => {},
266 'first' => scalar @{$nodes},
267 'last' => scalar @{$nodes},
270 for($i = 0; $i < @{$seqs}; $i++) {
271 $seq = $seqs->[$i];
273 $char = $seq->{'seq'}->subseq($col + 1, $col + 1);
275 ($char eq $aln->gap_char) and next;
277 if(!defined($node = $ring->{'nodes'}->{$char})) {
278 $node = {
279 'n' => scalar @{$nodes},
280 'aa' => $char,
281 'L' => {},
282 'S' => [],
283 'A' => [],
286 # update the ring
287 $ring->{'nodes'}->{$char} = $node;
288 $ring->{'last'} = $node->{'n'};
290 # add the node to the node list
291 push @{$nodes}, $node;
294 # add the sequence to the node
295 push @{$node->{'S'}}, $i;
297 # add the node to the sequence
298 defined($seq->{'first'}) or ($seq->{'first'} = $node);
299 $seq->{'n_nodes'}++;
301 # add an edge from the previous node in the sequence to this one.
302 # Then set the previous node to the current one, ready for the next
303 # residue in this sequence
304 defined($seq->{'previous'}) and ($node->{'L'}->{$seq->{'previous'}->{'n'}} = $seq->{'previous'});
305 $seq->{'previous'} = $node;
308 # set the 'next node in ring' field for each node in the ring
309 if($ring->{'first'} < $ring->{'last'}) {
310 for($i = $ring->{'first'}; $i < $ring->{'last'}; $i++) {
311 push @{$nodes->[$i]->{'A'}}, $i + 1;
313 push @{$nodes->[$ring->{'last'}]->{'A'}}, $ring->{'first'};
317 # print header information
318 $self->_print(
319 'VERSION=', ($aln->source and ($aln->source !~ /\A\s*\Z/)) ? $aln->source : 'bioperl', "\n",
320 'NAME=', $aln->id, "\n",
321 'TITLE=', ($seqs->[0]->{'seq'}->description or $aln->id), "\n",
322 'LENGTH=', scalar @{$nodes}, "\n",
323 'SOURCECOUNT=', scalar @{$seqs}, "\n",
326 # print sequence information
327 foreach $seq (@{$seqs}) {
328 $self->_print(
329 'SOURCENAME=', $seq->{'seq'}->display_id, "\n",
330 'SOURCEINFO=',
331 $seq->{'n_nodes'}, ' ', # number of nodes in the sequence
332 $seq->{'first'}->{'n'}, ' ', # index of first node containing the sequence
333 0, ' ', # FIXME - sequence weight?
334 -1, ' ', # FIXME - index of bundle containing sequence?
335 ($seq->{'seq'}->description or 'untitled'), "\n",
339 # print node information
340 foreach $node (@{$nodes}) {
341 $self->_print($node->{'aa'}, ':');
342 (keys %{$node->{'L'}} > 0) and $self->_print('L', join('L', sort {$a <=> $b} keys %{$node->{'L'}}));
343 (@{$node->{'S'}} > 0) and $self->_print('S', join('S', @{$node->{'S'}}));
344 (@{$node->{'A'}} > 0) and $self->_print('A', join('A', @{$node->{'A'}}));
345 $self->_print("\n");
348 $self->flush if $self->_flush_on_write && defined $self->_fh;
350 return 1;