sync w/ main trunk
[bioperl-live.git] / Bio / AlignIO / po.pm
blobdd779daa6a40809d6c1b2c8cd59a321e3a660739
1 # $Id: po.pm
3 # BioPerl module for Bio::AlignIO::po
5 # based on the Bio::AlignIO::fasta module
6 # by Peter Schattner (and others?)
8 # and the SimpleAlign.pm module of Ewan Birney
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::AlignIO::po - po MSA Sequence input/output stream
18 =head1 SYNOPSIS
20 Do not use this module directly. Use it via the L<Bio::AlignIO> class.
22 =head1 DESCRIPTION
24 This object can transform L<Bio::SimpleAlign> objects to and from
25 'po' format flat file databases. 'po' format is the native format of
26 the POA alignment program (Lee C, Grasso C, Sharlow MF, 'Multiple
27 sequence alignment using partial order graphs', Bioinformatics (2002),
28 18(3):452-64).
30 =head1 FEEDBACK
32 =head2 Support
34 Please direct usage questions or support issues to the mailing list:
36 L<bioperl-l@bioperl.org>
38 rather than to the module maintainer directly. Many experienced and
39 reponsive experts will be able look at the problem and quickly
40 address it. Please include a thorough description of the problem
41 with code and data examples if at all possible.
43 =head2 Reporting Bugs
45 Report bugs to the Bioperl bug tracking system to help us keep track
46 the bugs and their resolution. Bug reports can be submitted via the
47 web:
49 http://bugzilla.open-bio.org/
51 =head1 AUTHORS - Matthew Betts
53 Email: matthew.betts@ii.uib.no
55 =head1 APPENDIX
57 The rest of the documentation details each of the object
58 methods. Internal methods are usually preceded with a _
60 =cut
62 # Let the code begin...
64 package Bio::AlignIO::po;
65 use strict;
67 use Bio::SimpleAlign;
69 use base qw(Bio::AlignIO);
72 =head2 next_aln
74 Title : next_aln
75 Usage : $aln = $stream->next_aln()
76 Function: returns the next alignment in the stream.
77 Returns : L<Bio::Align::AlignI> object - returns undef on end of file
78 or on error
79 Args : NONE
81 =cut
83 sub next_aln {
84 my $self = shift;
86 my $aln;
87 my $entry;
88 my $name;
89 my $seqs;
90 my $seq;
91 my $nodes;
92 my $list;
93 my $node;
94 my @chars;
95 my $s;
96 my $a;
98 $aln = Bio::SimpleAlign->new();
100 # get to the first 'VERSION' line
101 while(defined($entry = $self->_readline)) {
102 if($entry =~ /^VERSION=(\S+)/) {
103 $aln->source($1);
105 if(defined($entry = $self->_readline) and $entry =~ /^NAME=(\S+)/) {
106 $aln->id($1);
109 last;
113 # read in the sequence names and node data, up to the end of
114 # the file or the next 'VERSION' line, whichever comes first
115 $seqs = [];
116 $nodes = [];
117 while(defined($entry = $self->_readline)) {
118 if($entry =~ /^VERSION/) {
119 # start of a new alignment, so...
120 $self->_pushback($entry);
121 last;
123 elsif($entry =~ /^SOURCENAME=(\S+)/) {
124 $name = $1;
126 if($name =~ /(\S+)\/(\d+)-(\d+)/) {
127 $seq = Bio::LocatableSeq->new(
128 '-display_id' => $1,
129 '-start' => $2,
130 '-end' => $3,
133 } else {
134 $seq = Bio::LocatableSeq->new('-display_id' => $name);
137 # store sequences in a list initially, because can't guarantee
138 # that will get them back from SimpleAlign object in the order
139 # they were read, and also can't add them to the SimpleAlign
140 # object here because their sequences are currently unknown
141 push @{$seqs}, {
142 'seq' => $seq,
143 'str' => '',
146 elsif($entry =~ /^SOURCEINFO=(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(.*)/) {
147 $seq->desc($5);
149 elsif($entry =~ /^(\S):(\S+)/) {
150 $node = {
151 'aa' => $1,
152 'L' => [],
153 'S' => [],
154 'A' => [],
155 'status' => 'unvisited',
158 $list = $2;
159 if($list =~ /^([L\d]*)([S\d]*)([A\d]*)/) {
160 push(@{$node->{'L'}}, split(/L/, $1));
161 push(@{$node->{'S'}}, split(/S/, $2));
162 push(@{$node->{'A'}}, split(/A/, $3));
164 (@{$node->{'L'}} > 0) and shift @{$node->{'L'}};
165 (@{$node->{'S'}} > 0) and shift @{$node->{'S'}};
166 (@{$node->{'A'}} > 0) and shift @{$node->{'A'}};
169 push @{$nodes}, $node;
173 # process the nodes
174 foreach $node (@{$nodes}) {
175 ($node->{'status'} ne 'unvisited') and next;
177 @chars = ($aln->gap_char) x @{$seqs}; # char for each seq defaults to a gap
179 # set the character for each sequence represented by this node
180 foreach $s (@{$node->{'S'}}) {
181 $chars[$s] = $node->{'aa'};
183 $node->{'status'} = 'visited';
185 # do the same for each node in the same align ring
186 while(defined($a = $node->{'A'}->[0])) {
187 $node = $nodes->[$a];
188 ($node->{'status'} ne 'unvisited') and last;
190 foreach $s (@{$node->{'S'}}) {
191 $chars[$s] = $node->{'aa'};
194 $node->{'status'} = 'visited';
197 # update the sequences
198 foreach $seq (@{$seqs}) {
199 $seq->{'str'} .= shift @chars;
203 # set the sequences of the bioperl objects
204 # and add them to the alignment
205 foreach $seq (@{$seqs}) {
206 $seq->{'seq'}->seq($seq->{'str'});
207 $aln->add_seq($seq->{'seq'});
210 # has an alignment been read?...
211 ($aln->no_sequences == 0) and ($aln = undef);
213 return $aln;
216 =head2 write_aln
218 Title : write_aln
219 Usage : $stream->write_aln(@aln)
220 Function: writes the $aln object into the stream in po format
221 Returns : 1 for success and 0 for error
222 Args : L<Bio::Align::AlignI> object
224 =cut
226 sub write_aln {
227 my $self = shift;
228 my @alns = @_;
230 my $aln;
231 my $seqs;
232 my $nodes;
233 my $seq;
234 my $node;
235 my $col;
236 my $ring;
237 my $i;
238 my $char;
240 foreach $aln (@alns) {
241 if(!$aln or !$aln->isa('Bio::Align::AlignI')) {
242 $self->warn("Must provide a Bio::Align::AlignI object when calling write_aln");
243 next;
246 # store the seqs on a list, because po format
247 # refers to them by position on this list
248 $seqs = [];
249 foreach $seq ($aln->each_seq()) {
250 push @{$seqs}, {
251 'seq' => $seq,
252 'n_nodes' => 0,
253 'first' => undef,
254 'previous' => undef,
258 # go through each column in the alignment and construct
259 # the nodes for the equivalent poa alignment ring
260 $nodes = [];
261 for($col = 0; $col < $aln->length; $col++) {
262 $ring = {
263 'nodes' => {},
264 'first' => scalar @{$nodes},
265 'last' => scalar @{$nodes},
268 for($i = 0; $i < @{$seqs}; $i++) {
269 $seq = $seqs->[$i];
271 $char = $seq->{'seq'}->subseq($col + 1, $col + 1);
273 ($char eq $aln->gap_char) and next;
275 if(!defined($node = $ring->{'nodes'}->{$char})) {
276 $node = {
277 'n' => scalar @{$nodes},
278 'aa' => $char,
279 'L' => {},
280 'S' => [],
281 'A' => [],
284 # update the ring
285 $ring->{'nodes'}->{$char} = $node;
286 $ring->{'last'} = $node->{'n'};
288 # add the node to the node list
289 push @{$nodes}, $node;
292 # add the sequence to the node
293 push @{$node->{'S'}}, $i;
295 # add the node to the sequence
296 defined($seq->{'first'}) or ($seq->{'first'} = $node);
297 $seq->{'n_nodes'}++;
299 # add an edge from the previous node in the sequence to this one.
300 # Then set the previous node to the current one, ready for the next
301 # residue in this sequence
302 defined($seq->{'previous'}) and ($node->{'L'}->{$seq->{'previous'}->{'n'}} = $seq->{'previous'});
303 $seq->{'previous'} = $node;
306 # set the 'next node in ring' field for each node in the ring
307 if($ring->{'first'} < $ring->{'last'}) {
308 for($i = $ring->{'first'}; $i < $ring->{'last'}; $i++) {
309 push @{$nodes->[$i]->{'A'}}, $i + 1;
311 push @{$nodes->[$ring->{'last'}]->{'A'}}, $ring->{'first'};
315 # print header information
316 $self->_print(
317 'VERSION=', ($aln->source and ($aln->source !~ /\A\s*\Z/)) ? $aln->source : 'bioperl', "\n",
318 'NAME=', $aln->id, "\n",
319 'TITLE=', ($seqs->[0]->{'seq'}->description or $aln->id), "\n",
320 'LENGTH=', scalar @{$nodes}, "\n",
321 'SOURCECOUNT=', scalar @{$seqs}, "\n",
324 # print sequence information
325 foreach $seq (@{$seqs}) {
326 $self->_print(
327 'SOURCENAME=', $seq->{'seq'}->display_id, "\n",
328 'SOURCEINFO=',
329 $seq->{'n_nodes'}, ' ', # number of nodes in the sequence
330 $seq->{'first'}->{'n'}, ' ', # index of first node containing the sequence
331 0, ' ', # FIXME - sequence weight?
332 -1, ' ', # FIXME - index of bundle containing sequence?
333 ($seq->{'seq'}->description or 'untitled'), "\n",
337 # print node information
338 foreach $node (@{$nodes}) {
339 $self->_print($node->{'aa'}, ':');
340 (keys %{$node->{'L'}} > 0) and $self->_print('L', join('L', sort {$a <=> $b} keys %{$node->{'L'}}));
341 (@{$node->{'S'}} > 0) and $self->_print('S', join('S', @{$node->{'S'}}));
342 (@{$node->{'A'}} > 0) and $self->_print('A', join('A', @{$node->{'A'}}));
343 $self->_print("\n");
346 $self->flush if $self->_flush_on_write && defined $self->_fh;
348 return 1;