Merge pull request #181 from bioperl/limit-dockerhub-trigger
[bioperl-live.git] / Bio / AlignIO / mega.pm
blobaed01f95a406c3c31339b94564f1a1acbc335a69
2 # BioPerl module for Bio::AlignIO::mega
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason-at-bioperl.org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::AlignIO::mega - Parse and Create MEGA format data files
18 =head1 SYNOPSIS
20 use Bio::AlignIO;
21 my $alignio = Bio::AlignIO->new(-format => 'mega',
22 -file => 't/data/hemoglobinA.meg');
24 while( my $aln = $alignio->next_aln ) {
25 # process each alignment or convert to another format like NEXUS
26 }
28 =head1 DESCRIPTION
30 This object handles reading and writing data streams in the MEGA
31 format (Kumar and Nei).
34 =head1 FEEDBACK
36 =head2 Mailing Lists
38 User feedback is an integral part of the evolution of this and other
39 Bioperl modules. Send your comments and suggestions preferably to
40 the Bioperl mailing list. Your participation is much appreciated.
42 bioperl-l@bioperl.org - General discussion
43 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
45 =head2 Support
47 Please direct usage questions or support issues to the mailing list:
49 I<bioperl-l@bioperl.org>
51 rather than to the module maintainer directly. Many experienced and
52 responsive experts will be able to look at the problem and quickly
53 address it. Please include a thorough description of the problem
54 with code and data examples if at all possible.
56 =head2 Reporting Bugs
58 Report bugs to the Bioperl bug tracking system to help us keep track
59 of the bugs and their resolution. Bug reports can be submitted via the
60 web:
62 https://github.com/bioperl/bioperl-live/issues
64 =head1 AUTHOR - Jason Stajich
66 Email jason-at-bioperl.org
68 =head1 APPENDIX
70 The rest of the documentation details each of the object methods.
71 Internal methods are usually preceded with a _
73 =cut
76 # Let the code begin...
79 package Bio::AlignIO::mega;
80 use vars qw($MEGANAMELEN %VALID_TYPES $LINELEN $BLOCKLEN);
81 use strict;
83 use Bio::SimpleAlign;
84 use Bio::LocatableSeq;
86 # symbols are changed due to MEGA's use of '.' for redundant sequences
BEGIN {
    # Layout constants shared by the reader and the writer below:
    #   $MEGANAMELEN - width of the leading '#name' column on a sequence line
    #   $LINELEN     - residues written per output line
    #   $BLOCKLEN    - residues per space-separated block within a line
    ( $MEGANAMELEN, $LINELEN, $BLOCKLEN ) = ( 10, 60, 10 );

    # Lookup table of datatype names (not referenced by the visible code).
    %VALID_TYPES = map { $_ => 1 } qw(dna rna protein standard);
}
94 use base qw(Bio::AlignIO);
97 =head2 next_aln
99 Title : next_aln
100 Usage : $aln = $stream->next_aln()
101 Function: returns the next alignment in the stream.
102 Supports the following MEGA format features:
103 - The file has to start with '#mega'
104 - Reads in the name of the alignment from a comment
105 (anything after '!TITLE: ') .
106 - Reads in the format parameters datatype
108 Returns : L<Bio::Align::AlignI> object, or undef when no sequences
109 could be read; throws if the data does not start with '#mega'
110 Args : NONE
113 =cut
# Read the next alignment from a MEGA-format stream.
#
# Leading blank lines are skipped and the first non-blank line must start
# with '#mega'.  Header lines are then scanned for '!Title:' (alignment id)
# and '!Format' (datatype / identical / indel settings) until the first line
# starting with '#'.  From there to the end of the stream every non-blank
# line is split into a name (columns 2..$MEGANAMELEN, trailing whitespace
# removed) and sequence data (the rest of the line, whitespace removed);
# data for a repeated name is concatenated, so interleaved blocks are joined.
#
# Returns : the populated Bio::SimpleAlign object, or nothing (undef in
#           scalar context) when the stream is exhausted or no sequences
#           were read
# Throws  : if the first non-blank line does not start with '#mega'
# Args    : none
sub next_aln {
    my ($self) = @_;
    my $entry;
    my ( $alphabet, %seqs );

    # Symbols are changed due to MEGA's use of '.' for redundant sequences;
    # the overrides only last for the duration of this call.
    local $Bio::LocatableSeq::OTHER_SYMBOLS = '\*\?\.';
    local $Bio::LocatableSeq::GAP_SYMBOLS   = '\-';
    my $aln = Bio::SimpleAlign->new( -source => 'mega' );

    # Skip blank lines in front of the header.
    while ( defined( $entry = $self->_readline() ) && ( $entry =~ /^\s+$/ ) ) { }

    # Nothing left to read: signal "no more alignments" rather than matching
    # against undef and raising a misleading "not a valid MEGA file" error.
    # This lets `while ( my $aln = $in->next_aln ) { ... }` terminate cleanly.
    return unless defined $entry;

    $self->throw("Not a valid MEGA file! [#mega] not starting the file!")
        unless $entry =~ /^#mega/i;

    # Header section: runs until the first line that begins with '#'.
    while ( defined( $entry = $self->_readline() ) ) {
        if ( $entry =~ /\!Title:\s*([^\;]+)\s*/i ) {
            $aln->id($1);
        } elsif ( $entry =~ /\!Format\s+([^\;]+)\s*/ ) {
            # Everything up to the ';' is a whitespace-separated list of
            # name=value settings.
            foreach my $f ( split( /\s+/, $1 ) ) {
                my ( $name, $value ) = split( /\=/, $f );
                if ( $name eq 'datatype' ) {
                    $alphabet = $value;
                } elsif ( $name eq 'identical' ) {
                    $aln->match_char($value);
                } elsif ( $name eq 'indel' ) {
                    $aln->gap_char($value);
                }
            }
        } elsif ( $entry =~ /^\#/ ) {
            # First sequence line; leave it in $entry for the loop below.
            last;
        }
    }

    # Sequence section: collect data per name, remembering first-seen order.
    my @order;
    while ( defined($entry) ) {
        if ( $entry !~ /^\s+$/ ) {
            # this is to skip the leading '#'
            my $seqname = substr( $entry, 1, $MEGANAMELEN - 1 );
            $seqname =~ s/(\S+)\s+$/$1/g;
            my $line = substr( $entry, $MEGANAMELEN );
            $line =~ s/\s+//g;
            if ( !defined $seqs{$seqname} ) { push @order, $seqname; }
            $seqs{$seqname} .= $line;
        }
        $entry = $self->_readline();
    }

    foreach my $seqname (@order) {
        # The end coordinate is the sequence length with gap symbols removed.
        my $s = $seqs{$seqname};
        $s =~ s/[$Bio::LocatableSeq::GAP_SYMBOLS]+//g;
        my $end = length($s);
        my $seq = Bio::LocatableSeq->new(
            '-alphabet'   => $alphabet,
            '-display_id' => $seqname,
            '-seq'        => $seqs{$seqname},
            '-start'      => 1,
            '-end'        => $end
        );

        $aln->add_seq($seq);
    }
    $aln->unmatch;
    return $aln if $aln->num_sequences;
    return;
}
178 =head2 write_aln
180 Title : write_aln
181 Usage : $stream->write_aln(@aln)
182 Function: writes the $aln object into the stream in MEGA format
183 Returns : 1 for success and 0 for error
184 Args : L<Bio::Align::AlignI> object
186 =cut
# Write one or more alignments to the stream in MEGA format.
#
# For each alignment a '#mega' header with '!Title' and '!Format' lines is
# printed, followed by interleaved sections of $LINELEN residues per
# sequence, split into space-separated blocks of $BLOCKLEN.  Each sequence
# line starts with '#' and the display name, truncated to $MEGANAMELEN-2
# characters and left-justified in a field of $MEGANAMELEN-1.
#
# match() and set_displayname_flat() are called on each alignment passed in.
# NOTE(review): these presumably modify the caller's object - confirm
# against Bio::SimpleAlign.
#
# Returns : 1 on success; 0 (after a warning) as soon as an argument is not
#           a Bio::Align::AlignI or its sequences are not all the same
#           length.  Alignments earlier in the list have already been
#           written at that point.
# Args    : list of Bio::Align::AlignI objects
sub write_aln {
    my ( $self, @aln ) = @_;

    foreach my $aln (@aln) {
        if ( !$aln || !$aln->isa('Bio::Align::AlignI') ) {
            $self->warn("Must provide a Bio::Align::AlignI object when calling write_aln");
            return 0;
        } elsif ( !$aln->is_flush( $self->verbose ) ) {
            $self->warn("All Sequences in the alignment must be the same length");
            return 0;
        }

        $aln->match();
        my $format = sprintf(
            'datatype=%s identical=%s indel=%s;',
            $aln->get_seq_by_pos(1)->alphabet(),
            $aln->match_char, $aln->gap_char
        );

        $self->_print(
            sprintf( "#mega\n!Title: %s;\n!Format %s\n\n\n", $aln->id, $format ) );

        # $count is the column offset of the section currently being written.
        my ( $count, $blockcount, $length ) = ( 0, 0, $aln->length() );
        $aln->set_displayname_flat();
        while ( $count < $length ) {
            foreach my $seq ( $aln->each_seq ) {
                my $seqchars = $seq->seq();
                $blockcount = 0;
                my $substring = substr( $seqchars, $count, $LINELEN );

                # Chop this sequence's slice into $BLOCKLEN-sized pieces.
                my @blocks;
                while ( $blockcount < length($substring) ) {
                    push @blocks, substr( $substring, $blockcount, $BLOCKLEN );
                    $blockcount += $BLOCKLEN;
                }
                $self->_print(
                    sprintf(
                        "#%-" . ( $MEGANAMELEN - 1 ) . "s%s\n",
                        substr(
                            $aln->displayname( $seq->get_nse() ),
                            0, $MEGANAMELEN - 2
                        ),
                        join( ' ', @blocks )
                    )
                );
            }

            # Blank line between interleaved sections.
            $self->_print("\n");
            $count += $LINELEN;
        }
    }
    $self->flush if $self->_flush_on_write && defined $self->_fh;
    return 1;
}