maint: restructure to use Dist::Zilla
[bioperl-live.git] / lib / Bio / Matrix / PSM / IO / masta.pm
blob9f418f71163be8acf0148c66b43a075c9dd3f6de
1 #---------------------------------------------------------
3 =head1 NAME
5 Bio::Matrix::PSM::IO::masta - motif fasta format parser
7 =head1 SYNOPSIS
9 MASTA is a position frequency matrix format similar to
10 fasta. It contains one ID row just like fasta and then the actual
11 data, which is tab delimited:
13 0.1 0.62 .017 0.11
14 0.22 0.13 0.54 0.11
16 Or A,C,G and T could be horizontally positioned (positioning is
17 automatically detected). Please note masta will parse only DNA at the
18 moment.
20 It will also convert a set of aligned sequences:
21 ACATGCAT
22 ACAGGGAT
23 ACAGGCAT
24 ACCGGCAT
26 to a PFM (SiteMatrix object). When writing if you supply SEQ it will
27 write 10 random instances, which represent correctly the frequency and
28 can be used as an input for weblogo creation purposes.
30 See Bio::Matrix::PSM::IO for detailed documentation on how to use masta parser
32 =head1 DESCRIPTION
34 Parser for meme.
36 =head1 FEEDBACK
38 =head2 Mailing Lists
40 User feedback is an integral part of the evolution of this
41 and other Bioperl modules. Send your comments and suggestions preferably
42 to one of the Bioperl mailing lists.
43 Your participation is much appreciated.
45 bioperl-l@bioperl.org - General discussion
46 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
48 =head2 Support
50 Please direct usage questions or support issues to the mailing list:
52 I<bioperl-l@bioperl.org>
54 rather than to the module maintainer directly. Many experienced and
55 reponsive experts will be able look at the problem and quickly
56 address it. Please include a thorough description of the problem
57 with code and data examples if at all possible.
59 =head2 Reporting Bugs
61 Report bugs to the Bioperl bug tracking system to help us keep track
62 the bugs and their resolution. Bug reports can be submitted via the
63 web:
65 https://github.com/bioperl/bioperl-live/issues
67 =head1 AUTHOR - Stefan Kirov
69 Email skirov@utk.edu
71 =head1 APPENDIX
73 =cut
76 # Let the code begin...
77 package Bio::Matrix::PSM::IO::masta;
78 use Bio::Matrix::PSM::SiteMatrix;
79 use vars qw(@HEADER);
80 use strict;
82 use base qw(Bio::Matrix::PSM::IO Bio::Root::Root);
86 =head2 new
88 Title : new
89 Usage : my $psmIO = Bio::Matrix::PSM::IO->new(-format=> 'masta',
90 -file => $file,
91 -mtype => 'PWM');
92 Function: Associates a file with the appropriate parser
93 Throws :
94 Example :
95 Args : hash
96 Returns : "Bio::Matrix::PSM::$format"->new(@args);
98 =cut
100 sub new {
101 my($class, @args)=@_;
102 my $self = $class->SUPER::new(@args);
103 my ($file)=$self->_rearrange(['FILE'], @args);
104 my ($query,$tr1)=split(/\./,$file,2);
105 $self->{file} = $file;
106 $self->{_end} = 0;
107 $self->{mtype} = uc($self->_rearrange(['MTYPE'], @args) || "PFM");
108 $self->_initialize_io(@args) || $self->warn("Did you intend to use STDIN?"); #Read only for now
109 return $self;
112 =head2 write_psm
114 Title : write_psm
115 Usage :
116 Function: writes a pfm/pwm/raw sequence in a simple masta format
117 Throws :
118 Example :
119 Args : SiteMatrix object, type (optional string: PWM, SEQ or PFM)
120 Returns :
122 =cut
124 sub write_psm {
125 my ($self,$matrix,$type)=@_;
126 $self->{mtype} = uc($type) if ($type);
127 my $idline=">". $matrix->id . "\n";
128 $self->_print($idline);
129 unless ($self->{mtype} eq 'SEQ') {
130 while (my %h=$matrix->next_pos) {
131 my $row=$self->{mtype} eq 'PWM' ? join("\t",$h{lA},$h{lC},$h{lG},$h{lT},"\n"):join("\t",$h{pA},$h{pC},$h{pG},$h{pT},"\n");
132 $self->_print ($row);
134 } else {
135 my @seq;
136 while (my %h=$matrix->next_pos) {
137 my ($a,$c,$g,$t)=_freq_to_count(\%h);
138 $self->throw("Could not convert from frequency to count\n") if (($a+$c+$g+$t) !=10);
139 for my $i (0..$a-1) {$seq[$i].='A';}
140 my $m=$a+$c;
141 for my $i ($a..$m-1) {$seq[$i].='C';}
142 my $n=$a+$c+$g;
143 for my $i ($m..$n-1) {$seq[$i].='G';}
144 for my $i ($n..9) {$seq[$i].='T';}
146 foreach my $s (@seq) {
147 $s.="\n";
148 $self->_print ($s);
153 =head2 next_matrix
155 Title : next_matrix
156 Usage : my $matrix = $psmio->next_matrix;
157 Function: Alias of next_psm function
159 =cut
161 sub next_matrix {
162 shift->next_psm(@_);
165 =head2 next_psm
167 Title : next_psm
168 Usage : my $matrix=$psmio->next_psm;
169 Function: returns the next matrix in the stream
170 Throws : If there is you mix different types, for example weights and
171 frequencies occur in the same entry You can mix weights, but these
172 should be designated by different ID lines
173 Example :
174 Args :
175 Returns : Bio::Matrix::PSM::SiteMatrix
177 =cut
179 sub next_psm {
180 my $self=shift;
181 return if ($self->{_end});
182 my $line=$self->_readline;
183 $self->throw("No ID line- wrong format\n") unless ($line=~/^>/);
184 my ($id,$desc)=split(/[\t\s]+/,$line,2);
185 $id=~s/>//;
186 my ($mtype,$format,@mdata,$len);
187 $self->{_mtype} = 0;
188 while ($line=$self->_readline) {
189 next if $line =~ /^\s+$/;# There should not be empty lines, but just in case...
190 chomp $line;
191 if ($line =~ /^>/) {
192 $self->_pushback($line);
193 last;
196 if ($line !~ /[^ACGTacgt]/g) {
197 # This is a set of aligned sequences
198 $self->throw("Mixing between types is not allowed or a parsing error occurred\n")
199 if (($self->{_mtype} != 3) && ($mtype)) ;
200 $self->throw("Bad sequence- different length: $line\n")
201 if (($len) && ($len!=length($line)));
202 $len=length($line) unless ($len);
203 push @mdata,$line;
204 $self->{_mtype}=3;
205 } else {
206 # do not strip 'e's since they are part of number notation for small/big numbers
207 $line=~s/[a-df-zA-DF-Z]//g; #Well we may wanna do a hash and auto check for letter order if there is a really boring talk...
208 $line=~s/^[\s\t]+//;
209 $line=~s/[\s\t]+/\t/g;
210 my @data=split(/[\s\t]+/,$line);
211 if ($#data==3) {
212 $self->throw("Mixing between types is not allowed or a parsing error occurred\n") if (($mtype)&&($self->{_mtype} !=1)) ;
213 $self->{_mtype}=1;
214 $mtype=1;
216 else {
217 $self->throw("Mixing between types is not allowedor a parsing error occurred\n") if (($mtype)&&($self->{_mtype} !=2)) ;
218 $self->{_mtype}=2;
219 $mtype=1;
221 push @mdata,\@data;
224 $self->{_end} = 1 if (!defined $line || $line !~ /^>/);
225 return _make_matrix(\@mdata,$self->{_mtype},$id,$desc);
228 sub _make_matrix {
229 my ($mdata,$type,$id,$desc)=@_;
230 if ($type==1) {
231 my @rearr=_rearrange_matrix($mdata);
232 $mdata=\@rearr;
234 #Auto recognition for what type is this entry (PFM, PWM or simple count)
235 #A bit dangerous, I hate too much auto stuff, but I want to be able to mix different
236 #types in a single file
237 my $mformat='count';
238 my ($a,$c,$g,$t);
239 if ($type == 3 ) {
240 ($a,$c,$g,$t)= &_count_positions($mdata);
241 } else {
242 ($a,$c,$g,$t)=@{$mdata};
243 my $k=$a->[0]+$c->[0]+$g->[0]+$t->[0];
244 my $l= ($a->[0]+$c->[0]+$g->[0]+$t->[0]) -
245 (abs($a->[0])+abs($c->[0])+abs($g->[0])+abs($t->[0]));
246 $mformat='freq' if (($k==1) && ($l==0));
247 $mformat='pwm' if ($l!=0);
249 my (@fa,@fc,@fg,@ft,%mparam);
251 if ($mformat eq 'pwm') {
252 foreach my $i (0..$#{$a}) {
253 my $ca=exp $a->[$i];
254 my $cc=exp $c->[$i];
255 my $cg=exp $g->[$i];
256 my $ct=exp $t->[$i];
257 my $all=$ca+$cc+$cg+$ct;
258 push @fa,($ca/$all)*100;
259 push @fc,($cc/$all)*100;
260 push @fg,($cg/$all)*100;
261 push @ft,($ct/$all)*100;
264 $desc.=", source is $mformat";
265 if ($mformat eq 'pwm') {
266 $desc=~s/^pwm//;
267 %mparam=(-pA=>\@fa,-pC=>\@fc,-pG=>\@fg,-pT=>\@ft,-id=>$id,-desc=>$desc,
268 -lA=>$a,-lC=>$c,-lG=>$g,-lT=>$t);
270 else {
271 %mparam=(-pA=>$a,-pC=>$c,-pG=>$g,-pT=>$t,-id=>$id,-desc=>$desc);
273 return new Bio::Matrix::PSM::SiteMatrix(%mparam);
276 sub _rearrange_matrix {
277 my $mdata=shift;
278 my (@a,@c,@g,@t);
279 foreach my $entry (@{$mdata}) {
280 my ($a,$c,$g,$t)=@$entry;
281 push @a,$a;
282 push @c,$c;
283 push @g,$g;
284 push @t,$t;
286 return \@a,\@c,\@g,\@t;
290 sub _count_positions {
291 my $seq=shift;
292 my %pos;
293 my $l=length($seq->[0])-1;
294 for( my $i = 0; $i <= $l; $i++ ) {
295 for ( qw(A C G T) ) {
296 $pos{$_}->[$i] = 0;
299 foreach my $sequence (@{$seq}) {
300 my @let= split(//,$sequence);
301 for my $i (0..$#let) {
302 $pos{uc($let[$i])}->[$i]++;
305 return $pos{A},$pos{C},$pos{G},$pos{T};
309 sub _freq_to_count {
310 my $h=shift;
311 my $a=int(10*$h->{pA}+0.5);
312 my $c=int(10*$h->{pC}+0.5);
313 my $g=int(10*$h->{pG}+0.5);
314 my $t=int(10*$h->{pT}+0.5);
315 return ($a,$c,$g,$t);