1 #---------------------------------------------------------
5 Bio::Matrix::PSM::IO::masta - motif fasta format parser
9 MASTA is a position frequency matrix format similar to
10 fasta. It contains one ID row just like fasta and then the actual
11 data, which is tab delimited:
16 Or A,C,G and T could be horizontally positioned (positioning is
17 automatically detected). Please note masta will parse only DNA at the
20 It will also convert a set of aligned sequences:
26 to a PFM (SiteMatrix object). When writing if you supply SEQ it will
27 write 10 random instances, which represent correctly the frequency and
28 can be used as an input for weblogo creation purposes.
30 See Bio::Matrix::PSM::IO for detailed documentation on how to use masta parser
40 User feedback is an integral part of the evolution of this
41 and other Bioperl modules. Send your comments and suggestions preferably
42 to one of the Bioperl mailing lists.
43 Your participation is much appreciated.
45 bioperl-l@bioperl.org - General discussion
46 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
50 Please direct usage questions or support issues to the mailing list:
52 I<bioperl-l@bioperl.org>
54 rather than to the module maintainer directly. Many experienced and
55 reponsive experts will be able look at the problem and quickly
56 address it. Please include a thorough description of the problem
57 with code and data examples if at all possible.
61 Report bugs to the Bioperl bug tracking system to help us keep track
62 the bugs and their resolution. Bug reports can be submitted via the
65 https://github.com/bioperl/bioperl-live/issues
67 =head1 AUTHOR - Stefan Kirov
76 # Let the code begin...
77 package Bio
::Matrix
::PSM
::IO
::masta
;
78 use Bio
::Matrix
::PSM
::SiteMatrix
;
82 use base qw(Bio::Matrix::PSM::IO Bio::Root::Root);
89 Usage : my $psmIO = Bio::Matrix::PSM::IO->new(-format=> 'masta',
92 Function: Associates a file with the appropriate parser
96 Returns : "Bio::Matrix::PSM::$format"->new(@args);
101 my($class, @args)=@_;
102 my $self = $class->SUPER::new
(@args);
103 my ($file)=$self->_rearrange(['FILE'], @args);
104 my ($query,$tr1)=split(/\./,$file,2);
105 $self->{file
} = $file;
107 $self->{mtype
} = uc($self->_rearrange(['MTYPE'], @args) || "PFM");
108 $self->_initialize_io(@args) || $self->warn("Did you intend to use STDIN?"); #Read only for now
116 Function: writes a pfm/pwm/raw sequence in a simple masta format
119 Args : SiteMatrix object, type (optional string: PWM, SEQ or PFM)
125 my ($self,$matrix,$type)=@_;
126 $self->{mtype
} = uc($type) if ($type);
127 my $idline=">". $matrix->id . "\n";
128 $self->_print($idline);
129 unless ($self->{mtype
} eq 'SEQ') {
130 while (my %h=$matrix->next_pos) {
131 my $row=$self->{mtype
} eq 'PWM' ?
join("\t",$h{lA
},$h{lC
},$h{lG
},$h{lT
},"\n"):join("\t",$h{pA
},$h{pC
},$h{pG
},$h{pT
},"\n");
132 $self->_print ($row);
136 while (my %h=$matrix->next_pos) {
137 my ($a,$c,$g,$t)=_freq_to_count
(\
%h);
138 $self->throw("Could not convert from frequency to count\n") if (($a+$c+$g+$t) !=10);
139 for my $i (0..$a-1) {$seq[$i].='A';}
141 for my $i ($a..$m-1) {$seq[$i].='C';}
143 for my $i ($m..$n-1) {$seq[$i].='G';}
144 for my $i ($n..9) {$seq[$i].='T';}
146 foreach my $s (@seq) {
156 Usage : my $matrix = $psmio->next_matrix;
157 Function: Alias of next_psm function
168 Usage : my $matrix=$psmio->next_psm;
169 Function: returns the next matrix in the stream
170 Throws : If there is you mix different types, for example weights and
171 frequencies occur in the same entry You can mix weights, but these
172 should be designated by different ID lines
175 Returns : Bio::Matrix::PSM::SiteMatrix
181 return if ($self->{_end
});
182 my $line=$self->_readline;
183 $self->throw("No ID line- wrong format\n") unless ($line=~/^>/);
184 my ($id,$desc)=split(/[\t\s]+/,$line,2);
186 my ($mtype,$format,@mdata,$len);
188 while ($line=$self->_readline) {
189 next if $line =~ /^\s+$/;# There should not be empty lines, but just in case...
192 $self->_pushback($line);
196 if ($line !~ /[^ACGTacgt]/g) {
197 # This is a set of aligned sequences
198 $self->throw("Mixing between types is not allowed or a parsing error occurred\n")
199 if (($self->{_mtype
} != 3) && ($mtype)) ;
200 $self->throw("Bad sequence- different length: $line\n")
201 if (($len) && ($len!=length($line)));
202 $len=length($line) unless ($len);
206 # do not strip 'e's since they are part of number notation for small/big numbers
207 $line=~s/[a-df-zA-DF-Z]//g; #Well we may wanna do a hash and auto check for letter order if there is a really boring talk...
209 $line=~s/[\s\t]+/\t/g;
210 my @data=split(/[\s\t]+/,$line);
212 $self->throw("Mixing between types is not allowed or a parsing error occurred\n") if (($mtype)&&($self->{_mtype
} !=1)) ;
217 $self->throw("Mixing between types is not allowedor a parsing error occurred\n") if (($mtype)&&($self->{_mtype
} !=2)) ;
224 $self->{_end
} = 1 if (!defined $line || $line !~ /^>/);
225 return _make_matrix
(\
@mdata,$self->{_mtype
},$id,$desc);
229 my ($mdata,$type,$id,$desc)=@_;
231 my @rearr=_rearrange_matrix
($mdata);
234 #Auto recognition for what type is this entry (PFM, PWM or simple count)
235 #A bit dangerous, I hate too much auto stuff, but I want to be able to mix different
236 #types in a single file
240 ($a,$c,$g,$t)= &_count_positions
($mdata);
242 ($a,$c,$g,$t)=@
{$mdata};
243 my $k=$a->[0]+$c->[0]+$g->[0]+$t->[0];
244 my $l= ($a->[0]+$c->[0]+$g->[0]+$t->[0]) -
245 (abs($a->[0])+abs($c->[0])+abs($g->[0])+abs($t->[0]));
246 $mformat='freq' if (($k==1) && ($l==0));
247 $mformat='pwm' if ($l!=0);
249 my (@fa,@fc,@fg,@ft,%mparam);
251 if ($mformat eq 'pwm') {
252 foreach my $i (0..$#{$a}) {
257 my $all=$ca+$cc+$cg+$ct;
258 push @fa,($ca/$all)*100;
259 push @fc,($cc/$all)*100;
260 push @fg,($cg/$all)*100;
261 push @ft,($ct/$all)*100;
264 $desc.=", source is $mformat";
265 if ($mformat eq 'pwm') {
267 %mparam=(-pA
=>\
@fa,-pC
=>\
@fc,-pG
=>\
@fg,-pT
=>\
@ft,-id
=>$id,-desc
=>$desc,
268 -lA
=>$a,-lC
=>$c,-lG
=>$g,-lT
=>$t);
271 %mparam=(-pA
=>$a,-pC
=>$c,-pG
=>$g,-pT
=>$t,-id
=>$id,-desc
=>$desc);
273 return new Bio
::Matrix
::PSM
::SiteMatrix
(%mparam);
276 sub _rearrange_matrix
{
279 foreach my $entry (@
{$mdata}) {
280 my ($a,$c,$g,$t)=@
$entry;
286 return \
@a,\
@c,\
@g,\
@t;
290 sub _count_positions
{
293 my $l=length($seq->[0])-1;
294 for( my $i = 0; $i <= $l; $i++ ) {
295 for ( qw(A C G T) ) {
299 foreach my $sequence (@
{$seq}) {
300 my @let= split(//,$sequence);
301 for my $i (0..$#let) {
302 $pos{uc($let[$i])}->[$i]++;
305 return $pos{A
},$pos{C
},$pos{G
},$pos{T
};
311 my $a=int(10*$h->{pA
}+0.5);
312 my $c=int(10*$h->{pC
}+0.5);
313 my $g=int(10*$h->{pG
}+0.5);
314 my $t=int(10*$h->{pT
}+0.5);
315 return ($a,$c,$g,$t);