tag fourth (and hopefully last) alpha
[bioperl-live.git] / branch-1-6 / Bio / Matrix / PSM / IO / mast.pm
blob2087de3f5cc03451504a52f5100948e71db7d6b5
1 # $Id$
3 =head1 NAME
5 Bio::Matrix::PSM::IO::mast - PSM mast parser implementation
7 =head1 SYNOPSIS
9 See Bio::Matrix::PSM::IO for detailed documentation on how to
10 use PSM parsers
12 =head1 DESCRIPTION
14 Parser for mast. This driver unlike meme or transfac for example is
15 dedicated more to PSM sequence matches, than to PSM themselves.
17 =head1 TO DO
19 Section III should be parsed too, otherwise no real sequence is
20 available, so we supply 'NNNNN....' as a seq which is not right.
22 =head1 FEEDBACK
24 =head2 Mailing Lists
26 User feedback is an integral part of the evolution of this
27 and other Bioperl modules. Send your comments and suggestions preferably
28 to one of the Bioperl mailing lists. Your participation is much appreciated.
30 bioperl-l@bioperl.org - General discussion
31 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
33 =head2 Support
35 Please direct usage questions or support issues to the mailing list:
37 I<bioperl-l@bioperl.org>
39 rather than to the module maintainer directly. Many experienced and
40 reponsive experts will be able look at the problem and quickly
41 address it. Please include a thorough description of the problem
42 with code and data examples if at all possible.
44 =head2 Reporting Bugs
46 Report bugs to the Bioperl bug tracking system to help us keep track
47 the bugs and their resolution. Bug reports can be submitted via the
48 web:
50 http://bugzilla.open-bio.org/
52 =head1 AUTHOR - Stefan Kirov
54 Email skirov@utk.edu
56 =head1 APPENDIX
58 The rest of the documentation details each of the object
59 methods. Internal methods are usually preceded with a _
61 =cut
63 # Let the code begin...
64 package Bio::Matrix::PSM::IO::mast;
65 use Bio::Matrix::PSM::InstanceSite;
66 use Bio::Matrix::PSM::Psm;
67 use Bio::Root::Root;
68 use strict;
70 use base qw(Bio::Matrix::PSM::PsmHeader Bio::Matrix::PSM::IO);
72 =head2 new
74 Title : new
75 Usage : my $psmIO = Bio::Matrix::PSM::IO->new(-format=>'mast',
76 -file=>$file);
77 Function: Associates a file with the appropriate parser
78 Throws : Throws if the file passed is in HTML format or if
79 some criteria for the file
80 format are not met.
81 Example :
82 Returns : psm object, associated with a file with matrix file
83 Args : hash
84 return : "Bio::Matrix::PSM::$format"->new(@args);
86 =cut
89 sub new {
90 my($class, @args)=@_;
91 my $self = $class->SUPER::new(@args);
92 my (%instances,@header,$n);
93 my ($file)=$self->_rearrange(['FILE'], @args);
94 $self->{file} = $file;
95 $self->{_factor}=1;
96 $self->_initialize_io(@args) || warn "Did you intend to use STDIN?"; #Read only for now
97 $self->{_end}=0;
98 undef $self->{hid};
99 return $self if ($file=~/^>/);#Just writing
100 my $buf=$self->_readline;
101 $self->throw('Cannot parse HTML format yet') if ($buf =~/^<HTML>/);
102 # this should probably be moved to its own function
103 while ( defined($buf=$self->_readline)) {
104 chomp($buf);
105 if ($buf=~/DATABASE AND MOTIFS/) {
106 while ($buf=$self->_readline) {
107 if ($buf=~/DATABASE/) {
108 $buf=~s/^[\s\t]+//;
109 chomp $buf;
110 ($n,$self->{_dbname},$self->{_dbtype})=split(/\s/,$buf);
111 $self->{_dbtype}=~s/[\(\)]//g;
113 if ($buf=~/MOTIFS/) {
114 $buf=~s/^[\s\t]+//;
115 chomp $buf;
116 ($n,$self->{_mrsc},$self->{_msrctype})=split(/\s/,$buf);
117 $self->{_msrctype}=~s/[\(\)]//g;
118 last;
121 if ($self->{_msrctype} ne $self->{_dbtype}) {#Assume we have protein motifs, nuc DB (not handling opp.)
122 $self->{_factor}=3;
123 $self->{_mixquery}=1;
126 if ($buf=~m/MOTIF WIDTH BEST POSSIBLE MATCH/) {
127 $self->_readline;
128 while (defined($buf=$self->_readline)) {
129 last if ($buf!~/\w/);
130 $buf=~s/\t+//g;
131 $buf=~s/^\s+//g;
132 my ($id,$width,$seq)=split(/\s+/,$buf);
133 push @{$self->{hid}},$id;
134 $self->{length}->{$id}=$width;
135 $self->{seq}->{$id}=$seq;
137 next;
139 if ($buf=~m/section i:/i) {
140 $self->_readline;
141 $self->_readline;
142 $self->_readline;
143 %instances=_get_genes($self);
144 $self->{instances}=\%instances;
145 if (!(%instances)) {
146 $self->warn ("Your MAST analysis did not find any matches satisfying the current threshold.\nSee MAST documentation for more information.\n");
147 return $self; #The header might be useful so we return the object, not undef
149 next;
151 if ($buf=~m/section ii:/i) {
152 $self->_readline;
153 $self->_readline;
154 $self->_readline;
155 last;
157 $buf=~s/[\t+\s+]/ /g;
158 push @header,$buf unless (($buf=~/\*{10,}/)||($buf!~/\w/));
160 $self->throw('Could not read Section I, probably wrong format, make sure it is not HTML, giving up...') if !(%instances);
161 $self->warn( "This file might be an unreadable version, proceed with caution!\n") if (!grep(/\s+MAST\s+version\s+3/,@header));
163 $self->{unstructured} = \@header;
164 $self->_initialize;
165 return $self;
169 # Get the file header and put store it as a hash, which later we'll use to create
170 # the header for each Psm. See Bio::Matrix::PSM::PsmI for header function.
171 sub _get_genes {
172 my $self=shift;
173 my %llid;
174 my $ok=0;
175 my $i=0;
176 my %instances;
177 while (my $line=$self->_readline) {
178 last if ($line=~/^[\s\t*]/); # Well, ids can be nearly anything...???
179 chomp($line);
180 $i++;
181 next if ($line eq '');
182 $line=~s/\s+/,/g;
183 my ($id,$key,$eval,$len)=split(/,/,$line);
184 unless ($len) {
185 warn "Malformed data found: $line\n";
186 next;
188 $instances{$id}=Bio::Matrix::PSM::InstanceSite->new(-id=>$id,
189 -desc=>$key,
190 -score=>$eval,
191 -width=>$len,
192 -seq=>'ACGT');
194 return %instances;
198 =head2 next_psm
200 Title : next_psm
201 Usage : my $psm=$psmIO->next_psm();
202 Function: Reads the next PSM from the input file, associated with this object
203 Throws : Throws if there ara format violations in the input file (checking is not
204 very strict with all drivers).
205 Example :
206 Returns : Bio::Matrix::PSM::Psm object
207 Args : none
209 =cut
212 sub next_psm {
213 my $self=shift;
214 return if ($self->{_end}==1);
215 my (@lmotifsm,%index,$eval,$scheme,$sid);
216 %index= %{$self->{length}};
217 my (@instances,%instances);
218 my $line=$self->_readline;
219 $line=~s/[\t\n]//;
220 if ($line =~ /\*{10,}/) { #Endo of Section II if we do only section II
221 $self->{_end}=1;
222 return ;
224 do {
225 if ($line!~/^\s/) {
226 ($sid,$eval,$scheme)=split(/\s+/,$line,3);
228 else
229 { $scheme .=$line; }
230 $line=$self->_readline;
231 $line=~s/[\t\n]//;
232 } until ($line!~/^\s/);
233 my $pos=1;
234 $scheme=~s/\s+//g;
235 $scheme=~s/\n//g;
236 my @motifs=split(/_/,$scheme);
237 while (@motifs) {
238 my $next=shift(@motifs);
239 if (!($next=~/\D/)) {
240 last if (!@motifs);
241 $pos+=$next;
242 next;
244 my $id=$next;
245 my $score= $id=~m/\[/ ? 'strong' : 'weak' ;
246 my $frame;
247 my $strand = $id =~ m/\-\d/ ? -1 : 1 ;
248 if ($self->{_mixquery}) {
249 $frame = 0 if $id =~ m/\d+a/ ;
250 $frame = 1 if $id =~ m/\d+b/ ;
251 $frame = 2 if $id =~ m/\d+c/ ;
253 $id=~s/\D+//g;
255 my @s;
256 my $width=$index{$id};
257 #We don't know the sequence, but we know the length
258 my $seq='N' x ($width*$self->{_factor}); #Future version will have to parse Section tree nad get the real seq
259 my $instance=Bio::Matrix::PSM::InstanceSite->new
260 ( -id=>"$id\@$sid",
261 -mid=>$id,
262 -accession_number=>$sid,
263 -desc=>"Motif $id occurrance in $sid",
264 -score=>$score,
265 -seq=>$seq,
266 -alphabet => 'dna',
267 -start=>$pos,
268 -strand=>$strand);
269 $instance->frame($frame) if ($self->{_mixquery});
270 push @instances,$instance;
271 $pos+=$index{$id}*$self->{_factor};
273 my $psm= Bio::Matrix::PSM::Psm->new(-instances=> \@instances,
274 -e_val => $eval,
275 -id => $sid);
276 $self->_pushback($line);
277 return $psm;
281 =head2 write_psm
283 Title : write_psm
284 Usage : #Get SiteMatrix object somehow (see Bio::Matrix::PSM::SiteMatrix)
285 my $matrix=$psmin->next_matrix;
286 #Create the stream
287 my $psmio=new(-file=>">psms.mast",-format=>'mast');
288 $psmio->write_psm($matrix);
289 #Will warn if only PFM data is contained in $matrix, recalculate the PWM
290 #based on normal distribution (A=>0.25, C=>0.25, etc)
291 Function: writes pwm in mast format
292 Throws :
293 Example :
294 Args : SiteMatrix object
295 Returns :
297 =cut
299 sub write_psm {
300 my ($self,$matrix)=@_;
301 # my $idline=">". $matrix->id . "\n";
302 my $w=$matrix->width;
303 my $header="ALPHABET= ACGT\nlog-odds matrix: alength= 4 w= $w\n";
304 $self->_print($header);
305 unless ($matrix->get_logs_array('A')) {
306 warn "No log-odds data, available, using normal distribution to recalculate the PWM";
307 $matrix->calc_weight({A=>0.25, C=>0.25, G=>0.25,T=>0.25});
309 while (my %h=$matrix->next_pos) {
310 $self->_print (join("\t",$h{lA},$h{lC},$h{lG},$h{lT},"\n"));