5 Bio::Matrix::PSM::IO::mast - PSM mast parser implementation
9 See Bio::Matrix::PSM::IO for detailed documentation on how to
14 Parser for mast. This driver, unlike meme or transfac for example, is
15 dedicated more to PSM sequence matches than to the PSMs themselves.
19 Section III should be parsed too, otherwise no real sequence is
20 available, so we supply 'NNNNN....' as a seq which is not right.
26 User feedback is an integral part of the evolution of this
27 and other Bioperl modules. Send your comments and suggestions preferably
28 to one of the Bioperl mailing lists. Your participation is much appreciated.
30 bioperl-l@bioperl.org - General discussion
31 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
35 Please direct usage questions or support issues to the mailing list:
37 I<bioperl-l@bioperl.org>
39 rather than to the module maintainer directly. Many experienced and
40 responsive experts will be able to look at the problem and quickly
41 address it. Please include a thorough description of the problem
42 with code and data examples if at all possible.
46 Report bugs to the Bioperl bug tracking system to help us keep track
47 of the bugs and their resolution. Bug reports can be submitted via the
50 http://bugzilla.open-bio.org/
52 =head1 AUTHOR - Stefan Kirov
58 The rest of the documentation details each of the object
59 methods. Internal methods are usually preceded with a _
63 # Let the code begin...
64 package Bio
::Matrix
::PSM
::IO
::mast
;
65 use Bio
::Matrix
::PSM
::InstanceSite
;
66 use Bio
::Matrix
::PSM
::Psm
;
70 use base
qw(Bio::Matrix::PSM::PsmHeader Bio::Matrix::PSM::IO);
75 Usage : my $psmIO = Bio::Matrix::PSM::IO->new(-format=>'mast',
77 Function: Associates a file with the appropriate parser
78 Throws : Throws if the file passed is in HTML format or if
79 some criteria for the file
82 Returns : psm object, associated with a file with matrix file
84 return : "Bio::Matrix::PSM::$format"->new(@args);
# Constructor body (the enclosing sub line is not visible in this excerpt).
# Creates the object through the parent constructor, attaches the input and,
# when reading, scans the MAST report header and Section I.
91 my $self = $class->SUPER::new
(@args);
92 my (%instances,@header,$n);
# Extract the -file argument from the named arguments and keep it on the object.
93 my ($file)=$self->_rearrange(['FILE'], @args);
94 $self->{file
} = $file;
96 $self->_initialize_io(@args) || warn "Did you intend to use STDIN?"; #Read only for now
# A file name starting with '>' means the object is used for writing only,
# so no parsing is attempted.
99 return $self if ($file=~/^>/);#Just writing
# The first line is only inspected to reject HTML output; the loop below
# starts reading from the following line.
100 my $buf=$self->_readline;
101 $self->throw('Cannot parse HTML format yet') if ($buf =~/^<HTML>/);
102 # this should probably be moved to its own function
103 while ( defined($buf=$self->_readline)) {
# "DATABASE AND MOTIFS" part: record the database and the motif source.
105 if ($buf=~/DATABASE AND MOTIFS/) {
106 while ($buf=$self->_readline) {
107 if ($buf=~/DATABASE/) {
# Split on single whitespace characters: first field is discarded into $n,
# the next two are stored as _dbname and _dbtype; parentheses are then
# stripped from the type.
110 ($n,$self->{_dbname
},$self->{_dbtype
})=split(/\s/,$buf);
111 $self->{_dbtype
}=~s/[\(\)]//g;
113 if ($buf=~/MOTIFS/) {
# Same layout for the motif source line.
# NOTE(review): the name is stored under '_mrsc' while the type uses
# '_msrctype' - possibly a misspelt key; confirm against readers of it.
116 ($n,$self->{_mrsc
},$self->{_msrctype
})=split(/\s/,$buf);
117 $self->{_msrctype
}=~s/[\(\)]//g;
# Differing motif and database types flag a "mixed" query.
121 if ($self->{_msrctype
} ne $self->{_dbtype
}) {#Assume we have protein motifs, nuc DB (not handling opp.)
123 $self->{_mixquery
}=1;
# Motif table: one motif per line until a line without word characters.
126 if ($buf=~m/MOTIF WIDTH BEST POSSIBLE MATCH/) {
128 while (defined($buf=$self->_readline)) {
129 last if ($buf!~/\w/);
# Whitespace-separated columns: motif id, width, best possible match.
# The ids are kept in order in {hid}; width and sequence are indexed by id.
132 my ($id,$width,$seq)=split(/\s+/,$buf);
133 push @
{$self->{hid
}},$id;
134 $self->{length}->{$id}=$width;
135 $self->{seq
}->{$id}=$seq;
# Section I: delegate to _get_genes and keep the resulting hash on the object.
139 if ($buf=~m/section i:/i) {
143 %instances=_get_genes
($self);
144 $self->{instances
}=\
%instances;
146 $self->warn ("Your MAST analysis did not find any matches satisfying the current threshold.\nSee MAST documentation for more information.\n");
147 return $self; #The header might be useful so we return the object, not undef
151 if ($buf=~m/section ii:/i) {
# The bracket expression is a character class: every single tab, '+' or
# whitespace character is replaced by one space ('+' is literal here).
157 $buf=~s/[\t+\s+]/ /g;
# Keep the line as unstructured header text unless it is a row of at least
# ten asterisks or contains no word characters.
158 push @header,$buf unless (($buf=~/\*{10,}/)||($buf!~/\w/));
# Without any Section I instances the input is treated as unparsable.
160 $self->throw('Could not read Section I, probably wrong format, make sure it is not HTML, giving up...') if !(%instances);
# Warn when no collected header line mentions "MAST version 3".
161 $self->warn( "This file might be an unreadable version, proceed with caution!\n") if (!grep(/\s+MAST\s+version\s+3/,@header));
163 $self->{unstructured
} = \
@header;
169 # Get the file header and store it as a hash, which later we'll use to create
170 # the header for each Psm. See Bio::Matrix::PSM::PsmI for header function.
# Body of the Section I reader (the enclosing sub line is not visible in this
# excerpt). Fills %instances with one InstanceSite object per sequence id.
177 while (my $line=$self->_readline) {
# Stop at the first line that begins with whitespace or an asterisk.
178 last if ($line=~/^[\s\t*]/); # Well, ids can be nearly anything...???
181 next if ($line eq '');
# The line is split on commas into four fields.
# NOTE(review): the step that makes the line comma-separated is not visible
# here; presumably the fields are sequence id, description key, E-value and
# length - confirm against the full source.
183 my ($id,$key,$eval,$len)=split(/,/,$line);
185 warn "Malformed data found: $line\n";
188 $instances{$id}=Bio
::Matrix
::PSM
::InstanceSite
->new(-id
=>$id,
201 Usage : my $psm=$psmIO->next_psm();
202 Function: Reads the next PSM from the input file, associated with this object
203 Throws : Throws if there are format violations in the input file (checking is not
204 very strict with all drivers).
206 Returns : Bio::Matrix::PSM::Psm object
# Body of next_psm (the enclosing sub line is not visible in this excerpt).
# Reads one Section II record and turns its motif diagram into InstanceSite
# objects wrapped in a Psm object.
# Nothing more to return once the end flag has been set.
214 return if ($self->{_end
}==1);
215 my (@lmotifsm,%index,$eval,$scheme,$sid);
# Local copy of the motif-id => width table that was stored in {length}.
216 %index= %{$self->{length}};
217 my (@instances,%instances);
218 my $line=$self->_readline;
220 if ($line =~ /\*{10,}/) { #End of Section II if we do only section II
# First three whitespace-separated fields: sequence id, E-value and the
# motif diagram; the limit of 3 keeps the rest of the line in $scheme.
226 ($sid,$eval,$scheme)=split(/\s+/,$line,3);
# Keep reading while the following lines start with whitespace.
230 $line=$self->_readline;
232 } until ($line!~/^\s/);
# The diagram is '_'-separated.
236 my @motifs=split(/_/,$scheme);
238 my $next=shift(@motifs);
# An element made only of digits is handled by this branch.
# NOTE(review): presumably a spacer length; the branch body is not visible.
239 if (!($next=~/\D/)) {
# A '[' in the element marks the hit as 'strong', otherwise 'weak'.
245 my $score= $id=~m/\[/ ?
'strong' : 'weak' ;
# A '-' followed by a digit marks the reverse strand.
247 my $strand = $id =~ m/\-\d/ ?
-1 : 1 ;
# For mixed queries the letter after the motif number selects the frame
# (a => 0, b => 1, c => 2).
248 if ($self->{_mixquery
}) {
249 $frame = 0 if $id =~ m/\d+a/ ;
250 $frame = 1 if $id =~ m/\d+b/ ;
251 $frame = 2 if $id =~ m/\d+c/ ;
256 my $width=$index{$id};
257 #We don't know the sequence, but we know the length
# Placeholder sequence: 'N' repeated width * _factor times.
258 my $seq='N' x
($width*$self->{_factor
}); #Future version will have to parse Section III and get the real seq
259 my $instance=Bio
::Matrix
::PSM
::InstanceSite
->new
262 -accession_number
=>$sid,
263 -desc
=>"Motif $id occurrance in $sid",
269 $instance->frame($frame) if ($self->{_mixquery
});
270 push @instances,$instance;
# Advance the running position by the scaled motif width.
271 $pos+=$index{$id}*$self->{_factor
};
273 my $psm= Bio
::Matrix
::PSM
::Psm
->new(-instances
=> \
@instances,
# Return the look-ahead line to the input so the next call sees it again.
276 $self->_pushback($line);
284 Usage : #Get SiteMatrix object somehow (see Bio::Matrix::PSM::SiteMatrix)
285 my $matrix=$psmin->next_matrix;
287 my $psmio=new(-file=>">psms.mast",-format=>'mast');
288 $psmio->write_psm($matrix);
289 #Will warn if only PFM data is contained in $matrix, recalculate the PWM
290 #based on normal distribution (A=>0.25, C=>0.25, etc)
291 Function: writes pwm in mast format
294 Args : SiteMatrix object
# Body of write_psm (the enclosing sub line is not visible in this excerpt).
# Prints the matrix passed in $matrix as a log-odds matrix through _print.
300 my ($self,$matrix)=@_;
301 # my $idline=">". $matrix->id . "\n";
302 my $w=$matrix->width;
# Fixed header: DNA alphabet, alphabet length 4 and the matrix width.
303 my $header="ALPHABET= ACGT\nlog-odds matrix: alength= 4 w= $w\n";
304 $self->_print($header);
# If no log-odds values are available for 'A', recompute the weights with a
# uniform background of 0.25 per nucleotide.
305 unless ($matrix->get_logs_array('A')) {
306 warn "No log-odds data, available, using normal distribution to recalculate the PWM";
307 $matrix->calc_weight({A
=>0.25, C
=>0.25, G
=>0.25,T
=>0.25});
# One output row per position: the lA, lC, lG and lT values joined by tabs.
# Because "\n" is the last joined element, each row ends with a tab
# followed by the newline.
309 while (my %h=$matrix->next_pos) {
310 $self->_print (join("\t",$h{lA
},$h{lC
},$h{lG
},$h{lT
},"\n"));