sync w/ main trunk
[bioperl-live.git] / Bio / Tools / HMM.pm
blobac089831a04fb3400dd019d97a46ecf5b6aa4254
1 ## $Id$
3 # BioPerl module for Bio::Tools::HMM
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Yee Man Chan <ymc@yahoo.com>
9 # Copyright Yee Man Chan
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
15 =head1 NAME
17 Bio::Tools::HMM - Perl extension to perform Hidden Markov Model calculations
19 =head1 SYNOPSIS
21 use Bio::Tools::HMM;
22 use Bio::SeqIO;
23 use Bio::Matrix::Scoring;
25 # create a HMM object
26 # ACGT are the bases NC mean non-coding and coding
27 $hmm = Bio::Tools::HMM->new('-symbols' => "ACGT", '-states' => "NC");
29 # initialize some training observation sequences
30 $seq1 = Bio::SeqIO->new(-file => $ARGV[0], -format => 'fasta');
31 $seq2 = Bio::SeqIO->new(-file => $ARGV[1], -format => 'fasta');
32 @seqs = ($seq1, $seq2);
34 # train the HMM with the observation sequences
35 $hmm->baum_welch_training(\@seqs);
37 # get parameters
38 $init = $hmm->init_prob; # returns an array reference
39 $matrix1 = $hmm->transition_prob; # returns Bio::Matrix::Scoring
40 $matrix2 = $hmm->emission_prob; # returns Bio::Matrix::Scoring
42 # initialize training hidden state sequences
43 $hs1 = "NCNCNNNNNCCCCNNCCCNNNNC";
44 $hs2 = "NCNNCNNNNNNCNCNCNNNCNCN";
45 @hss = ($hs1, $hs2);
47 # train the HMM with both observation sequences and hidden state
48 # sequences
49 $hmm->statistical_training(\@seqs, \@hss);
51 # with the newly calibrated HMM, we can use viterbi algorithm
52 # to obtain the hidden state sequence underlying an observation
53 # sequence
54 $hss = $hmm->viterbi($seq); # returns a string of hidden states
56 =head1 DESCRIPTION
58 Hidden Markov Model (HMM) was first introduced by Baum and his colleagues
59 in a series of classic papers in the late 1960s and 1970s. It was first
60 applied to the field of speech recognition with great success in the 1970s.
62 Explosion in the amount sequencing data in the 1990s opened the field
63 of Biological Sequence Analysis. Seeing HMM's effectiveness in detecing
64 signals in biological sequences, Krogh, Mian and Haussler used HMM to find
65 genes in E. coli DNA in a classical paper in 1994. Since then, there have
66 been extensive application of HMM to other area of Biology, for example,
67 multiple sequence alignment, CpG island detection and so on.
69 =head1 DEPENDENCIES
71 This package comes with the main bioperl distribution. You also need
72 to install the lastest bioperl-ext package which contains the XS code
73 that implements the algorithms. This package won't work if you haven't
74 compiled the bioperl-ext package.
76 =head1 TO-DO
79 =over 3
81 =item 1.
83 Allow people to set and get the tolerance level in the EM algorithm.
85 =item 2.
87 Allow people to set and get the maximum number of iterations
88 to run in the EM algorithm.
90 =item 3.
92 A function to calculate the probability of an observation sequence
94 =item 4.
96 A function to do posterior decoding, ie to find the probabilty of
97 seeing a certain observation symbol at position i.
99 =back
101 =head1 FEEDBACK
103 =head2 Mailing Lists
105 User feedback is an integral part of the evolution of this and other
106 Bioperl modules. Send your comments and suggestions preferably to one
107 of the Bioperl mailing lists. Your participation is much appreciated.
109 bioperl-l@bioperl.org - General discussion
110 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
112 =head2 Support
114 Please direct usage questions or support issues to the mailing list:
116 L<bioperl-l@bioperl.org>
118 rather than to the module maintainer directly. Many experienced and
119 reponsive experts will be able look at the problem and quickly
120 address it. Please include a thorough description of the problem
121 with code and data examples if at all possible.
123 =head2 Reporting Bugs
125 Report bugs to the Bioperl bug tracking system to help us keep track
126 the bugs and their resolution. Bug reports can be submitted via the
127 web:
129 http://bugzilla.open-bio.org/
131 =head1 AUTHOR
133 This implementation was written by Yee Man Chan (ymc@yahoo.com).
134 Copyright (c) 2005 Yee Man Chan. All rights reserved. This program
135 is free software; you can redistribute it and/or modify it under
136 the same terms as Perl itself. All the code are written by Yee
137 Man Chan without borrowing any code from anywhere.
139 =cut
141 package Bio::Tools::HMM;
143 use base qw(Bio::Root::Root);
145 BEGIN {
146 eval {
147 require Bio::Ext::HMM;
149 if ( $@ ) {
150 die("\nThe C-compiled engine for Hidden Markov Model (HMM) has not been installed.\n Please read the install the bioperl-ext package\n\n");
151 exit(1);
155 sub new {
156 my ($class, @args) = @_;
158 my $self = $class->SUPER::new(@args);
160 my ($symbols, $states, $init, $a_mat, $e_mat) = $self->_rearrange([qw(SYMBOLS
161 STATES
162 INIT
163 AMAT
164 EMAT
165 )], @args);
167 $self->throw("Observation Symbols are not defined!") unless defined $symbols;
168 $self->throw("Hidden States are not defined!") unless defined $states;
170 if (defined $symbols) {
171 if (scalar($symbols)) {
172 # check duplicate symbols
173 if ($self->symbols($symbols) < 0) {
174 $self->throw("Duplicate symbols!\n");
177 else {
178 $self->throw("We don't support list of symbols in this version.\n");
182 if (defined $states) {
183 if (scalar($states)) {
184 # check duplicate states
185 if ($self->states($states) < 0) {
186 $self->throw("Duplicate states!\n");
189 else {
190 $self->throw("We don't support list of states in this version.\n");
194 $self->{'hmm'} = Bio::Ext::HMM::HMM->new($symbols, $states);
195 return $self;
198 =head2 likelihood
200 Title : likelihood
201 Usage : $prob = $hmm->likelihood($seq)
202 Function: Calculate the probability of an observation sequence given an HMM
203 Returns : An floating point number that is the logarithm of the probability
204 of an observation sequence given an HMM
205 Args : The only argument is a string that is the observation sequence
206 you are interested in. Note that the sequence must not contain
207 any character that is not in the alphabet of observation symbols.
209 =cut
211 sub likelihood {
212 my ($self, $seq) = @_;
213 my $valid_symbols;
215 if( ! defined $seq) {
216 $self->warn("Cannot calculate without supply an observation sequence!");
217 return;
219 my $s = $self->{'symbols'};
220 $_ = $seq;
221 $valid_symbols = eval "tr/$s//;";
222 if ($valid_symbols != length($seq)) {
223 $self->throw("Observation Sequence contains characters that is not in the
224 alphabet of observation symbols!\n");
226 return Bio::Ext::HMM->HMM_likelihood($self->{'hmm'}, $seq);
229 =head2 statistical_training
231 Title : statistical_training
232 Usage : $hmm->statistical_training(\@seqs, \@hss)
233 Function: Estimate the parameters of an HMM given an array of observation
234 sequence and an array of the corresponding hidden state
235 sequences
236 Returns : Returns nothing. The parameters of the HMM will be set to the
237 estimated values
238 Args : The first argument is a reference to an array of observation
239 sequences. The second argument is a reference to an array of
240 hidden state sequences. Note that the lengths of an observation
241 sequence and a hidden state sequence must be the same.
243 =cut
245 sub statistical_training {
246 my ($self, $seqs, $hss) = @_;
247 my $valid_symbols;
248 my $seq_cnt, $hs_cnt;
249 my $i;
251 if( ! defined $seqs or ! defined $hss) {
252 $self->warn("Cannot calculate without supply an observation and a hidden state sequence!");
253 return;
255 $seq_cnt = @{$seqs};
256 $hs_cnt = @{$seqs};
257 if ($seq_cnt != $hs_cnt) {
258 $self->throw("There must be the same number of observation sequences and
259 hidden state sequences!\n");
261 for ($i = 0; $i < $seq_cnt; ++$i) {
262 if (length(@{$seqs}[$i]) != length(@{$hss}[$i])) {
263 $self->throw("The corresponding observation sequences and hidden state sequences must be of the same length!\n");
266 foreach $seq (@{$seqs}) {
267 my $s = $self->{'symbols'};
268 $_ = $seq;
269 $valid_symbols = eval "tr/$s//;";
270 if ($valid_symbols != length($seq)) {
271 $self->throw("Observation Sequence contains characters that is not in the
272 alphabet of observation symbols!\n");
275 foreach $seq (@{$hss}) {
276 my $s = $self->{'states'};
277 $_ = $seq;
278 $valid_symbols = eval "tr/$s//;";
279 if ($valid_symbols != length($seq)) {
280 $self->throw("Hidden State Sequence contains characters that is not in the
281 alphabet of hidden states!\n");
284 Bio::Ext::HMM->HMM_statistical_training($self->{'hmm'}, $seqs, $hss);
287 =head2 baum_welch_training
289 Title : baum_welch_training
290 Usage : $hmm->baum_welch_training(\@seqs)
291 Function: Estimate the parameters of an HMM given an array of observation
292 sequence
293 Returns : Returns nothing. The parameters of the HMM will be set to the
294 estimated values
295 Args : The only argument is a reference to an array of observation
296 sequences.
298 =cut
300 sub baum_welch_training {
301 my ($self, $seqs) = @_;
302 my $valid_symbols;
304 if( ! defined $seqs) {
305 $self->warn("Cannot calculate without supply an observation sequence!");
306 return;
308 foreach $seq (@{$seqs}) {
309 my $s = $self->{'symbols'};
310 $_ = $seq;
311 $valid_symbols = eval "tr/$s//;";
312 if ($valid_symbols != length($seq)) {
313 $self->throw("Observation Sequence contains characters that is not in the
314 alphabet of observation symbols!\n");
317 Bio::Ext::HMM->HMM_baum_welch_training($self->{'hmm'}, $seqs);
320 =head2 viterbi
322 Title : viterbi
323 Usage : $hss = $hmm->viterbi($seq)
324 Function: Find out the hidden state sequence that can maximize the
325 probability of seeing observation sequence $seq.
326 Returns : Returns a string that is the hidden state sequence that maximizes
327 the probability of seeing $seq.
328 Args : The only argument is an observation sequence.
330 =cut
332 sub viterbi {
333 my ($self, $seq) = @_;
334 my $valid_symbols;
336 if( ! defined $seq) {
337 $self->warn("Cannot calculate without supply an observation sequence!");
338 return;
340 my $s = $self->{'symbols'};
341 $_ = $seq;
342 $valid_symbols = eval "tr/$s//;";
343 if ($valid_symbols != length($seq)) {
344 $self->throw("Observation Sequence contains characters that is not in the
345 alphabet of observation symbols!\n");
347 return Bio::Ext::HMM->HMM_viterbi($self->{'hmm'}, $seq);
350 =head2 symbols
352 Title : symbols
353 Usage : $symbols = $hmm->symbols() #get
354 : $hmm->symbols($value) #set
355 Function : the set get for the observation symbols
356 Example :
357 Returns : symbols string
358 Arguments : new value
360 =cut
362 sub symbols {
363 my ($self,$val) = @_;
364 my %alphabets = ();
365 my $c;
367 if ( defined $val ) {
368 # find duplicate
370 for ($i = 0; $i < length($val); ++$i) {
371 $c = substr($val, $i, 1);
372 if (defined $alphabets{$c}) {
373 $self->throw("Can't have duplicate symbols!");
375 else {
376 $alphabets{$c} = 1;
379 $self->{'symbols'} = $val;
381 return $self->{'symbols'};
385 =head2 states
387 Title : states
388 Usage : $states = $hmm->states() #get
389 : $hmm->states($value) #set
390 Function : the set get for the hidden states
391 Example :
392 Returns : states string
393 Arguments : new value
395 =cut
397 sub states {
398 my ($self,$val) = @_;
399 my %alphabets = ();
400 my $c;
402 if ( defined $val ) {
403 # find duplicate
405 for ($i = 0; $i < length($val); ++$i) {
406 $c = substr($val, $i, 1);
407 if (defined $alphabets{$c}) {
408 $self->throw("Can't have duplicate states!");
410 else {
411 $alphabets{$c} = 1;
414 $self->{'states'} = $val;
416 return $self->{'states'};
419 =head2 init_prob
421 Title : init_prob
422 Usage : $init = $hmm->init_prob() #get
423 : $hmm->transition_prob(\@init) #set
424 Function : the set get for the initial probability array
425 Example :
426 Returns : reference to double array
427 Arguments : new value
429 =cut
431 sub init_prob {
432 my ($self, $init) = @_;
433 my $i;
434 my @A;
436 if (defined $init) {
437 if (ref($init)) {
438 my $size = @{$init};
439 my $sum = 0.0;
440 foreach (@{$init}) {
441 $sum += $_;
443 if ($sum != 1.0) {
444 $self->throw("The sum of initial probability array must be 1.0!\n");
446 if ($size != length($self->{'states'})) {
447 $self->throw("The size of init array $size is different from the number of HMM's hidden states!\n");
449 for ($i = 0; $i < $size; ++$i) {
450 Bio::Ext::HMM::HMM->set_init_entry($self->{'hmm'}, substr($self->{'states'}, $i, 1), @{$init}[$i]);
453 else {
454 $self->throw("Initial Probability array must be a reference!\n");
457 else {
458 for ($i = 0; $i < length($self->{'states'}); ++$i) {
459 $A[$i] = Bio::Ext::HMM::HMM->get_init_entry($self->{'hmm'}, substr($self->{'states'}, $i, 1));
461 return \@A;
465 =head2 transition_prob
467 Title : transition_prob
468 Usage : $transition_matrix = $hmm->transition_prob() #get
469 : $hmm->transition_prob($matrix) #set
470 Function : the set get for the transition probability mairix
471 Example :
472 Returns : Bio::Matrix::Scoring
473 Arguments : new value
475 =cut
477 sub transition_prob {
478 my ($self, $matrix) = @_;
479 my $i, $j;
480 my @A;
482 if (defined $matrix) {
483 if ($matrix->isa('Bio::Matrix::Scoring')) {
484 my $row = join("", $matrix->row_names);
485 my $col = join("", $matrix->column_names);
486 if ($row ne $self->{'states'}) {
487 $self->throw("Names of the rows ($row) is different from the states of HMM " . $self->{'states'});
489 if ($col ne $self->{'states'}) {
490 $self->throw("Names of the columns ($col) is different from the states of HMM " . $self->{'states'});
492 for ($i = 0; $i < length($self->{'states'}); ++$i) {
493 my $sum = 0.0;
494 my $a = substr($self->{'states'}, $i, 1);
495 for ($j = 0; $j < length($self->{'states'}); ++$j) {
496 my $b = substr($self->{'states'}, $j, 1);
497 $sum += $matrix->entry($a, $b);
499 if ($sum != 1.0) {
500 $self->throw("Sum of probabilities for each from-state must be 1.0!\n");
503 for ($i = 0; $i < length($self->{'states'}); ++$i) {
504 my $a = substr($self->{'states'}, $i, 1);
505 for ($j = 0; $j < length($self->{'states'}); ++$j) {
506 my $b = substr($self->{'states'}, $j, 1);
507 Bio::Ext::HMM::HMM->set_a_entry($self->{'hmm'}, $a, $b, $matrix->entry($a, $b));
511 else {
512 $self->throw("Transition Probability matrix must be of type Bio::Matrix::Scoring.\n");
515 else {
516 for ($i = 0; $i < length($self->{'states'}); ++$i) {
517 for ($j = 0; $j < length($self->{'states'}); ++$j) {
518 $A[$i][$j] = Bio::Ext::HMM::HMM->get_a_entry($self->{'hmm'}, substr($self->{'states'}, $i, 1), substr($self->{'states'}, $j, 1));
521 my @rows = split(//, $self->{'states'});
522 return $matrix = Bio::Matrix::Scoring->new(-values => \@A, -rownames => \@rows, -colnames => \@rows);
526 =head2 emission_prob
528 Title : emission_prob
529 Usage : $emission_matrix = $hmm->emission_prob() #get
530 : $hmm->emission_prob($matrix) #set
531 Function : the set get for the emission probability mairix
532 Example :
533 Returns : Bio::Matrix::Scoring
534 Arguments : new value
536 =cut
538 sub emission_prob {
539 my ($self, $matrix) = @_;
540 my $i, $j;
541 my @A;
543 if (defined $matrix) {
544 if ($matrix->isa('Bio::Matrix::Scoring')) {
545 my $row = join("", $matrix->row_names);
546 my $col = join("", $matrix->column_names);
547 if ($row ne $self->{'states'}) {
548 $self->throw("Names of the rows ($row) is different from the states of HMM " . $self->{'states'});
550 if ($col ne $self->{'symbols'}) {
551 $self->throw("Names of the columns ($col) is different from the symbols of HMM " . $self->{'symbols'});
553 for ($i = 0; $i < length($self->{'states'}); ++$i) {
554 my $sum = 0.0;
555 my $a = substr($self->{'states'}, $i, 1);
556 for ($j = 0; $j < length($self->{'symbols'}); ++$j) {
557 my $b = substr($self->{'symbols'}, $j, 1);
558 $sum += $matrix->entry($a, $b);
560 if ($sum != 1.0) {
561 $self->throw("Sum of probabilities for each state must be 1.0!\n");
564 for ($i = 0; $i < length($self->{'states'}); ++$i) {
565 my $a = substr($self->{'states'}, $i, 1);
566 for ($j = 0; $j < length($self->{'symbols'}); ++$j) {
567 my $b = substr($self->{'symbols'}, $j, 1);
568 Bio::Ext::HMM::HMM->set_e_entry($self->{'hmm'}, $a, $b, $matrix->entry($a, $b));
572 else {
573 $self->throw("Emission Probability matrix must be of type Bio::Matrix::Scoring.\n");
576 else {
577 for ($i = 0; $i < length($self->{'states'}); ++$i) {
578 for ($j = 0; $j < length($self->{'symbols'}); ++$j) {
579 $A[$i][$j] = Bio::Ext::HMM::HMM->get_e_entry($self->{'hmm'}, substr($self->{'states'}, $i, 1), substr($self->{'symbols'}, $j, 1));
582 my @rows = split(//, $self->{'states'});
583 my @cols = split(//, $self->{'symbols'});
584 return $matrix = Bio::Matrix::Scoring->new(-values => \@A, -rownames => \@rows, -colnames => \@cols);