3 # BioPerl module for Bio::Tools::HMM
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Yee Man Chan <ymc@yahoo.com>
9 # Copyright Yee Man Chan
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
17 Bio::Tools::HMM - Perl extension to perform Hidden Markov Model calculations
23 use Bio::Matrix::Scoring;
26 # ACGT are the bases NC mean non-coding and coding
27 $hmm = Bio::Tools::HMM->new('-symbols' => "ACGT", '-states' => "NC");
29 # initialize some training observation sequences
30 $seq1 = Bio::SeqIO->new(-file => $ARGV[0], -format => 'fasta');
31 $seq2 = Bio::SeqIO->new(-file => $ARGV[1], -format => 'fasta');
32 @seqs = ($seq1, $seq2);
34 # train the HMM with the observation sequences
35 $hmm->baum_welch_training(\@seqs);
38 $init = $hmm->init_prob; # returns an array reference
39 $matrix1 = $hmm->transition_prob; # returns Bio::Matrix::Scoring
40 $matrix2 = $hmm->emission_prob; # returns Bio::Matrix::Scoring
42 # initialize training hidden state sequences
43 $hs1 = "NCNCNNNNNCCCCNNCCCNNNNC";
44 $hs2 = "NCNNCNNNNNNCNCNCNNNCNCN";
47 # train the HMM with both observation sequences and hidden state
49 $hmm->statistical_training(\@seqs, \@hss);
51 # with the newly calibrated HMM, we can use viterbi algorithm
52 # to obtain the hidden state sequence underlying an observation
54 $hss = $hmm->viterbi($seq); # returns a string of hidden states
58 Hidden Markov Model (HMM) was first introduced by Baum and his colleagues
59 in a series of classic papers in the late 1960s and 1970s. It was first
60 applied to the field of speech recognition with great success in the 1970s.
62 Explosion in the amount sequencing data in the 1990s opened the field
63 of Biological Sequence Analysis. Seeing HMM's effectiveness in detecing
64 signals in biological sequences, Krogh, Mian and Haussler used HMM to find
65 genes in E. coli DNA in a classical paper in 1994. Since then, there have
66 been extensive application of HMM to other area of Biology, for example,
67 multiple sequence alignment, CpG island detection and so on.
71 This package comes with the main bioperl distribution. You also need
72 to install the lastest bioperl-ext package which contains the XS code
73 that implements the algorithms. This package won't work if you haven't
74 compiled the bioperl-ext package.
83 Allow people to set and get the tolerance level in the EM algorithm.
87 Allow people to set and get the maximum number of iterations
88 to run in the EM algorithm.
92 A function to calculate the probability of an observation sequence
96 A function to do posterior decoding, ie to find the probabilty of
97 seeing a certain observation symbol at position i.
105 User feedback is an integral part of the evolution of this and other
106 Bioperl modules. Send your comments and suggestions preferably to one
107 of the Bioperl mailing lists. Your participation is much appreciated.
109 bioperl-l@bioperl.org - General discussion
110 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
114 Please direct usage questions or support issues to the mailing list:
116 L<bioperl-l@bioperl.org>
118 rather than to the module maintainer directly. Many experienced and
119 reponsive experts will be able look at the problem and quickly
120 address it. Please include a thorough description of the problem
121 with code and data examples if at all possible.
123 =head2 Reporting Bugs
125 Report bugs to the Bioperl bug tracking system to help us keep track
126 the bugs and their resolution. Bug reports can be submitted via the
129 http://bugzilla.open-bio.org/
133 This implementation was written by Yee Man Chan (ymc@yahoo.com).
134 Copyright (c) 2005 Yee Man Chan. All rights reserved. This program
135 is free software; you can redistribute it and/or modify it under
136 the same terms as Perl itself. All the code are written by Yee
137 Man Chan without borrowing any code from anywhere.
141 package Bio
::Tools
::HMM
;
143 use base
qw(Bio::Root::Root);
147 require Bio
::Ext
::HMM
;
150 die("\nThe C-compiled engine for Hidden Markov Model (HMM) has not been installed.\n Please read the install the bioperl-ext package\n\n");
156 my ($class, @args) = @_;
158 my $self = $class->SUPER::new
(@args);
160 my ($symbols, $states, $init, $a_mat, $e_mat) = $self->_rearrange([qw(SYMBOLS
167 $self->throw("Observation Symbols are not defined!") unless defined $symbols;
168 $self->throw("Hidden States are not defined!") unless defined $states;
170 if (defined $symbols) {
171 if (scalar($symbols)) {
172 # check duplicate symbols
173 if ($self->symbols($symbols) < 0) {
174 $self->throw("Duplicate symbols!\n");
178 $self->throw("We don't support list of symbols in this version.\n");
182 if (defined $states) {
183 if (scalar($states)) {
184 # check duplicate states
185 if ($self->states($states) < 0) {
186 $self->throw("Duplicate states!\n");
190 $self->throw("We don't support list of states in this version.\n");
194 $self->{'hmm'} = Bio
::Ext
::HMM
::HMM
->new($symbols, $states);
201 Usage : $prob = $hmm->likelihood($seq)
202 Function: Calculate the probability of an observation sequence given an HMM
203 Returns : An floating point number that is the logarithm of the probability
204 of an observation sequence given an HMM
205 Args : The only argument is a string that is the observation sequence
206 you are interested in. Note that the sequence must not contain
207 any character that is not in the alphabet of observation symbols.
212 my ($self, $seq) = @_;
215 if( ! defined $seq) {
216 $self->warn("Cannot calculate without supply an observation sequence!");
219 my $s = $self->{'symbols'};
221 $valid_symbols = eval "tr/$s//;";
222 if ($valid_symbols != length($seq)) {
223 $self->throw("Observation Sequence contains characters that is not in the
224 alphabet of observation symbols!\n");
226 return Bio
::Ext
::HMM
->HMM_likelihood($self->{'hmm'}, $seq);
229 =head2 statistical_training
231 Title : statistical_training
232 Usage : $hmm->statistical_training(\@seqs, \@hss)
233 Function: Estimate the parameters of an HMM given an array of observation
234 sequence and an array of the corresponding hidden state
236 Returns : Returns nothing. The parameters of the HMM will be set to the
238 Args : The first argument is a reference to an array of observation
239 sequences. The second argument is a reference to an array of
240 hidden state sequences. Note that the lengths of an observation
241 sequence and a hidden state sequence must be the same.
245 sub statistical_training
{
246 my ($self, $seqs, $hss) = @_;
248 my $seq_cnt, $hs_cnt;
251 if( ! defined $seqs or ! defined $hss) {
252 $self->warn("Cannot calculate without supply an observation and a hidden state sequence!");
257 if ($seq_cnt != $hs_cnt) {
258 $self->throw("There must be the same number of observation sequences and
259 hidden state sequences!\n");
261 for ($i = 0; $i < $seq_cnt; ++$i) {
262 if (length(@
{$seqs}[$i]) != length(@
{$hss}[$i])) {
263 $self->throw("The corresponding observation sequences and hidden state sequences must be of the same length!\n");
266 foreach $seq (@
{$seqs}) {
267 my $s = $self->{'symbols'};
269 $valid_symbols = eval "tr/$s//;";
270 if ($valid_symbols != length($seq)) {
271 $self->throw("Observation Sequence contains characters that is not in the
272 alphabet of observation symbols!\n");
275 foreach $seq (@
{$hss}) {
276 my $s = $self->{'states'};
278 $valid_symbols = eval "tr/$s//;";
279 if ($valid_symbols != length($seq)) {
280 $self->throw("Hidden State Sequence contains characters that is not in the
281 alphabet of hidden states!\n");
284 Bio
::Ext
::HMM
->HMM_statistical_training($self->{'hmm'}, $seqs, $hss);
287 =head2 baum_welch_training
289 Title : baum_welch_training
290 Usage : $hmm->baum_welch_training(\@seqs)
291 Function: Estimate the parameters of an HMM given an array of observation
293 Returns : Returns nothing. The parameters of the HMM will be set to the
295 Args : The only argument is a reference to an array of observation
300 sub baum_welch_training
{
301 my ($self, $seqs) = @_;
304 if( ! defined $seqs) {
305 $self->warn("Cannot calculate without supply an observation sequence!");
308 foreach $seq (@
{$seqs}) {
309 my $s = $self->{'symbols'};
311 $valid_symbols = eval "tr/$s//;";
312 if ($valid_symbols != length($seq)) {
313 $self->throw("Observation Sequence contains characters that is not in the
314 alphabet of observation symbols!\n");
317 Bio
::Ext
::HMM
->HMM_baum_welch_training($self->{'hmm'}, $seqs);
323 Usage : $hss = $hmm->viterbi($seq)
324 Function: Find out the hidden state sequence that can maximize the
325 probability of seeing observation sequence $seq.
326 Returns : Returns a string that is the hidden state sequence that maximizes
327 the probability of seeing $seq.
328 Args : The only argument is an observation sequence.
333 my ($self, $seq) = @_;
336 if( ! defined $seq) {
337 $self->warn("Cannot calculate without supply an observation sequence!");
340 my $s = $self->{'symbols'};
342 $valid_symbols = eval "tr/$s//;";
343 if ($valid_symbols != length($seq)) {
344 $self->throw("Observation Sequence contains characters that is not in the
345 alphabet of observation symbols!\n");
347 return Bio
::Ext
::HMM
->HMM_viterbi($self->{'hmm'}, $seq);
353 Usage : $symbols = $hmm->symbols() #get
354 : $hmm->symbols($value) #set
355 Function : the set get for the observation symbols
357 Returns : symbols string
358 Arguments : new value
363 my ($self,$val) = @_;
367 if ( defined $val ) {
370 for ($i = 0; $i < length($val); ++$i) {
371 $c = substr($val, $i, 1);
372 if (defined $alphabets{$c}) {
373 $self->throw("Can't have duplicate symbols!");
379 $self->{'symbols'} = $val;
381 return $self->{'symbols'};
388 Usage : $states = $hmm->states() #get
389 : $hmm->states($value) #set
390 Function : the set get for the hidden states
392 Returns : states string
393 Arguments : new value
398 my ($self,$val) = @_;
402 if ( defined $val ) {
405 for ($i = 0; $i < length($val); ++$i) {
406 $c = substr($val, $i, 1);
407 if (defined $alphabets{$c}) {
408 $self->throw("Can't have duplicate states!");
414 $self->{'states'} = $val;
416 return $self->{'states'};
422 Usage : $init = $hmm->init_prob() #get
423 : $hmm->transition_prob(\@init) #set
424 Function : the set get for the initial probability array
426 Returns : reference to double array
427 Arguments : new value
432 my ($self, $init) = @_;
444 $self->throw("The sum of initial probability array must be 1.0!\n");
446 if ($size != length($self->{'states'})) {
447 $self->throw("The size of init array $size is different from the number of HMM's hidden states!\n");
449 for ($i = 0; $i < $size; ++$i) {
450 Bio
::Ext
::HMM
::HMM
->set_init_entry($self->{'hmm'}, substr($self->{'states'}, $i, 1), @
{$init}[$i]);
454 $self->throw("Initial Probability array must be a reference!\n");
458 for ($i = 0; $i < length($self->{'states'}); ++$i) {
459 $A[$i] = Bio
::Ext
::HMM
::HMM
->get_init_entry($self->{'hmm'}, substr($self->{'states'}, $i, 1));
465 =head2 transition_prob
467 Title : transition_prob
468 Usage : $transition_matrix = $hmm->transition_prob() #get
469 : $hmm->transition_prob($matrix) #set
470 Function : the set get for the transition probability mairix
472 Returns : Bio::Matrix::Scoring
473 Arguments : new value
477 sub transition_prob
{
478 my ($self, $matrix) = @_;
482 if (defined $matrix) {
483 if ($matrix->isa('Bio::Matrix::Scoring')) {
484 my $row = join("", $matrix->row_names);
485 my $col = join("", $matrix->column_names);
486 if ($row ne $self->{'states'}) {
487 $self->throw("Names of the rows ($row) is different from the states of HMM " . $self->{'states'});
489 if ($col ne $self->{'states'}) {
490 $self->throw("Names of the columns ($col) is different from the states of HMM " . $self->{'states'});
492 for ($i = 0; $i < length($self->{'states'}); ++$i) {
494 my $a = substr($self->{'states'}, $i, 1);
495 for ($j = 0; $j < length($self->{'states'}); ++$j) {
496 my $b = substr($self->{'states'}, $j, 1);
497 $sum += $matrix->entry($a, $b);
500 $self->throw("Sum of probabilities for each from-state must be 1.0!\n");
503 for ($i = 0; $i < length($self->{'states'}); ++$i) {
504 my $a = substr($self->{'states'}, $i, 1);
505 for ($j = 0; $j < length($self->{'states'}); ++$j) {
506 my $b = substr($self->{'states'}, $j, 1);
507 Bio
::Ext
::HMM
::HMM
->set_a_entry($self->{'hmm'}, $a, $b, $matrix->entry($a, $b));
512 $self->throw("Transition Probability matrix must be of type Bio::Matrix::Scoring.\n");
516 for ($i = 0; $i < length($self->{'states'}); ++$i) {
517 for ($j = 0; $j < length($self->{'states'}); ++$j) {
518 $A[$i][$j] = Bio
::Ext
::HMM
::HMM
->get_a_entry($self->{'hmm'}, substr($self->{'states'}, $i, 1), substr($self->{'states'}, $j, 1));
521 my @rows = split(//, $self->{'states'});
522 return $matrix = Bio
::Matrix
::Scoring
->new(-values => \
@A, -rownames
=> \
@rows, -colnames
=> \
@rows);
528 Title : emission_prob
529 Usage : $emission_matrix = $hmm->emission_prob() #get
530 : $hmm->emission_prob($matrix) #set
531 Function : the set get for the emission probability mairix
533 Returns : Bio::Matrix::Scoring
534 Arguments : new value
539 my ($self, $matrix) = @_;
543 if (defined $matrix) {
544 if ($matrix->isa('Bio::Matrix::Scoring')) {
545 my $row = join("", $matrix->row_names);
546 my $col = join("", $matrix->column_names);
547 if ($row ne $self->{'states'}) {
548 $self->throw("Names of the rows ($row) is different from the states of HMM " . $self->{'states'});
550 if ($col ne $self->{'symbols'}) {
551 $self->throw("Names of the columns ($col) is different from the symbols of HMM " . $self->{'symbols'});
553 for ($i = 0; $i < length($self->{'states'}); ++$i) {
555 my $a = substr($self->{'states'}, $i, 1);
556 for ($j = 0; $j < length($self->{'symbols'}); ++$j) {
557 my $b = substr($self->{'symbols'}, $j, 1);
558 $sum += $matrix->entry($a, $b);
561 $self->throw("Sum of probabilities for each state must be 1.0!\n");
564 for ($i = 0; $i < length($self->{'states'}); ++$i) {
565 my $a = substr($self->{'states'}, $i, 1);
566 for ($j = 0; $j < length($self->{'symbols'}); ++$j) {
567 my $b = substr($self->{'symbols'}, $j, 1);
568 Bio
::Ext
::HMM
::HMM
->set_e_entry($self->{'hmm'}, $a, $b, $matrix->entry($a, $b));
573 $self->throw("Emission Probability matrix must be of type Bio::Matrix::Scoring.\n");
577 for ($i = 0; $i < length($self->{'states'}); ++$i) {
578 for ($j = 0; $j < length($self->{'symbols'}); ++$j) {
579 $A[$i][$j] = Bio
::Ext
::HMM
::HMM
->get_e_entry($self->{'hmm'}, substr($self->{'states'}, $i, 1), substr($self->{'symbols'}, $j, 1));
582 my @rows = split(//, $self->{'states'});
583 my @cols = split(//, $self->{'symbols'});
584 return $matrix = Bio
::Matrix
::Scoring
->new(-values => \
@A, -rownames
=> \
@rows, -colnames
=> \
@cols);