changes all issue tracking in preparation for switch to github issues
[bioperl-live.git] / Bio / Matrix / PSM / ProtMatrix.pm
blob337d76c9bc118629c76904bf30d91075a097c782
1 #---------------------------------------------------------
3 =head1 NAME
5 Bio::Matrix::PSM::ProtMatrix - SiteMatrixI implementation, holds a
6 position scoring matrix (or position weight matrix) with log-odds scoring
7 information.
9 =head1 SYNOPSIS
11 use Bio::Matrix::PSM::ProtMatrix;
12 # Create from memory by supplying probability matrix hash both as strings or
13 # arrays where the frequencies Hash entries of the form lN refer to an array
14 # of position-specific log-odds scores for amino acid N. Hash entries of the
15 # form pN represent the position-specific probability of finding amino acid N.
17 my %param = (
18 'id' => 'A. thaliana protein atp1',
19 '-e_val' => $score,
20 'lS' => [ '-2', '3', '-3', '2', '-3', '1', '1', '3' ],
21 'lF' => [ '-1', '-4', '0', '-5', '0', '-5', '-4', '-4' ],
22 'lT' => [ '-1', '1', '0', '1', '-2', '-1', '0', '1' ],
23 'lN' => [ '-3', '-1', '-2', '3', '-5', '5', '-2', '0' ],
24 'lK' => [ '-2', '0', '-3', '2', '-3', '2', '-3', '-1' ],
25 'lY' => [ '-2', '-3', '-3', '-4', '-3', '-4', '-4', '-4' ],
26 'lE' => [ '-3', '4', '-3', '2', '-4', '-2', '-3', '2' ],
27 'lV' => [ '0', '-2', '1', '-4', '1', '-4', '-1', '-3' ],
28 'lQ' => [ '-1', '0', '-2', '3', '-4', '1', '-3', '0' ],
29 'lM' => [ '8', '-3', '8', '-3', '1', '-3', '-3', '-3' ],
30 'lC' => [ '-2', '-3', '-3', '-4', '-3', '-4', '-3', '-3' ],
31 'lL' => [ '1', '-3', '1', '-4', '3', '-4', '-2', '-4' ],
32 'lA' => [ '-2', '1', '-2', '0', '-2', '-2', '2', '2' ],
33 'lW' => [ '-2', '-4', '-3', '-5', '-4', '-5', '-5', '-5' ],
34 'lP' => [ '-3', '-2', '-4', '-3', '-1', '-3', '6', '-3' ],
35 'lH' => [ '-2', '-2', '-3', '-2', '-5', '-2', '-2', '-3' ],
36 'lD' => [ '-4', '-1', '-3', '1', '-3', '-1', '-3', '4' ],
37 'lR' => [ '-2', '-1', '-3', '0', '-4', '4', '-4', '-3' ],
38 'lI' => [ '0', '-3', '0', '-4', '6', '-4', '-2', '-2' ],
39 'lG' => [ '-4', '-2', '-4', '-2', '-5', '-3', '-1', '-2' ],
40 'pS' => [ '0', '33', '0', '16', '1', '12', '11', '25' ],
41 'pF' => [ '0', '0', '2', '0', '3', '0', '0', '0' ],
42 'pT' => [ '0', '8', '7', '10', '1', '2', '7', '8' ],
43 'pN' => [ '0', '0', '2', '13', '0', '36', '1', '4' ],
44 'pK' => [ '0', '5', '0', '13', '1', '15', '0', '2' ],
45 'pY' => [ '0', '0', '0', '0', '0', '0', '0', '0' ],
46 'pE' => [ '0', '41', '1', '12', '0', '0', '0', '15' ],
47 'pV' => [ '0', '3', '9', '0', '2', '0', '3', '1' ],
48 'pQ' => [ '0', '0', '0', '15', '0', '4', '0', '3' ],
49 'pM' => [ '100', '0', '66', '0', '2', '0', '0', '0' ],
50 'pC' => [ '0', '0', '0', '0', '0', '0', '0', '0' ],
51 'pL' => [ '0', '0', '8', '0', '25', '0', '4', '0' ],
52 'pA' => [ '0', '10', '1', '9', '2', '0', '22', '16' ],
53 'pW' => [ '0', '0', '0', '0', '0', '0', '0', '0' ],
54 'pP' => [ '0', '0', '0', '0', '3', '1', '45', '0' ],
55 'pH' => [ '0', '0', '0', '0', '0', '0', '1', '0' ],
56 'pD' => [ '0', '0', '1', '7', '2', '2', '0', '22' ],
57 'pR' => [ '0', '0', '0', '3', '0', '27', '0', '0' ],
58 'pI' => [ '0', '0', '3', '0', '59', '1', '2', '3' ],
59 'pG' => [ '0', '0', '0', '1', '0', '0', '4', '1' ],
62 my $matrix = Bio::Matrix::PSM::ProtMatrix( %param );
65 my $site = Bio::Matrix::PSM::ProtMatrix->new(%param);
66 # Or get it from a file:
67 use Bio::Matrix::PSM::IO;
68 my $psmIO = Bio::Matrix::PSM::IO->new(-file => $file, -format => 'psi-blast');
69 while (my $psm = $psmIO->next_psm) {
70 #Now we have a Bio::Matrix::PSM::Psm object,
71 # see Bio::Matrix::PSM::PsmI for details
72 #This is a Bio::Matrix::PSM::ProtMatrix object now
73 my $matrix = $psm->matrix;
76 # Get a simple consensus, where alphabet is:
77 # {A, R, N, D, C, Q, E, G, H, I, L, K, M, F, P, S, T, W, Y, V,}
78 # choosing the highest probability or N if prob is too low
79 my $consensus = $site->consensus;
81 # Retrieving and using regular expressions:
82 my $regexp = $site->regexp;
83 my $count = grep($regexp,$seq);
84 my $count = ($seq=~ s/$regexp/$1/eg);
85 print "Motif $mid is present $count times in this sequence\n";
87 =head1 DESCRIPTION
89 ProtMatrix is designed to provide some basic methods when working with
90 position scoring (weight) matrices related to protein sequences. A
91 protein PSM consists of 20 vectors with 20 frequencies (one per amino
92 acid per position). This is the minimum information you should
93 provide to construct a PSM object. The vectors can be provided as
94 strings with frequencies where the frequency is {0..a} and a=1. This
95 is the way MEME compressed representation of a matrix and it is quite
96 useful when working with relational DB. If arrays are provided as an
97 input (references to arrays actually) they can be any number, real or
98 integer (frequency or count).
100 When creating the object the constructor will check for positions that
101 equal 0. If such is found it will increase the count for all
102 positions by one and recalculate the frequency. Potential bug - if
103 you are using frequencies and one of the positions is 0 it will change
104 significantly. However, you should never have frequency that equals
107 Throws an exception if: You mix as an input array and string (for
108 example A matrix is given as array, C - as string). The position
109 vector is (0,0,0,0). One of the probability vectors is shorter than
110 the rest.
112 Summary of the methods I use most frequently (details bellow):
114 iupac - return IUPAC compliant consensus as a string
115 score - Returns the score as a real number
116 IC - information content. Returns a real number
117 id - identifier. Returns a string
118 accession - accession number. Returns a string
119 next_pos - return the sequence probably for each letter, IUPAC
120 symbol, IUPAC probability and simple sequence
121 consenus letter for this position. Rewind at the end. Returns a hash.
122 pos - current position get/set. Returns an integer.
123 regexp - construct a regular expression based on IUPAC consensus.
124 For example AGWV will be [Aa][Gg][AaTt][AaCcGg]
125 width - site width
126 get_string - gets the probability vector for a single base as a string.
127 get_array - gets the probability vector for a single base as an array.
128 get_logs_array - gets the log-odds vector for a single base as an array.
130 New methods, which might be of interest to anyone who wants to store
131 PSM in a relational database without creating an entry for each
132 position is the ability to compress the PSM vector into a string with
133 losing usually less than 1% of the data. this can be done with:
135 my $str=$matrix->get_compressed_freq('A');
138 my $str=$matrix->get_compressed_logs('A');
140 Loading from a database should be done with new, but is not yet implemented.
141 However you can still uncompress such string with:
143 my @arr=Bio::Matrix::PSM::_uncompress_string ($str,1,1); for PSM
147 my @arr=Bio::Matrix::PSM::_uncompress_string ($str,1000,2); for log odds
149 =head1 FEEDBACK
151 =head2 Mailing Lists
153 User feedback is an integral part of the evolution of this and other
154 Bioperl modules. Send your comments and suggestions preferably to one
155 of the Bioperl mailing lists. Your participation is much appreciated.
157 bioperl-l@bioperl.org - General discussion
158 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
160 =head2 Support
162 Please direct usage questions or support issues to the mailing list:
164 I<bioperl-l@bioperl.org>
166 rather than to the module maintainer directly. Many experienced and
167 reponsive experts will be able look at the problem and quickly
168 address it. Please include a thorough description of the problem
169 with code and data examples if at all possible.
171 =head2 Reporting Bugs
173 Report bugs to the Bioperl bug tracking system to help us keep track
174 the bugs and their resolution. Bug reports can be submitted via the
175 web:
177 https://github.com/bioperl/bioperl-live/issues
179 =head1 AUTHOR - James Thompson
181 Email tex@biosysadmin.com
183 =head1 APPENDIX
185 =cut
188 # Let the code begin...
189 package Bio::Matrix::PSM::ProtMatrix;
190 use strict;
192 use base qw(Bio::Root::Root Bio::Matrix::PSM::SiteMatrixI);
194 =head2 new
196 Title : new
197 Usage : my $site = Bio::Matrix::PSM::ProtMatrix->new(
198 %probs,
199 %logs,
200 -IC => $ic,
201 -e_val => $score,
202 -id => $mid
203 -model => \%model
205 Function : Creates a new Bio::Matrix::PSM::ProtMatrix object from memory
206 Throws : If inconsistent data for all vectors (all 20 amino acids) is
207 provided, if you mix input types (string vs array) or if a
208 position freq is 0.
209 Example :
210 Returns : Bio::Matrix::PSM::ProtMatrix object
211 Args : Hash references to log-odds scores and probabilities for
212 position-specific scoring info, e-value (optional), information
213 content (optional), id (optional), model for background distribution
214 of proteins (optional).
216 =cut
218 sub new {
219 my ($class, @args) = @_;
220 my $self = $class->SUPER::new(@args);
221 my $consensus;
222 #Too many things to rearrange, and I am creating simultanuously >500
223 # such objects routinely, so this becomes performance issue
224 my %input;
225 while( @args ) {
226 (my $key = shift @args) =~ s/-//gi; #deletes all dashes (only dashes)!
227 $input{$key} = shift @args;
230 # get a protein alphabet for processing log-odds scores and probabilities
231 # maybe change this later on to allow for non-standard aa lists?
232 my @alphabet = qw/A R N D C Q E G H I L K M F P S T W Y V/;
234 foreach my $aa (@alphabet) {
235 $self->{"log$aa"} = defined($input{"l$aa"}) ? $input{"l$aa"}
236 : $self->throw("Error: No log-odds information for $aa!");
237 $self->{"prob$aa"} = defined($input{"p$aa"}) ? $input{"p$aa"}
238 : $self->throw("Error: No probability information for $aa!");
241 $self->{_position} = 0;
242 $self->{IC} = $input{IC};
243 $self->{e_val} = $input{e_val};
244 $self->{sites} = $input{sites};
245 $self->{width} = $input{width};
246 $self->{accession_number} = $input{accession_number};
247 $self->{_correction} = defined($input{correction}) ?
248 $input{correction} : 1 ; # Correction might be unwanted- supply your own
249 # No id provided, null for the sake of rel db
250 $self->{id} = defined($input{id}) ? $input{id} : 'null';
251 $self->{_alphabet} = \@alphabet;
253 #Make consensus, throw if any one of the vectors is shorter
254 $self = _calculate_consensus($self,$input{model});
255 return $self;
258 =head2 alphabet
260 Title : Returns an array (or array reference if desired) to the alphabet
261 Usage :
262 Function : Returns an array (or array reference) containing all of the
263 allowable characters for this matrix.
264 Throws :
265 Example :
266 Returns : Array or arrary reference.
267 Args :
269 =cut
271 sub alphabet {
272 my $self = shift;
273 if ( wantarray ) {
274 return $self->{_alphabet};
275 } else {
276 return @{$self->{_alphabet}};
280 =head2 _calculate_consensus
282 Title : _calculate_consensus
283 Usage :
284 Function : Calculates the consensus sequence for this matrix.
285 Throws :
286 Example :
287 Returns :
288 Args :
290 =cut
292 sub _calculate_consensus {
293 my $self = shift;
294 my $thresh = shift;
296 # verify that all of the array lengths in %probs are the same
297 my @lengths = map { scalar(@$_) } map {$self->{"prob$_"}} @{ $self->{_alphabet} };
298 my $len = shift @lengths;
299 for ( @lengths ) {
300 if ( $_ ne $len ) { $self->throw( "Probability matrix is damaged!\n" ) };
303 # iterate over probs, generate the most likely sequence and put it into
304 # $self->{seq}. Put the probability of this sequence into $self->{seqp}.
305 for ( my $i = 0; $i < $len; $i++ ) {
306 # get a list of all the probabilities at position $i, ordered by $self->{_alphabet}
307 my @probs = map { ${$self->{"prob$_"}}[$i] } @{ $self->{_alphabet} };
308 # calculate the consensus of @probs, put sequence into seqp and probabilities into seqp
309 (${$self->{seq}}[$i],${$self->{seqp}}[$i]) = $self->_to_cons( @probs, $thresh );
312 return $self;
315 =head2 next_pos
317 Title : next_pos
318 Usage :
319 Function : Retrives the next position features: frequencies for all 20 amino
320 acids, log-odds scores for all 20 amino acids at this position,
321 the main (consensus) letter at this position, the probability
322 for the consensus letter to occur at this position and the relative
323 current position as an integer.
324 Throws :
325 Example :
326 Returns : hash (or hash reference) (pA,pR,pN,pD,...,logA,logR,logN,logD,aa,prob,rel)
327 - pN entries represent the probability for amino acid N
328 to be at this position
329 - logN entries represent the log-odds score for having amino acid
330 N at this position
331 - aa is the consensus amino acid
332 - prob is the probability for the consensus amino acid to be at this
333 position
334 - rel is the relative index of the current position (integer)
335 Args : none
338 =cut
340 sub next_pos {
341 my $self = shift;
342 $self->throw("instance method called on class") unless ref $self;
344 my $len = @{$self->{seq}};
345 my $pos = $self->{_position};
347 # return a PSM if we're still within range
348 if ($pos<$len) {
350 my %probs = map { ("p$_", ${$self->{"prob$_"}}[$pos]) } @{$self->{_alphabet}};
351 my %logs = map { ("l$_", ${$self->{"log$_"}}[$pos]) } @{$self->{_alphabet}};
352 my $base = ${$self->{seq}}[$pos];
353 my $prob = ${$self->{seqp}}[$pos];
355 $self->{_position}++;
356 my %hash = ( %probs, %logs, base => $base, rel => $pos, prob => $prob );
358 # decide whether to return the hash or a reference to it
359 if ( wantarray ) {
360 return %hash;
361 } else {
362 return \%hash;
364 } else { # otherwise, reset $self->{_position} and return nothing
365 $self->{_position} = 0;
366 return;
371 =head2 curpos
373 Title : curpos
374 Usage :
375 Function : Gets/sets the current position.
376 Throws :
377 Example :
378 Returns : Current position (integer).
379 Args : New position (integer).
381 =cut
383 sub curpos {
384 my $self = shift;
385 if (@_) { $self->{_position} = shift; }
386 return $self->{_position};
390 =head2 e_val
392 Title : e_val
393 Usage :
394 Function : Gets/sets the e-value
395 Throws :
396 Example :
397 Returns :
398 Args : real number
400 =cut
402 sub e_val {
403 my $self = shift;
404 if (@_) { $self->{e_val} = shift; }
405 return $self->{e_val};
409 =head2 IC
411 Title : IC
412 Usage :
413 Function : Position-specific information content.
414 Throws :
415 Example :
416 Returns : Information content for current position.
417 Args : Information content for current position.
419 =cut
421 sub IC {
422 my $self = shift;
423 if (@_) { $self->{IC} = shift; }
424 return $self->{IC};
427 =head2 accession_number
429 Title : accession_number
430 Usage :
431 Function: accession number, this will be unique id for the ProtMatrix object as
432 well for any other object, inheriting from ProtMatrix.
433 Throws :
434 Example :
435 Returns : New accession number (string)
436 Args : Accession number (string)
438 =cut
440 sub accession_number {
441 my $self = shift;
442 if (@_) { $self->{accession_number} = shift; }
443 return $self->{accession_number};
446 =head2 consensus
448 Title : consensus
449 Usage :
450 Function : Returns the consensus sequence for this PSM.
451 Throws : if supplied with thresold outisde 5..10 range
452 Example :
453 Returns : string
454 Args : (optional) threshold value 5 to 10 (corresponds to 50-100% at each position
456 =cut
458 sub consensus {
459 my $self = shift;
460 my $thresh=shift;
461 $self->_calculate_consensus($thresh) if ($thresh); #Change of threshold
462 my $consensus='';
464 foreach my $letter (@{$self->{seq}}) {
465 $consensus .= $letter;
468 return $consensus;
471 sub IUPAC {
472 my $self = shift;
473 return $self->consensus;
477 =head2 get_string
479 Title : get_string
480 Usage :
481 Function: Returns given probability vector as a string. Useful if you want to
482 store things in a rel database, where arrays are not first choice
483 Throws : If the argument is outside {A,C,G,T}
484 Example :
485 Returns : string
486 Args : character {A,C,G,T}
488 =cut
490 sub get_string {
491 my $self = shift;
492 my $base = shift;
493 my $string = '';
495 my @prob = @{$self->{"prob$base"}};
496 if ( ! @prob ) {
497 $self->throw( "No such base: $base\n");
500 foreach my $prob (@prob) {
501 my $corrected = $prob*10;
502 my $next = sprintf("%.0f",$corrected);
503 $next = 'a' if ($next eq '10');
504 $string .= $next;
506 return $string;
511 =head2 width
513 Title : width
514 Usage :
515 Function : Returns the length of the site
516 Throws :
517 Example :
518 Returns : number
519 Args :
521 =cut
523 sub width {
524 my $self = shift;
525 my $width = @{$self->{probA}};
526 return $width;
529 =head2 get_array
531 Title : get_array
532 Usage :
533 Function : Returns an array with frequencies for a specified amino acid.
534 Throws :
535 Example :
536 Returns : Array representing frequencies for specified amino acid.
537 Args : Single amino acid (character).
539 =cut
541 sub get_array {
542 my $self = shift;
543 my $letter = uc(shift);
545 $self->throw ("No such base: $letter!\n") unless grep { /$letter/ } @{$self->{_alphabet}};
547 return @{$self->{"prob$letter"}};
551 =head2 get_logs_array
553 Title : get_logs_array
554 Usage :
555 Function : Returns an array with log_odds for a specified base
556 Throws :
557 Example :
558 Returns : Array representing log-odds scores for specified amino acid.
559 Args : Single amino acid (character).
561 =cut
563 sub get_logs_array {
564 my $self = shift;
565 my $letter = uc(shift);
567 $self->throw ("No such base: $letter!\n") unless grep { /$letter/ } @{$self->{_alphabet}};
569 return @{$self->{"log$letter"}};
572 =head2 id
574 Title : id
575 Usage :
576 Function : Gets/sets the site id
577 Throws :
578 Example :
579 Returns : string
580 Args : string
582 =cut
584 sub id {
585 my $self = shift;
586 if (@_) { $self->{id} = shift; }
587 return $self->{id};
590 =head2 regexp
592 Title : regexp
593 Usage :
594 Function : Returns a case-insensitive regular expression which matches the
595 IUPAC convention. X's in consensus sequence will match anything.
596 Throws :
597 Example :
598 Returns : string
599 Args : Threshold for calculating consensus sequence (number in range 0-100
600 representing a percentage). Threshold defaults to 20.
602 =cut
604 sub regexp {
605 my $self = shift;
606 my $threshold = 20;
607 if ( @_ ) { my $threshold = shift };
609 my @alphabet = @{$self->{_alphabet}};
610 my $width = $self->width;
611 my (@regexp, $i);
612 for ( $i = 0; $i < $width; $i++ ) {
613 # get an array of the residues at this position with p > $threshold
614 my @letters = map { uc($_).lc($_) } grep { $self->{"prob$_"}->[$i] >= $threshold } @alphabet;
616 my $reg;
617 if ( scalar(@letters) == 0 ) {
618 $reg = '\.';
619 } else {
620 $reg = '['.join('',@letters).']';
622 push @regexp, $reg;
625 if ( wantarray ) {
626 return @regexp;
627 } else {
628 return join '', @regexp;
633 =head2 regexp_array
635 Title : regexp_array
636 Usage :
637 Function : Returns an array of position-specific regular expressions.
638 X's in consensus sequence will match anything.
639 Throws :
640 Example :
641 Returns : Array of position-specific regular expressions.
642 Args : Threshold for calculating consensus sequence (number in range 0-100
643 representing a percentage). Threshold defaults to 20.
644 Notes : Simply calls regexp method in list context.
646 =cut
648 sub regexp_array {
649 my $self = shift;
651 return @{ $self->regexp };
655 =head2 _compress_array
657 Title : _compress_array
658 Usage :
659 Function : Will compress an array of real signed numbers to a string (ie vector of bytes)
660 -127 to +127 for bi-directional(signed) and 0..255 for unsigned ;
661 Throws :
662 Example : Internal stuff
663 Returns : String
664 Args : array reference, followed by max value and direction (optional, defaults to 1),
665 direction of 1 is unsigned, anything else is signed.
667 =cut
669 sub _compress_array {
670 my ($array,$lm,$direct)=@_;
671 my $str;
672 return unless(($array) && ($lm));
673 $direct=1 unless ($direct);
674 my $k1= ($direct==1) ? (255/$lm) : (127/$lm);
675 foreach my $c (@{$array}) {
676 $c=$lm if ($c>$lm);
677 $c=-$lm if (($c<-$lm) && ($direct !=1));
678 $c=0 if (($c<0) && ($direct ==1));
679 my $byte=int($k1*$c);
680 $byte=127+$byte if ($direct !=1);#Clumsy, should be really shift the bits
681 my $char=chr($byte);
682 $str.=$char;
684 return $str;
687 =head2 _uncompress_string
689 Title : _uncompress_string
690 Usage :
691 Function : Will uncompress a string (vector of bytes) to create an array of real
692 signed numbers (opposite to_compress_array)
693 Throws :
694 Example : Internal stuff
695 Returns : string, followed by max value and direction (optional, defaults to 1),
696 direction of 1 is unsigned, anything else is signed.
697 Args : array
699 =cut
701 sub _uncompress_string {
702 my ($str,$lm,$direct)=@_;
703 my @array;
704 return unless(($str) && ($lm));
705 $direct=1 unless ($direct);
706 my $k1= ($direct==1) ? (255/$lm) : (127/$lm);
707 while (my $c=chop($str)) {
708 my $byte=ord($c);
709 $byte=$byte-127 if ($direct !=1);#Clumsy, should be really shift the bits
710 my $num=$byte/$k1;
711 unshift @array,$num;
714 return @array;
717 =head2 get_compressed_freq
719 Title : get_compressed_freq
720 Usage :
721 Function: A method to provide a compressed frequency vector. It uses one byte to
722 code the frequence for one of the probability vectors for one position.
723 Useful for relational database. Improvment of the previous 0..a coding.
724 Throws :
725 Example : my $strA=$self->get_compressed_freq('A');
726 Returns : String
727 Args : char
729 =cut
731 sub get_compressed_freq {
732 my $self=shift;
733 my $base=shift;
734 my $string='';
735 my @prob;
736 BASE: {
737 if ($base eq 'A') {
738 @prob = @{$self->{probA}} unless (!defined($self->{probA}));
739 last BASE;
741 if ($base eq 'G') {
742 @prob = @{$self->{probG}} unless (!defined($self->{probG}));
743 last BASE;
745 if ($base eq 'C') {
746 @prob = @{$self->{probC}} unless (!defined($self->{probC}));
747 last BASE;
749 if ($base eq 'T') {
750 @prob = @{$self->{probT}} unless (!defined($self->{probT}));
751 last BASE;
753 $self->throw ("No such base: $base!\n");
755 my $str= _compress_array(\@prob,1,1);
756 return $str;
759 =head2 sequence_match_weight
761 Title : sequence_match_weight
762 Usage :
763 Function : This method will calculate the score of a match, based on the PSM
764 if such is associated with the matrix object. Returns undef if no
765 PSM data is available.
766 Throws : if the length of the sequence is different from the matrix width
767 Example : my $score=$matrix->sequence_match_weight('ACGGATAG');
768 Returns : Floating point
769 Args : string
771 =cut
773 sub sequence_match_weight {
774 my ($self,$seq)=@_;
775 return unless ($self->{logA});
777 my $seqlen = length($seq);
778 my $width = $self->width;
779 $self->throw("Error: Input sequence size ($seqlen) not equal to PSM size ($width)!\n")
780 unless (length($seq) == $self->width);
782 my ($score,$i) = (0,0);
783 foreach my $letter ( split //, $seq ) {
784 # add up the score for this position
785 $score += $self->{"log$letter"}->[$i];
786 $i++;
788 return $score;
792 =head2 _to_IUPAC
794 Title : _to_IUPAC
795 Usage :
796 Function: Converts a single position to IUPAC compliant symbol and returns its probability.
797 Currently returns the most likely amino acid/probability combination.
798 Throws :
799 Example :
800 Returns : char, real number representing an amino acid and a probability.
801 Args : real numbers for all 20 amino acids (ordered by alphabet contained
802 in $self->{_alphabet}, minimum probability threshold.
804 =cut
806 sub _to_IUPAC {
807 my ($self,@probs,$thresh) = @_;
809 # provide a default threshold of 5, corresponds to 5% threshold for
810 # inferring that the aa at any position is the true aa
811 $thresh = 5 unless ( defined $thresh );
813 my ($IUPAC_aa,$max_prob) = ('X',$thresh);
814 for my $aa ( @{$self->{_alphabet}} ) {
815 my $prob = shift @probs;
816 if ( $prob > $max_prob ) {
817 $IUPAC_aa = $aa;
818 $max_prob = $prob;
822 return $IUPAC_aa, $max_prob;
825 =head2 _to_cons
827 Title : _to_cons
828 Usage :
829 Function: Converts a single position to simple consensus character and returns
830 its probability. Currently just calls the _to_IUPAC subroutine.
831 Throws :
832 Example :
833 Returns : char, real number
834 Args : real numbers for A,C,G,T (positional)
836 =cut
838 sub _to_cons {
839 return _to_IUPAC( @_ );
842 =head2 get_all_vectors
844 Title : get_all_vectors
845 Usage :
846 Function : returns all possible sequence vectors to satisfy the PFM under
847 a given threshold
848 Throws : If threshold outside of 0..1 (no sense to do that)
849 Example : my @vectors = $self->get_all_vectors(4);
850 Returns : Array of strings
851 Args : (optional) floating
853 =cut
855 #sub get_all_vectors {
856 # my $self = shift;
857 # my $thresh = shift;
859 # $self->throw("Out of range. Threshold should be >0 and 1<.\n") if (($thresh<0) || ($thresh>1));
861 # my @seq = split(//,$self->consensus($thresh*10));
862 # my @perm;
863 # for my $i (0..@{$self->{probA}}) {
864 # push @{$perm[$i]},'A' if ($self->{probA}->[$i]>$thresh);
865 # push @{$perm[$i]},'C' if ($self->{probC}->[$i]>$thresh);
866 # push @{$perm[$i]},'G' if ($self->{probG}->[$i]>$thresh);
867 # push @{$perm[$i]},'T' if ($self->{probT}->[$i]>$thresh);
868 # push @{$perm[$i]},'N' if ($seq[$i] eq 'N');
870 # my $fpos=shift @perm;
871 # my @strings=@$fpos;
872 # foreach my $pos (@perm) {
873 # my @newstr;
874 # foreach my $let (@$pos) {
875 # foreach my $string (@strings) {
876 # my $newstring = $string . $let;
877 # push @newstr,$newstring;
880 # @strings=@newstr;
882 # return @strings;