sync w/ main trunk
[bioperl-live.git] / Bio / Matrix / PSM / ProtMatrix.pm
blob968ec2d96c81f19a45fbdf2026f82d88c2ee2d3a
1 # $Id$
2 #---------------------------------------------------------
4 =head1 NAME
6 Bio::Matrix::PSM::ProtMatrix - SiteMatrixI implementation, holds a
7 position scoring matrix (or position weight matrix) with log-odds scoring
8 information.
10 =head1 SYNOPSIS
12 use Bio::Matrix::PSM::ProtMatrix;
13 # Create from memory by supplying probability matrix hash both as strings or
14 # arrays where the frequencies Hash entries of the form lN refer to an array
15 # of position-specific log-odds scores for amino acid N. Hash entries of the
16 # form pN represent the position-specific probability of finding amino acid N.
18 my %param = (
19 'id' => 'A. thaliana protein atp1',
20 '-e_val' => $score,
21 'lS' => [ '-2', '3', '-3', '2', '-3', '1', '1', '3' ],
22 'lF' => [ '-1', '-4', '0', '-5', '0', '-5', '-4', '-4' ],
23 'lT' => [ '-1', '1', '0', '1', '-2', '-1', '0', '1' ],
24 'lN' => [ '-3', '-1', '-2', '3', '-5', '5', '-2', '0' ],
25 'lK' => [ '-2', '0', '-3', '2', '-3', '2', '-3', '-1' ],
26 'lY' => [ '-2', '-3', '-3', '-4', '-3', '-4', '-4', '-4' ],
27 'lE' => [ '-3', '4', '-3', '2', '-4', '-2', '-3', '2' ],
28 'lV' => [ '0', '-2', '1', '-4', '1', '-4', '-1', '-3' ],
29 'lQ' => [ '-1', '0', '-2', '3', '-4', '1', '-3', '0' ],
30 'lM' => [ '8', '-3', '8', '-3', '1', '-3', '-3', '-3' ],
31 'lC' => [ '-2', '-3', '-3', '-4', '-3', '-4', '-3', '-3' ],
32 'lL' => [ '1', '-3', '1', '-4', '3', '-4', '-2', '-4' ],
33 'lA' => [ '-2', '1', '-2', '0', '-2', '-2', '2', '2' ],
34 'lW' => [ '-2', '-4', '-3', '-5', '-4', '-5', '-5', '-5' ],
35 'lP' => [ '-3', '-2', '-4', '-3', '-1', '-3', '6', '-3' ],
36 'lH' => [ '-2', '-2', '-3', '-2', '-5', '-2', '-2', '-3' ],
37 'lD' => [ '-4', '-1', '-3', '1', '-3', '-1', '-3', '4' ],
38 'lR' => [ '-2', '-1', '-3', '0', '-4', '4', '-4', '-3' ],
39 'lI' => [ '0', '-3', '0', '-4', '6', '-4', '-2', '-2' ],
40 'lG' => [ '-4', '-2', '-4', '-2', '-5', '-3', '-1', '-2' ],
41 'pS' => [ '0', '33', '0', '16', '1', '12', '11', '25' ],
42 'pF' => [ '0', '0', '2', '0', '3', '0', '0', '0' ],
43 'pT' => [ '0', '8', '7', '10', '1', '2', '7', '8' ],
44 'pN' => [ '0', '0', '2', '13', '0', '36', '1', '4' ],
45 'pK' => [ '0', '5', '0', '13', '1', '15', '0', '2' ],
46 'pY' => [ '0', '0', '0', '0', '0', '0', '0', '0' ],
47 'pE' => [ '0', '41', '1', '12', '0', '0', '0', '15' ],
48 'pV' => [ '0', '3', '9', '0', '2', '0', '3', '1' ],
49 'pQ' => [ '0', '0', '0', '15', '0', '4', '0', '3' ],
50 'pM' => [ '100', '0', '66', '0', '2', '0', '0', '0' ],
51 'pC' => [ '0', '0', '0', '0', '0', '0', '0', '0' ],
52 'pL' => [ '0', '0', '8', '0', '25', '0', '4', '0' ],
53 'pA' => [ '0', '10', '1', '9', '2', '0', '22', '16' ],
54 'pW' => [ '0', '0', '0', '0', '0', '0', '0', '0' ],
55 'pP' => [ '0', '0', '0', '0', '3', '1', '45', '0' ],
56 'pH' => [ '0', '0', '0', '0', '0', '0', '1', '0' ],
57 'pD' => [ '0', '0', '1', '7', '2', '2', '0', '22' ],
58 'pR' => [ '0', '0', '0', '3', '0', '27', '0', '0' ],
59 'pI' => [ '0', '0', '3', '0', '59', '1', '2', '3' ],
60 'pG' => [ '0', '0', '0', '1', '0', '0', '4', '1' ],
63 my $matrix = Bio::Matrix::PSM::ProtMatrix( %param );
66 my $site = Bio::Matrix::PSM::ProtMatrix->new(%param);
67 # Or get it from a file:
68 use Bio::Matrix::PSM::IO;
69 my $psmIO = Bio::Matrix::PSM::IO->new(-file => $file, -format => 'psi-blast');
70 while (my $psm = $psmIO->next_psm) {
71 #Now we have a Bio::Matrix::PSM::Psm object,
72 # see Bio::Matrix::PSM::PsmI for details
73 #This is a Bio::Matrix::PSM::ProtMatrix object now
74 my $matrix = $psm->matrix;
77 # Get a simple consensus, where alphabet is:
78 # {A, R, N, D, C, Q, E, G, H, I, L, K, M, F, P, S, T, W, Y, V,}
79 # choosing the highest probability or N if prob is too low
80 my $consensus = $site->consensus;
82 # Retrieving and using regular expressions:
83 my $regexp = $site->regexp;
84 my $count = grep($regexp,$seq);
85 my $count = ($seq=~ s/$regexp/$1/eg);
86 print "Motif $mid is present $count times in this sequence\n";
88 =head1 DESCRIPTION
90 ProtMatrix is designed to provide some basic methods when working with
91 position scoring (weight) matrices related to protein sequences. A
92 protein PSM consists of 20 vectors with 20 frequencies (one per amino
93 acid per position). This is the minimum information you should
94 provide to construct a PSM object. The vectors can be provided as
95 strings with frequencies where the frequency is {0..a} and a=1. This
96 is the way MEME compressed representation of a matrix and it is quite
97 useful when working with relational DB. If arrays are provided as an
98 input (references to arrays actually) they can be any number, real or
99 integer (frequency or count).
101 When creating the object the constructor will check for positions that
102 equal 0. If such is found it will increase the count for all
103 positions by one and recalculate the frequency. Potential bug - if
104 you are using frequencies and one of the positions is 0 it will change
105 significantly. However, you should never have frequency that equals
108 Throws an exception if: You mix as an input array and string (for
109 example A matrix is given as array, C - as string). The position
110 vector is (0,0,0,0). One of the probability vectors is shorter than
111 the rest.
113 Summary of the methods I use most frequently (details bellow):
115 iupac - return IUPAC compliant consensus as a string
116 score - Returns the score as a real number
117 IC - information content. Returns a real number
118 id - identifier. Returns a string
119 accession - accession number. Returns a string
120 next_pos - return the sequence probably for each letter, IUPAC
121 symbol, IUPAC probability and simple sequence
122 consenus letter for this position. Rewind at the end. Returns a hash.
123 pos - current position get/set. Returns an integer.
124 regexp - construct a regular expression based on IUPAC consensus.
125 For example AGWV will be [Aa][Gg][AaTt][AaCcGg]
126 width - site width
127 get_string - gets the probability vector for a single base as a string.
128 get_array - gets the probability vector for a single base as an array.
129 get_logs_array - gets the log-odds vector for a single base as an array.
131 New methods, which might be of interest to anyone who wants to store
132 PSM in a relational database without creating an entry for each
133 position is the ability to compress the PSM vector into a string with
134 losing usually less than 1% of the data. this can be done with:
136 my $str=$matrix->get_compressed_freq('A');
139 my $str=$matrix->get_compressed_logs('A');
141 Loading from a database should be done with new, but is not yet implemented.
142 However you can still uncompress such string with:
144 my @arr=Bio::Matrix::PSM::_uncompress_string ($str,1,1); for PSM
148 my @arr=Bio::Matrix::PSM::_uncompress_string ($str,1000,2); for log odds
150 =head1 FEEDBACK
152 =head2 Mailing Lists
154 User feedback is an integral part of the evolution of this and other
155 Bioperl modules. Send your comments and suggestions preferably to one
156 of the Bioperl mailing lists. Your participation is much appreciated.
158 bioperl-l@bioperl.org - General discussion
159 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
161 =head2 Support
163 Please direct usage questions or support issues to the mailing list:
165 L<bioperl-l@bioperl.org>
167 rather than to the module maintainer directly. Many experienced and
168 reponsive experts will be able look at the problem and quickly
169 address it. Please include a thorough description of the problem
170 with code and data examples if at all possible.
172 =head2 Reporting Bugs
174 Report bugs to the Bioperl bug tracking system to help us keep track
175 the bugs and their resolution. Bug reports can be submitted via the
176 web:
178 http://bugzilla.open-bio.org/
180 =head1 AUTHOR - James Thompson
182 Email tex@biosysadmin.com
184 =head1 APPENDIX
186 =cut
189 # Let the code begin...
190 package Bio::Matrix::PSM::ProtMatrix;
191 use strict;
193 use base qw(Bio::Root::Root Bio::Matrix::PSM::SiteMatrixI);
195 =head2 new
197 Title : new
198 Usage : my $site = Bio::Matrix::PSM::ProtMatrix->new(
199 %probs,
200 %logs,
201 -IC => $ic,
202 -e_val => $score,
203 -id => $mid
204 -model => \%model
206 Function : Creates a new Bio::Matrix::PSM::ProtMatrix object from memory
207 Throws : If inconsistent data for all vectors (all 20 amino acids) is
208 provided, if you mix input types (string vs array) or if a
209 position freq is 0.
210 Example :
211 Returns : Bio::Matrix::PSM::ProtMatrix object
212 Args : Hash references to log-odds scores and probabilities for
213 position-specific scoring info, e-value (optional), information
214 content (optional), id (optional), model for background distribution
215 of proteins (optional).
217 =cut
219 sub new {
220 my ($class, @args) = @_;
221 my $self = $class->SUPER::new(@args);
222 my $consensus;
223 #Too many things to rearrange, and I am creating simultanuously >500
224 # such objects routinely, so this becomes performance issue
225 my %input;
226 while( @args ) {
227 (my $key = shift @args) =~ s/-//gi; #deletes all dashes (only dashes)!
228 $input{$key} = shift @args;
231 # get a protein alphabet for processing log-odds scores and probabilities
232 # maybe change this later on to allow for non-standard aa lists?
233 my @alphabet = qw/A R N D C Q E G H I L K M F P S T W Y V/;
235 foreach my $aa (@alphabet) {
236 $self->{"log$aa"} = defined($input{"l$aa"}) ? $input{"l$aa"}
237 : $self->throw("Error: No log-odds information for $aa!");
238 $self->{"prob$aa"} = defined($input{"p$aa"}) ? $input{"p$aa"}
239 : $self->throw("Error: No probability information for $aa!");
242 $self->{_position} = 0;
243 $self->{IC} = $input{IC};
244 $self->{e_val} = $input{e_val};
245 $self->{sites} = $input{sites};
246 $self->{width} = $input{width};
247 $self->{accession_number} = $input{accession_number};
248 $self->{_correction} = defined($input{correction}) ?
249 $input{correction} : 1 ; # Correction might be unwanted- supply your own
250 # No id provided, null for the sake of rel db
251 $self->{id} = defined($input{id}) ? $input{id} : 'null';
252 $self->{_alphabet} = \@alphabet;
254 #Make consensus, throw if any one of the vectors is shorter
255 $self = _calculate_consensus($self,$input{model});
256 return $self;
259 =head2 alphabet
261 Title : Returns an array (or array reference if desired) to the alphabet
262 Usage :
263 Function : Returns an array (or array reference) containing all of the
264 allowable characters for this matrix.
265 Throws :
266 Example :
267 Returns : Array or arrary reference.
268 Args :
270 =cut
272 sub alphabet {
273 my $self = shift;
274 if ( wantarray ) {
275 return $self->{_alphabet};
276 } else {
277 return @{$self->{_alphabet}};
281 =head2 _calculate_consensus
283 Title : _calculate_consensus
284 Usage :
285 Function : Calculates the consensus sequence for this matrix.
286 Throws :
287 Example :
288 Returns :
289 Args :
291 =cut
293 sub _calculate_consensus {
294 my $self = shift;
295 my $thresh = shift;
297 # verify that all of the array lengths in %probs are the same
298 my @lengths = map { scalar(@$_) } map {$self->{"prob$_"}} @{ $self->{_alphabet} };
299 my $len = shift @lengths;
300 for ( @lengths ) {
301 if ( $_ ne $len ) { $self->throw( "Probability matrix is damaged!\n" ) };
304 # iterate over probs, generate the most likely sequence and put it into
305 # $self->{seq}. Put the probability of this sequence into $self->{seqp}.
306 for ( my $i = 0; $i < $len; $i++ ) {
307 # get a list of all the probabilities at position $i, ordered by $self->{_alphabet}
308 my @probs = map { ${$self->{"prob$_"}}[$i] } @{ $self->{_alphabet} };
309 # calculate the consensus of @probs, put sequence into seqp and probabilities into seqp
310 (${$self->{seq}}[$i],${$self->{seqp}}[$i]) = $self->_to_cons( @probs, $thresh );
313 return $self;
316 =head2 next_pos
318 Title : next_pos
319 Usage :
320 Function : Retrives the next position features: frequencies for all 20 amino
321 acids, log-odds scores for all 20 amino acids at this position,
322 the main (consensus) letter at this position, the probability
323 for the consensus letter to occur at this position and the relative
324 current position as an integer.
325 Throws :
326 Example :
327 Returns : hash (or hash reference) (pA,pR,pN,pD,...,logA,logR,logN,logD,aa,prob,rel)
328 - pN entries represent the probability for amino acid N
329 to be at this position
330 - logN entries represent the log-odds score for having amino acid
331 N at this position
332 - aa is the consensus amino acid
333 - prob is the probability for the consensus amino acid to be at this
334 position
335 - rel is the relative index of the current position (integer)
336 Args : none
339 =cut
341 sub next_pos {
342 my $self = shift;
343 $self->throw("instance method called on class") unless ref $self;
345 my $len = @{$self->{seq}};
346 my $pos = $self->{_position};
348 # return a PSM if we're still within range
349 if ($pos<$len) {
351 my %probs = map { ("p$_", ${$self->{"prob$_"}}[$pos]) } @{$self->{_alphabet}};
352 my %logs = map { ("l$_", ${$self->{"log$_"}}[$pos]) } @{$self->{_alphabet}};
353 my $base = ${$self->{seq}}[$pos];
354 my $prob = ${$self->{seqp}}[$pos];
356 $self->{_position}++;
357 my %hash = ( %probs, %logs, base => $base, rel => $pos, prob => $prob );
359 # decide whether to return the hash or a reference to it
360 if ( wantarray ) {
361 return %hash;
362 } else {
363 return \%hash;
365 } else { # otherwise, reset $self->{_position} and return nothing
366 $self->{_position} = 0;
367 return;
372 =head2 curpos
374 Title : curpos
375 Usage :
376 Function : Gets/sets the current position.
377 Throws :
378 Example :
379 Returns : Current position (integer).
380 Args : New position (integer).
382 =cut
384 sub curpos {
385 my $self = shift;
386 if (@_) { $self->{_position} = shift; }
387 return $self->{_position};
391 =head2 e_val
393 Title : e_val
394 Usage :
395 Function : Gets/sets the e-value
396 Throws :
397 Example :
398 Returns :
399 Args : real number
401 =cut
403 sub e_val {
404 my $self = shift;
405 if (@_) { $self->{e_val} = shift; }
406 return $self->{e_val};
410 =head2 IC
412 Title : IC
413 Usage :
414 Function : Position-specific information content.
415 Throws :
416 Example :
417 Returns : Information content for current position.
418 Args : Information content for current position.
420 =cut
422 sub IC {
423 my $self = shift;
424 if (@_) { $self->{IC} = shift; }
425 return $self->{IC};
428 =head2 accession_number
430 Title : accession_number
431 Usage :
432 Function: accession number, this will be unique id for the ProtMatrix object as
433 well for any other object, inheriting from ProtMatrix.
434 Throws :
435 Example :
436 Returns : New accession number (string)
437 Args : Accession number (string)
439 =cut
441 sub accession_number {
442 my $self = shift;
443 if (@_) { $self->{accession_number} = shift; }
444 return $self->{accession_number};
447 =head2 consensus
449 Title : consensus
450 Usage :
451 Function : Returns the consensus sequence for this PSM.
452 Throws : if supplied with thresold outisde 5..10 range
453 Example :
454 Returns : string
455 Args : (optional) threshold value 5 to 10 (corresponds to 50-100% at each position
457 =cut
459 sub consensus {
460 my $self = shift;
461 my $thresh=shift;
462 $self->_calculate_consensus($thresh) if ($thresh); #Change of threshold
463 my $consensus='';
465 foreach my $letter (@{$self->{seq}}) {
466 $consensus .= $letter;
469 return $consensus;
472 sub IUPAC {
473 my $self = shift;
474 return $self->consensus;
478 =head2 get_string
480 Title : get_string
481 Usage :
482 Function: Returns given probability vector as a string. Useful if you want to
483 store things in a rel database, where arrays are not first choice
484 Throws : If the argument is outside {A,C,G,T}
485 Example :
486 Returns : string
487 Args : character {A,C,G,T}
489 =cut
491 sub get_string {
492 my $self = shift;
493 my $base = shift;
494 my $string = '';
496 my @prob = @{$self->{"prob$base"}};
497 if ( ! @prob ) {
498 $self->throw( "No such base: $base\n");
501 foreach my $prob (@prob) {
502 my $corrected = $prob*10;
503 my $next = sprintf("%.0f",$corrected);
504 $next = 'a' if ($next eq '10');
505 $string .= $next;
507 return $string;
512 =head2 width
514 Title : width
515 Usage :
516 Function : Returns the length of the site
517 Throws :
518 Example :
519 Returns : number
520 Args :
522 =cut
524 sub width {
525 my $self = shift;
526 my $width = @{$self->{probA}};
527 return $width;
530 =head2 get_array
532 Title : get_array
533 Usage :
534 Function : Returns an array with frequencies for a specified amino acid.
535 Throws :
536 Example :
537 Returns : Array representing frequencies for specified amino acid.
538 Args : Single amino acid (character).
540 =cut
542 sub get_array {
543 my $self = shift;
544 my $letter = uc(shift);
546 $self->throw ("No such base: $letter!\n") unless grep { /$letter/ } @{$self->{_alphabet}};
548 return @{$self->{"prob$letter"}};
552 =head2 get_logs_array
554 Title : get_logs_array
555 Usage :
556 Function : Returns an array with log_odds for a specified base
557 Throws :
558 Example :
559 Returns : Array representing log-odds scores for specified amino acid.
560 Args : Single amino acid (character).
562 =cut
564 sub get_logs_array {
565 my $self = shift;
566 my $letter = uc(shift);
568 $self->throw ("No such base: $letter!\n") unless grep { /$letter/ } @{$self->{_alphabet}};
570 return @{$self->{"log$letter"}};
573 =head2 id
575 Title : id
576 Usage :
577 Function : Gets/sets the site id
578 Throws :
579 Example :
580 Returns : string
581 Args : string
583 =cut
585 sub id {
586 my $self = shift;
587 if (@_) { $self->{id} = shift; }
588 return $self->{id};
591 =head2 regexp
593 Title : regexp
594 Usage :
595 Function : Returns a case-insensitive regular expression which matches the
596 IUPAC convention. X's in consensus sequence will match anything.
597 Throws :
598 Example :
599 Returns : string
600 Args : Threshold for calculating consensus sequence (number in range 0-100
601 representing a percentage). Threshold defaults to 20.
603 =cut
605 sub regexp {
606 my $self = shift;
607 my $threshold = 20;
608 if ( @_ ) { my $threshold = shift };
610 my @alphabet = @{$self->{_alphabet}};
611 my $width = $self->width;
612 my (@regexp, $i);
613 for ( $i = 0; $i < $width; $i++ ) {
614 # get an array of the residues at this position with p > $threshold
615 my @letters = map { uc($_).lc($_) } grep { $self->{"prob$_"}->[$i] >= $threshold } @alphabet;
617 my $reg;
618 if ( scalar(@letters) == 0 ) {
619 $reg = '\.';
620 } else {
621 $reg = '['.join('',@letters).']';
623 push @regexp, $reg;
626 if ( wantarray ) {
627 return @regexp;
628 } else {
629 return join '', @regexp;
634 =head2 regexp_array
636 Title : regexp_array
637 Usage :
638 Function : Returns an array of position-specific regular expressions.
639 X's in consensus sequence will match anything.
640 Throws :
641 Example :
642 Returns : Array of position-specific regular expressions.
643 Args : Threshold for calculating consensus sequence (number in range 0-100
644 representing a percentage). Threshold defaults to 20.
645 Notes : Simply calls regexp method in list context.
647 =cut
649 sub regexp_array {
650 my $self = shift;
652 return @{ $self->regexp };
656 =head2 _compress_array
658 Title : _compress_array
659 Usage :
660 Function : Will compress an array of real signed numbers to a string (ie vector of bytes)
661 -127 to +127 for bi-directional(signed) and 0..255 for unsigned ;
662 Throws :
663 Example : Internal stuff
664 Returns : String
665 Args : array reference, followed by max value and direction (optional, defaults to 1),
666 direction of 1 is unsigned, anything else is signed.
668 =cut
670 sub _compress_array {
671 my ($array,$lm,$direct)=@_;
672 my $str;
673 return unless(($array) && ($lm));
674 $direct=1 unless ($direct);
675 my $k1= ($direct==1) ? (255/$lm) : (127/$lm);
676 foreach my $c (@{$array}) {
677 $c=$lm if ($c>$lm);
678 $c=-$lm if (($c<-$lm) && ($direct !=1));
679 $c=0 if (($c<0) && ($direct ==1));
680 my $byte=int($k1*$c);
681 $byte=127+$byte if ($direct !=1);#Clumsy, should be really shift the bits
682 my $char=chr($byte);
683 $str.=$char;
685 return $str;
688 =head2 _uncompress_string
690 Title : _uncompress_string
691 Usage :
692 Function : Will uncompress a string (vector of bytes) to create an array of real
693 signed numbers (opposite to_compress_array)
694 Throws :
695 Example : Internal stuff
696 Returns : string, followed by max value and direction (optional, defaults to 1),
697 direction of 1 is unsigned, anything else is signed.
698 Args : array
700 =cut
702 sub _uncompress_string {
703 my ($str,$lm,$direct)=@_;
704 my @array;
705 return unless(($str) && ($lm));
706 $direct=1 unless ($direct);
707 my $k1= ($direct==1) ? (255/$lm) : (127/$lm);
708 while (my $c=chop($str)) {
709 my $byte=ord($c);
710 $byte=$byte-127 if ($direct !=1);#Clumsy, should be really shift the bits
711 my $num=$byte/$k1;
712 unshift @array,$num;
715 return @array;
718 =head2 get_compressed_freq
720 Title : get_compressed_freq
721 Usage :
722 Function: A method to provide a compressed frequency vector. It uses one byte to
723 code the frequence for one of the probability vectors for one position.
724 Useful for relational database. Improvment of the previous 0..a coding.
725 Throws :
726 Example : my $strA=$self->get_compressed_freq('A');
727 Returns : String
728 Args : char
730 =cut
732 sub get_compressed_freq {
733 my $self=shift;
734 my $base=shift;
735 my $string='';
736 my @prob;
737 BASE: {
738 if ($base eq 'A') {
739 @prob = @{$self->{probA}} unless (!defined($self->{probA}));
740 last BASE;
742 if ($base eq 'G') {
743 @prob = @{$self->{probG}} unless (!defined($self->{probG}));
744 last BASE;
746 if ($base eq 'C') {
747 @prob = @{$self->{probC}} unless (!defined($self->{probC}));
748 last BASE;
750 if ($base eq 'T') {
751 @prob = @{$self->{probT}} unless (!defined($self->{probT}));
752 last BASE;
754 $self->throw ("No such base: $base!\n");
756 my $str= _compress_array(\@prob,1,1);
757 return $str;
760 =head2 sequence_match_weight
762 Title : sequence_match_weight
763 Usage :
764 Function : This method will calculate the score of a match, based on the PSM
765 if such is associated with the matrix object. Returns undef if no
766 PSM data is available.
767 Throws : if the length of the sequence is different from the matrix width
768 Example : my $score=$matrix->sequence_match_weight('ACGGATAG');
769 Returns : Floating point
770 Args : string
772 =cut
774 sub sequence_match_weight {
775 my ($self,$seq)=@_;
776 return unless ($self->{logA});
778 my $seqlen = length($seq);
779 my $width = $self->width;
780 $self->throw("Error: Input sequence size ($seqlen) not equal to PSM size ($width)!\n")
781 unless (length($seq) == $self->width);
783 my ($score,$i) = (0,0);
784 foreach my $letter ( split //, $seq ) {
785 # add up the score for this position
786 $score += $self->{"log$letter"}->[$i];
787 $i++;
789 return $score;
793 =head2 _to_IUPAC
795 Title : _to_IUPAC
796 Usage :
797 Function: Converts a single position to IUPAC compliant symbol and returns its probability.
798 Currently returns the most likely amino acid/probability combination.
799 Throws :
800 Example :
801 Returns : char, real number representing an amino acid and a probability.
802 Args : real numbers for all 20 amino acids (ordered by alphabet contained
803 in $self->{_alphabet}, minimum probability threshold.
805 =cut
807 sub _to_IUPAC {
808 my ($self,@probs,$thresh) = @_;
810 # provide a default threshold of 5, corresponds to 5% threshold for
811 # inferring that the aa at any position is the true aa
812 $thresh = 5 unless ( defined $thresh );
814 my ($IUPAC_aa,$max_prob) = ('X',$thresh);
815 for my $aa ( @{$self->{_alphabet}} ) {
816 my $prob = shift @probs;
817 if ( $prob > $max_prob ) {
818 $IUPAC_aa = $aa;
819 $max_prob = $prob;
823 return $IUPAC_aa, $max_prob;
826 =head2 _to_cons
828 Title : _to_cons
829 Usage :
830 Function: Converts a single position to simple consensus character and returns
831 its probability. Currently just calls the _to_IUPAC subroutine.
832 Throws :
833 Example :
834 Returns : char, real number
835 Args : real numbers for A,C,G,T (positional)
837 =cut
839 sub _to_cons {
840 return _to_IUPAC( @_ );
843 =head2 get_all_vectors
845 Title : get_all_vectors
846 Usage :
847 Function : returns all possible sequence vectors to satisfy the PFM under
848 a given threshold
849 Throws : If threshold outside of 0..1 (no sense to do that)
850 Example : my @vectors = $self->get_all_vectors(4);
851 Returns : Array of strings
852 Args : (optional) floating
854 =cut
856 #sub get_all_vectors {
857 # my $self = shift;
858 # my $thresh = shift;
860 # $self->throw("Out of range. Threshold should be >0 and 1<.\n") if (($thresh<0) || ($thresh>1));
862 # my @seq = split(//,$self->consensus($thresh*10));
863 # my @perm;
864 # for my $i (0..@{$self->{probA}}) {
865 # push @{$perm[$i]},'A' if ($self->{probA}->[$i]>$thresh);
866 # push @{$perm[$i]},'C' if ($self->{probC}->[$i]>$thresh);
867 # push @{$perm[$i]},'G' if ($self->{probG}->[$i]>$thresh);
868 # push @{$perm[$i]},'T' if ($self->{probT}->[$i]>$thresh);
869 # push @{$perm[$i]},'N' if ($seq[$i] eq 'N');
871 # my $fpos=shift @perm;
872 # my @strings=@$fpos;
873 # foreach my $pos (@perm) {
874 # my @newstr;
875 # foreach my $let (@$pos) {
876 # foreach my $string (@strings) {
877 # my $newstring = $string . $let;
878 # push @newstr,$newstring;
881 # @strings=@newstr;
883 # return @strings;