3 // Copyright (C) 2008 Elena Grassi <grassi.e@gmail.com>
5 // This program is free software; you can redistribute it and/or modify
6 // it under the terms of the GNU General Public License as published by the
7 // Free Software Foundation; either version 2, or (at your option) any later
10 // This program is distributed in the hope that it will be useful, but
11 // WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTIBILITY
12 // or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
21 #include "total_affinity.h"
24 //XXX fix Paolo's format
26 Function: get_matrixes
27 Loads matrixes from the given file, putting their infos in info.
30 info - pointer to run struct with a properly opened FILE* with matrixes
31 (Paolo's format), info->count tells if we already have percentages
32 [0] or pseudocounts [1] or counts [2] of single nucleotides.
33 Error flags will be put there.
36 void get_matrixes(run info
)
39 if(info
->window
!=0 && info
->window
!= -1){
40 assert(info
->window
% info
->slide
== 0);
41 cache_size
= info
->window
/ info
->slide
;
43 if (alloc_matrixes(&(info
->matrixes
),cache_size
))
44 info
->error
= MEMORY_ERROR
;
45 matrix_ll
*matrixes
= info
->matrixes
;
46 char name
[MAX_MATRIX_NAME
];
47 char old_name
[MAX_MATRIX_NAME
];
56 if (fscanf(info
->f_matrixes
, "%s\t%*d\t%lf\t%lf\t%lf\t%lf\n",
57 name
, &acgt
[A
], &acgt
[C
], &acgt
[G
], &acgt
[T
]) != 5) {
59 printf("You fool fscanf!\n");
61 if (fgetc(info
->f_matrixes
) != EOF
)
62 info
->error
= MATRIX_FILE_ERROR
;
66 if (strcmp(name
, old_name
)) { /* we have a new matrix */
68 printf("Last was %d, now adding %s\n", length
, name
);
70 if (info
->n_matrixes
!= 0) {
71 (*matrixes
)->length
= length
; /* assign length to last matrix */
72 get_fractions_from_pcounts(*matrixes
, info
);
74 matrixes
++; /* next matrix to be loaded */
79 strcpy((*matrixes
)->name
, name
);
81 for (i
= 0; i
< BASES
; i
++)
82 (*matrixes
)->freq
[length
][i
] = acgt
[i
];
84 if (info
->n_matrixes
>= MAX_MATRIXES
85 || length
>= MAX_MATRIX_LENGTH
) {
87 info
->error
= MATRIX_LIMIT_ERROR
;
89 strcpy(old_name
, name
);
91 (*matrixes
)->length
= length
; /* assign length to last matrix */
92 get_fractions_from_pcounts(*matrixes
, info
);
96 void get_cutoffs(run info
)
101 char name
[MAX_MATRIX_NAME
];
102 int found
[info
->n_matrixes
];
103 for (; i
< info
->n_matrixes
; i
++)
105 /* matrix name \t absolute_cutoff \t fractional_cutoff */
107 (info
->f_cutoffs
, "%s\t\t%lf\t%lf\n", name
, &abs_c
,
109 if (frac_c
> 1 || frac_c
< 0) {
111 "WARNING: invalid fractional cutoff for %s matrix!\n ",
113 info
->error
= CUTOFF_FILE_ERROR
;
116 int matrix_index
= find_matrix_index(info
, name
);
117 if (matrix_index
== -1) {
118 /* We issue a warning for unknown matrixes. */
120 "WARNING: matrix %s in the cutoff file is not in the matrix file, ignoring it\n",
122 } else if (found
[matrix_index
]) {
123 /* We exit if a double matrix is found. */
125 "WARNING: matrix %s in the cutoff file is doubled!\n",
127 info
->error
= CUTOFF_FILE_ERROR
;
131 found
[matrix_index
] = 1;
132 info
->matrixes
[matrix_index
]->abs_cutoff
= abs_c
;
133 info
->matrixes
[matrix_index
]->frac_cutoff
= frac_c
;
136 if (fgetc(info
->f_cutoffs
) != EOF
) {
137 info
->error
= CUTOFF_FILE_ERROR
;
140 if (c
!= info
->n_matrixes
) {
142 "WARNING: Some matrixes are missing in the cutoff file:\n");
143 for (i
= 0; i
< info
->n_matrixes
; i
++) {
145 fprintf(stderr
, "%s\n",
146 info
->matrixes
[i
]->name
);
149 info
->error
= CUTOFF_FILE_ERROR
;
155 int find_matrix_index(run info
, char *name
)
157 /* Non-duplicate matrix names is a prerequisite. */
158 /* Use an hash to be more efficient. */
160 for (; i
< info
->n_matrixes
; i
++) {
161 if (!strcmp(info
->matrixes
[i
]->name
, name
))
168 Function: get_fractions_from_pcounts.
170 Get nucleotides frequencies for a given matrix_ll with counts or pseudocounts
171 stored in the freq array and store there the frequency.
172 If count evaluates true (default) 0 values are converted to 1 (pseudocount),
173 otherwise (perc to be implemented, TODO).
174 Issues an error in info->error if it finds a zero total for counts in the
178 m - matrix_ll with matrixes data.
179 info - run with the desired count.
181 /* possible gain of speed getting ACGT total in get_matrixes */
182 void get_fractions_from_pcounts(matrix_ll m
, run info
)
186 for (j
= 0; j
< m
->length
; j
++) {
187 for (i
= 0; i
< BASES
; i
++) {
188 int val
= (int)m
->freq
[j
][i
];
189 if (info
->counts
&& val
!= m
->freq
[j
][i
]) { //then it's not an integer
190 info
->error
= MATRIX_COUNT_ERROR
;
193 if (m
->freq
[j
][i
] <= EEEPSILON
&& !info
->keep_zeroes
) {
197 info
->error
= FREQ_ZERO_ERROR
;
201 tot
+= (int)m
->freq
[j
][i
];
206 info
->error
= MATRIX_COUNT_ERROR
;
209 for (i
= 0; i
< BASES
; i
++) {
210 m
->freq
[j
][i
] = m
->freq
[j
][i
] / tot
;
218 Function: normalize_fractions
226 Computes log-likelihood given a fasta with background informations and a
227 matrix_ll with freq loaded (counts already converted to fractions).
228 This function does not compute affinities, it's only a preprocess step.
231 m - matrix_ll with freq loaded and missing ll.
232 f - fasta with background informations to compute log likelihoods.
233 info - pointer to run struct to set ad error flag if there is a zero or
234 negative freq (or background)
236 void assign_ll(matrix_ll m
, fasta f
, run info
)
240 for (; j
< m
->length
; j
++)
241 for (i
= 0; i
< BASES
; i
++) {
243 m
->ll
[j
][i
] = log2_ratio(m
->freq
[j
][i
], f
->background
[i
], info
);
245 m
->ll
[j
][i
] = m
->freq
[j
][i
];
250 Function: assign_cutoff
252 Assign a cutoff to a matrix_ll using the fractional cutoff
253 in the given run. If given_abs_cutoff evaluates true it uses the
254 weakest cutoff (the smallest) between abs and fractional.
257 m - matrix_ll with ll loaded and missing cutoff.
258 info - run with cutoff informations.
260 void assign_cutoff(matrix_ll m
, run info
)
267 for (; j
< m
->length
; j
++) {
269 for (i
= 1; i
< BASES
; i
++) {
270 if (m
->ll
[j
][i
] > max
) {
278 /* If a file with cutoffs is given we will use them, otherwise we have default
279 or given parameters equals for every matrix. */
280 if (info
->f_cutoffs
) {
281 info
->cutoff
= m
->frac_cutoff
;
282 info
->abs_cutoff
= m
->abs_cutoff
;
284 m
->cutoff
= max_tot
* info
->cutoff
;
286 if ((info
->given_abs_cutoff
|| info
->f_cutoffs
)
287 && m
->cutoff
> info
->abs_cutoff
)
288 m
->cutoff
= info
->abs_cutoff
;
290 if (info
->miRNA
) { /* We count matches for miRNA seeking mode. */
295 /* Function: log2_ratio
297 Returns the log2ratio or only the ratio (depending on info->log2),
298 sets error to 1 if the ratio is non-positive.
299 Computes log2ratio if info->log2 evaluates true, ratio otherwise.
302 n - double which will be used as numerator.
303 d - double which will be used as denominator.
304 info - run used to set error flags and determine if log has to be applied.
307 double which is the log2-ratio or simple ratio between n and d.
308 (will be 0 if an error occurred).
311 double log2_ratio(double n
, double d
, run info
)
314 info
->error
= BACKGROUND_FREQ_ERROR
;
317 double ratio
= n
/ d
;
318 if (ratio
<= 0 && !(ratio
== 0 && info
->keep_zeroes
)) {
319 info
->error
= BACKGROUND_FREQ_ERROR
;
323 return log(ratio
) / log(2);
329 Function: alloc_matrixes
331 Creates space (calloc) for all the matrixes that will be loaded.
334 m - matrix_ll ** where the matrixes will be stored.
337 ERROR (1) if any calloc call failed, OK (0) otherwise.
339 int alloc_matrixes(matrix_ll
** m
, int cache_size
)
341 *m
= (matrix_ll
*) calloc(MAX_MATRIXES
+ 1, sizeof(matrix_ll
));
347 for (i
= 0; i
< MAX_MATRIXES
; i
++) {
348 (*m
)[i
] = (matrix_ll
) calloc(1, sizeof(struct matrix_ll_
));
350 free_matrixes(*m
, i
);
354 (double **)calloc(MAX_MATRIX_LENGTH
+ 1, sizeof(double *));
355 if (!((*m
)[i
]->ll
)) {
356 free_matrixes(*m
, i
);
360 (double **)calloc(MAX_MATRIX_LENGTH
+ 1, sizeof(double *));
361 if (!((*m
)[i
]->freq
)) {
362 free_matrixes(*m
, i
);
365 (*m
)[i
]->cache
= NULL
;
368 (double *)calloc(cache_size
, sizeof(double));
369 if (!((*m
)[i
]->cache
)) {
370 free_matrixes(*m
, i
);
374 (*m
)[i
]->normaliz_cache
= NULL
;
376 (*m
)[i
]->normaliz_cache
=
377 (int *)calloc(cache_size
, sizeof(double));
378 if (!((*m
)[i
]->normaliz_cache
)) {
379 free_matrixes(*m
, i
);
384 for (cur
= (*m
)[i
]->ll
; cur
< (*m
)[i
]->ll
+ MAX_MATRIX_LENGTH
;
386 *cur
= (double *)calloc(BASES
, sizeof(double));
388 free_matrixes(*m
, i
);
392 for (cur
= (*m
)[i
]->freq
;
393 cur
< (*m
)[i
]->freq
+ MAX_MATRIX_LENGTH
; cur
++) {
394 *cur
= (double *)calloc(BASES
, sizeof(double));
396 free_matrixes(*m
, i
);
400 (*m
)[i
]->name
= (char *)calloc(MAX_MATRIX_NAME
, sizeof(char));
401 if (!((*m
)[i
]->name
)) {
402 free_matrixes(*m
, i
);
410 Function: free_matrixes
412 Frees the space used by all the loaded matrixes.
416 m - matrix_ll * with the matrixes to be freed.
417 loaded - number of matrixes to be freed.
419 void free_matrixes(matrix_ll
* m
, int loaded
)
422 for (i
= 0; i
< loaded
; i
++) {
425 for (cur
= m
[i
]->ll
; cur
< m
[i
]->ll
+ MAX_MATRIX_LENGTH
; cur
++)
428 for (cur
= m
[i
]->freq
; cur
< m
[i
]->freq
+ MAX_MATRIX_LENGTH
; cur
++)
432 free(m
[i
]->normaliz_cache
);
439 Function: get_affinity
441 Computes affinity between a matrix and a slice of a string, given the first char to be evaluated.
442 Assumes enough space in the string. Returns total values ("straight" + revcomp).
446 m - matrix_ll matrix that will be used.
447 s - char * versus which matrix will be compared.
448 start - int first position for affinity computation.
452 double with the total affinity (sum of all log_likelihoods) between
453 the given matrix and string.
455 double get_affinity(matrix_ll m
, char *s
, int start
)
459 while (i
< m
->length
) {
460 /* switch(toupper(s[i])) { //toupper done while loading fasta for efficiency's sake */
463 tot
+= m
->ll
[i
][A
] + m
->ll
[(m
->length
) - i
- 1][T
];
466 tot
+= m
->ll
[i
][C
] + m
->ll
[(m
->length
) - i
- 1][G
];
469 tot
+= m
->ll
[i
][G
] + m
->ll
[(m
->length
) - i
- 1][C
];
472 tot
+= m
->ll
[i
][T
] + m
->ll
[(m
->length
) - i
- 1][A
];
485 Function: get_affinities
487 Computes affinity between a matrix and a slice of a string, given the first char to be evaluated.
488 Assumes enough space in the string. Puts results in the given double array ("straight" and revcomp
489 are evaluated at the same time).
493 m - matrix_ll matrix that will be used.
494 s - char * versus which matrix will be compared.
495 start - int first position for affinity computation.
496 results - double * where affinities will be put.
498 void get_affinities(matrix_ll m
, char *s
, int start
, double *results
)
500 /* results[0] is straight total, results[1] revcomp */
504 while (i
< m
->length
) {
507 results
[0] += m
->ll
[i
][A
];
508 results
[1] += m
->ll
[(m
->length
) - i
- 1][T
];
511 results
[0] += m
->ll
[i
][C
];
512 results
[1] += m
->ll
[(m
->length
) - i
- 1][G
];
515 results
[0] += m
->ll
[i
][G
];
516 results
[1] += m
->ll
[(m
->length
) - i
- 1][C
];
519 results
[0] += m
->ll
[i
][T
];
520 results
[1] += m
->ll
[(m
->length
) - i
- 1][A
];
533 double get_affinities_nonLog(matrix_ll m
, char *s
, int start
, int *foundN
, run info
)
535 /* results[0] is straight total, results[1] revcomp */
540 while (i
< m
->length
) {
543 results
[0] *= m
->ll
[i
][A
];
544 results
[1] *= m
->ll
[(m
->length
) - i
- 1][T
];
547 results
[0] *= m
->ll
[i
][C
];
548 results
[1] *= m
->ll
[(m
->length
) - i
- 1][G
];
551 results
[0] *= m
->ll
[i
][G
];
552 results
[1] *= m
->ll
[(m
->length
) - i
- 1][C
];
555 results
[0] *= m
->ll
[i
][T
];
556 results
[1] *= m
->ll
[(m
->length
) - i
- 1][A
];
567 if (info
->single_strand
)
570 return (results
[0] > results
[1]) ? results
[0] : results
[1];
574 double get_affinities_single_strand(matrix_ll m
, char *s
, int start
, int *foundN
, run info
)
578 while (i
< m
->length
) {
600 if (fabs(res
- m
->cutoff
) <= EEEPSILON
)
609 Computes affinity between a matrix and a string on all viable positions.
613 m - matrix_ll matrix that will be used.
614 s - fasta versus which matrix will be compared.
615 how_many - int pointer where the number of viable positions will be put.
616 info - run to issue error flags (memory error or fasta too short).
620 double array which stores total affinity of the matrix for each viable
621 position on the given string (starts at 0, +1 and evaluates it until there
622 are enough chars left in the given string). NULL if a memory error occurs or
623 the fasta is too short to find the given matrix.
625 double *matrix_run(matrix_ll m
, fasta s
, int *how_many
, run info
)
628 printf("matrix_run with %s %d\n", m
->name
, m
->length
);
630 *how_many
= s
->length
- m
->length
+ 1;
631 if (*how_many
<= 0) {
632 info
->error
= SHORT_FASTA_WARNING
;
635 double *res
= (double *)calloc(*how_many
, sizeof(double));
637 info
->error
= MEMORY_ERROR
;
641 while (m
->length
<= s
->length
- offset
) {
643 res
[offset
] = info
->get_affinities_pointer(m
, s
->seq
, offset
, &dummy
, info
);
645 printf("Seen affinity %g at offset %d\n", res
[offset
], offset
);
650 printf("matrix_run %d %d\n", offset
, *how_many
);
652 assert(offset
== *how_many
);
658 Function: matrix_little_window_print_tot
660 Computes affinity between a matrix and a string on all viable positions and
661 returns the sum of all affinities.
665 m - matrix_ll matrix that will be used.
666 seq - sequence versus which matrix will be compared.
672 double with total affinity between the given matrix and fasta,
673 normalized upon the number of matrix matches != 0 (i.e. without an N).
675 double matrix_little_window_tot(matrix_ll m
, fasta f
, int begin
, int end
, int *tot_match_noN
, run info
)
682 fprintf(stderr
, "ERROR: invalid begin in matrix_little_window_tot()\n");
689 fprintf(stderr
, "ERROR: invalid end in matrix_little_window_tot()\n");
697 (*tot_match_noN
) = 0;
698 while (offset
<= end
- m
->length
) {
700 double match
= get_affinities_nonLog(m
, seq
, offset
, &foundN
, info
);
702 //if (!match) { //now we directly count the Ns
712 Function: print_window_affinities
714 Prints (on stdout) total affinities for all possible windows, getting infos from a
715 double array which has affinities for a matrix computed for all positions.
719 tot_aff - double * with the computed affinities for all positions.
720 window - int with the wanted window size.
721 tot - length of tot_aff array.
722 slide - number of bases to be skipped between two outputs.
724 void print_window_affinities(double *tot_aff
, int window
, int tot
, int slide
)
728 printf("%d: %g\n", k, tot_aff[k]);
729 printf("Using window %d tot %d\n", window, tot); */
731 int tot_match_noN
= 0;
737 //printf("For window %d at %d adding %g\n", cw, i, tot_aff[i]);
745 printf("%d\t%g\n", cw
++, tot_w
/ tot_match_noN
);
746 if (slide
< window
) {
748 for (j
= (i
- window
); j
< (i
+ slide
- window
);
750 //printf("For window %d at %d removing %g\n", cw, j, tot_aff[j]);
762 //printf("Starting for a long jump after w.%d, to %d\n", cw, i);
767 printf("%d\t%g\n", cw
++, tot_w
/ tot_match_noN
);
772 Function: matrix_cutoff_print
774 Computes affinity between a matrix and a string on all viable positions,
775 compares with the cutoff and prints the result if they are greater.
779 m - matrix_ll matrix that will be used.
780 s - fasta versus which matrix will be compared.
781 info - run with cutoff record that says if we have
782 to print the matched revcomp (1) or straight seq (0).
784 void matrix_cutoff_print(matrix_ll m
, fasta s
, run info
)
787 printf("matrix_cutoff_print with %s %d\n", m
->name
, m
->length
);
790 int max_strand
= 1; /* 1 is +, 0 is - */
792 int seen_any_match
= 0;
795 char *match
= (char *)calloc((m
->length
) + 1, sizeof(char));
796 char *max_match
= (char *)calloc((m
->length
) + 1, sizeof(char));
797 if (!match
|| !max_match
) {
798 info
->error
= MEMORY_ERROR
;
801 double affinities
[2];
803 while (m
->length
<= s
->length
- offset
) {
805 get_affinities(m
, s
->seq
, offset
, affinities
);
806 if (info
->only_max
) {
808 max_aff
= affinities
[0];
809 double max_local
= affinities
[0];
810 int max_strand_local
= 1;
811 if (affinities
[1] > affinities
[0]) {
812 max_local
= affinities
[1];
813 max_strand_local
= 0;
815 /* remembers only last higher match */
816 if (max_aff
<= max_local
) {
819 max_strand
= max_strand_local
;
821 get_rc(s
->seq
, offset
, m
->length
,
824 get_seq(s
->seq
, offset
, m
->length
,
828 if (affinities
[0] >= m
->cutoff
) {
829 get_seq(s
->seq
, offset
, m
->length
, match
);
830 printf("%s\t%s\t%g\t%d\t%s\n", m
->name
, match
,
831 affinities
[0], offset
, plus
);
835 printf("BELOW CUTOFF %g %s\t%g\t%d\t%s\n",
836 m
->cutoff
, m
->name
, affinities
[0],
839 if (affinities
[1] >= m
->cutoff
) {
841 get_rc(s
->seq
, offset
, m
->length
,
843 else /* very unlikely to have match already there for a plus match for the same matrix? XXX */
844 get_seq(s
->seq
, offset
, m
->length
,
846 printf("%s\t%s\t%g\t%d\t%s\n", m
->name
, match
,
847 affinities
[1], offset
, minus
);
851 printf("BELOW CUTOFF %g %s\t%g\t%d\t%s\n",
852 m
->cutoff
, m
->name
, affinities
[1],
858 if (info
->only_max
&& seen_any_match
) {
860 printf("%s\t%s\t%g\t%d\t%s\n", m
->name
, max_match
,
861 max_aff
, max_pos
, plus
);
863 printf("%s\t%s\t%g\t%d\t%s\n", m
->name
, max_match
,
864 max_aff
, max_pos
, minus
);
869 printf("matrix_run %d\n", offset
);
874 void find_single_strand_matches(matrix_ll m
, fasta s
, run info
)
877 char *match
= (char *)calloc((m
->length
) + 1, sizeof(char));
879 info
->error
= MEMORY_ERROR
;
882 while (m
->length
<= s
->length
- offset
) {
884 int found
= (int) info
->get_affinities_pointer(m
, s
->seq
, offset
, &dummy
, info
);
886 get_seq(s
->seq
, offset
, m
->length
, match
); // XXX remove only for testing purposes, inefficient
887 printf("%s\t%s\t%d\n", m
->name
, match
, offset
);