expressions: Merge all the little generator programs into generate.pl.
[pspp.git] / src / math / covariance.c
blob3ec90efbba05188c68bddd54ce53bc2a8e7a5d0f
1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2009, 2010, 2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
17 #include <config.h>
19 #include "math/covariance.h"
21 #include <gsl/gsl_matrix.h>
23 #include "data/case.h"
24 #include "data/variable.h"
25 #include "libpspp/assertion.h"
26 #include "libpspp/misc.h"
27 #include "math/categoricals.h"
28 #include "math/interaction.h"
29 #include "math/moments.h"
31 #include "gl/xalloc.h"
33 #define n_MOMENTS (MOMENT_VARIANCE + 1)
36 /* Create a new matrix of NEW_SIZE x NEW_SIZE and copy the elements of
37 matrix IN into it. IN must be a square matrix, and in normal usage
38 it will be smaller than NEW_SIZE.
39 IN is destroyed by this function. The return value must be destroyed
40 when no longer required.
42 static gsl_matrix *
43 resize_matrix (gsl_matrix *in, size_t new_size)
45 size_t i, j;
47 gsl_matrix *out = NULL;
49 assert (in->size1 == in->size2);
51 if (new_size <= in->size1)
52 return in;
54 out = gsl_matrix_calloc (new_size, new_size);
56 for (i = 0; i < in->size1; ++i)
58 for (j = 0; j < in->size2; ++j)
60 double x = gsl_matrix_get (in, i, j);
62 gsl_matrix_set (out, i, j, x);
66 gsl_matrix_free (in);
68 return out;
71 struct covariance
73 /* The variables for which the covariance matrix is to be calculated. */
74 size_t n_vars;
75 const struct variable *const *vars;
77 /* Categorical variables. */
78 struct categoricals *categoricals;
80 /* Array containing number of categories per categorical variable. */
81 size_t *n_categories;
83 /* Dimension of the covariance matrix. */
84 size_t dim;
86 /* The weight variable (or NULL if none) */
87 const struct variable *wv;
89 /* A set of matrices containing the 0th, 1st and 2nd moments */
90 gsl_matrix **moments;
92 /* The class of missing values to exclude */
93 enum mv_class exclude;
95 /* An array of doubles representing the covariance matrix.
96 Only the top triangle is included, and no diagonals */
97 double *cm;
98 int n_cm;
100 /* 1 for single pass algorithm;
101 2 for double pass algorithm
103 short passes;
106 0 : No pass has been made
107 1 : First pass has been started
108 2 : Second pass has been
110 IE: How many passes have been (partially) made. */
111 short state;
113 /* Flags indicating that the first case has been seen */
114 bool pass_one_first_case_seen;
115 bool pass_two_first_case_seen;
117 gsl_matrix *unnormalised;
122 /* Return a matrix containing the M th moments.
123 The matrix is of size NxN where N is the number of variables.
124 Each row represents the moments of a variable.
125 In the absence of missing values, the columns of this matrix will
126 be identical. If missing values are involved, then element (i,j)
127 is the moment of the i th variable, when paired with the j th variable.
129 const gsl_matrix *
130 covariance_moments (const struct covariance *cov, int m)
132 return cov->moments[m];
137 /* Create a covariance struct.
139 struct covariance *
140 covariance_1pass_create (size_t n_vars, const struct variable *const *vars,
141 const struct variable *weight, enum mv_class exclude)
143 size_t i;
144 struct covariance *cov = xzalloc (sizeof *cov);
146 cov->passes = 1;
147 cov->state = 0;
148 cov->pass_one_first_case_seen = cov->pass_two_first_case_seen = false;
150 cov->vars = vars;
152 cov->wv = weight;
153 cov->n_vars = n_vars;
154 cov->dim = n_vars;
156 cov->moments = xmalloc (sizeof *cov->moments * n_MOMENTS);
158 for (i = 0; i < n_MOMENTS; ++i)
159 cov->moments[i] = gsl_matrix_calloc (n_vars, n_vars);
161 cov->exclude = exclude;
163 cov->n_cm = (n_vars * (n_vars - 1) ) / 2;
166 cov->cm = xcalloc (cov->n_cm, sizeof *cov->cm);
167 cov->categoricals = NULL;
169 return cov;
173 Create a covariance struct for a two-pass algorithm. If categorical
174 variables are involed, the dimension cannot be know until after the
175 first data pass, so the actual covariances will not be allocated
176 until then.
178 struct covariance *
179 covariance_2pass_create (size_t n_vars, const struct variable *const *vars,
180 struct categoricals *cats,
181 const struct variable *wv, enum mv_class exclude)
183 size_t i;
184 struct covariance *cov = xmalloc (sizeof *cov);
186 cov->passes = 2;
187 cov->state = 0;
188 cov->pass_one_first_case_seen = cov->pass_two_first_case_seen = false;
190 cov->vars = vars;
192 cov->wv = wv;
193 cov->n_vars = n_vars;
194 cov->dim = n_vars;
196 cov->moments = xmalloc (sizeof *cov->moments * n_MOMENTS);
198 for (i = 0; i < n_MOMENTS; ++i)
199 cov->moments[i] = gsl_matrix_calloc (n_vars, n_vars);
201 cov->exclude = exclude;
203 cov->n_cm = -1;
204 cov->cm = NULL;
206 cov->categoricals = cats;
207 cov->unnormalised = NULL;
209 return cov;
212 /* Return an integer, which can be used to index
213 into COV->cm, to obtain the I, J th element
214 of the covariance matrix. If COV->cm does not
215 contain that element, then a negative value
216 will be returned.
218 static int
219 cm_idx (const struct covariance *cov, int i, int j)
221 int as;
222 const int n2j = cov->dim - 2 - j;
223 const int nj = cov->dim - 2 ;
225 assert (i >= 0);
226 assert (j < cov->dim);
228 if ( i == 0)
229 return -1;
231 if (j >= cov->dim - 1)
232 return -1;
234 if ( i <= j)
235 return -1 ;
237 as = nj * (nj + 1) ;
238 as -= n2j * (n2j + 1) ;
239 as /= 2;
241 return i - 1 + as;
246 Returns true iff the variable corresponding to the Ith element of the covariance matrix
247 has a missing value for case C
249 static bool
250 is_missing (const struct covariance *cov, int i, const struct ccase *c)
252 const struct variable *var = i < cov->n_vars ?
253 cov->vars[i] :
254 categoricals_get_interaction_by_subscript (cov->categoricals, i - cov->n_vars)->vars[0];
256 const union value *val = case_data (c, var);
258 return var_is_value_missing (var, val, cov->exclude);
262 static double
263 get_val (const struct covariance *cov, int i, const struct ccase *c)
265 if ( i < cov->n_vars)
267 const struct variable *var = cov->vars[i];
269 const union value *val = case_data (c, var);
271 return val->f;
274 return categoricals_get_effects_code_for_case (cov->categoricals, i - cov->n_vars, c);
277 #if 0
278 void
279 dump_matrix (const gsl_matrix *m)
281 size_t i, j;
283 for (i = 0 ; i < m->size1; ++i)
285 for (j = 0 ; j < m->size2; ++j)
286 printf ("%02f ", gsl_matrix_get (m, i, j));
287 printf ("\n");
290 #endif
292 /* Call this function for every case in the data set */
293 void
294 covariance_accumulate_pass1 (struct covariance *cov, const struct ccase *c)
296 size_t i, j, m;
297 const double weight = cov->wv ? case_data (c, cov->wv)->f : 1.0;
299 assert (cov->passes == 2);
300 if (!cov->pass_one_first_case_seen)
302 assert (cov->state == 0);
303 cov->state = 1;
306 if (cov->categoricals)
307 categoricals_update (cov->categoricals, c);
309 for (i = 0 ; i < cov->dim; ++i)
311 double v1 = get_val (cov, i, c);
313 if ( is_missing (cov, i, c))
314 continue;
316 for (j = 0 ; j < cov->dim; ++j)
318 double pwr = 1.0;
320 if ( is_missing (cov, j, c))
321 continue;
323 for (m = 0 ; m <= MOMENT_MEAN; ++m)
325 double *x = gsl_matrix_ptr (cov->moments[m], i, j);
327 *x += pwr * weight;
328 pwr *= v1;
333 cov->pass_one_first_case_seen = true;
337 /* Call this function for every case in the data set */
338 void
339 covariance_accumulate_pass2 (struct covariance *cov, const struct ccase *c)
341 size_t i, j;
342 const double weight = cov->wv ? case_data (c, cov->wv)->f : 1.0;
344 assert (cov->passes == 2);
345 assert (cov->state >= 1);
347 if (! cov->pass_two_first_case_seen)
349 size_t m;
350 assert (cov->state == 1);
351 cov->state = 2;
353 if (cov->categoricals)
354 categoricals_done (cov->categoricals);
356 cov->dim = cov->n_vars;
358 if (cov->categoricals)
359 cov->dim += categoricals_df_total (cov->categoricals);
361 cov->n_cm = (cov->dim * (cov->dim - 1) ) / 2;
362 cov->cm = xcalloc (cov->n_cm, sizeof *cov->cm);
364 /* Grow the moment matrices so that they're large enough to accommodate the
365 categorical elements */
366 for (i = 0; i < n_MOMENTS; ++i)
368 cov->moments[i] = resize_matrix (cov->moments[i], cov->dim);
371 /* Populate the moments matrices with the categorical value elements */
372 for (i = cov->n_vars; i < cov->dim; ++i)
374 for (j = 0 ; j < cov->dim ; ++j) /* FIXME: This is WRONG !!! */
376 double w = categoricals_get_weight_by_subscript (cov->categoricals, i - cov->n_vars);
378 gsl_matrix_set (cov->moments[MOMENT_NONE], i, j, w);
380 w = categoricals_get_sum_by_subscript (cov->categoricals, i - cov->n_vars);
382 gsl_matrix_set (cov->moments[MOMENT_MEAN], i, j, w);
386 /* FIXME: This is WRONG!! It must be fixed to properly handle missing values. For
387 now it assumes there are none */
388 for (m = 0 ; m < n_MOMENTS; ++m)
390 for (i = 0 ; i < cov->dim ; ++i)
392 double x = gsl_matrix_get (cov->moments[m], i, cov->n_vars -1);
393 for (j = cov->n_vars; j < cov->dim; ++j)
395 gsl_matrix_set (cov->moments[m], i, j, x);
400 /* Divide the means by the number of samples */
401 for (i = 0; i < cov->dim; ++i)
403 for (j = 0; j < cov->dim; ++j)
405 double *x = gsl_matrix_ptr (cov->moments[MOMENT_MEAN], i, j);
406 *x /= gsl_matrix_get (cov->moments[MOMENT_NONE], i, j);
411 for (i = 0 ; i < cov->dim; ++i)
413 double v1 = get_val (cov, i, c);
415 if ( is_missing (cov, i, c))
416 continue;
418 for (j = 0 ; j < cov->dim; ++j)
420 int idx;
421 double ss ;
422 double v2 = get_val (cov, j, c);
424 const double s = pow2 (v1 - gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j)) * weight;
426 if ( is_missing (cov, j, c))
427 continue;
430 double *x = gsl_matrix_ptr (cov->moments[MOMENT_VARIANCE], i, j);
431 *x += s;
434 ss =
435 (v1 - gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j))
437 (v2 - gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j))
438 * weight
441 idx = cm_idx (cov, i, j);
442 if (idx >= 0)
444 cov->cm [idx] += ss;
449 cov->pass_two_first_case_seen = true;
453 /* Call this function for every case in the data set.
454 After all cases have been passed, call covariance_calculate
456 void
457 covariance_accumulate (struct covariance *cov, const struct ccase *c)
459 size_t i, j, m;
460 const double weight = cov->wv ? case_data (c, cov->wv)->f : 1.0;
462 assert (cov->passes == 1);
464 if ( !cov->pass_one_first_case_seen)
466 assert ( cov->state == 0);
467 cov->state = 1;
470 for (i = 0 ; i < cov->dim; ++i)
472 const union value *val1 = case_data (c, cov->vars[i]);
474 if ( is_missing (cov, i, c))
475 continue;
477 for (j = 0 ; j < cov->dim; ++j)
479 double pwr = 1.0;
480 int idx;
481 const union value *val2 = case_data (c, cov->vars[j]);
483 if ( is_missing (cov, j, c))
484 continue;
486 idx = cm_idx (cov, i, j);
487 if (idx >= 0)
489 cov->cm [idx] += val1->f * val2->f * weight;
492 for (m = 0 ; m < n_MOMENTS; ++m)
494 double *x = gsl_matrix_ptr (cov->moments[m], i, j);
496 *x += pwr * weight;
497 pwr *= val1->f;
502 cov->pass_one_first_case_seen = true;
507 Allocate and return a gsl_matrix containing the covariances of the
508 data.
510 static gsl_matrix *
511 cm_to_gsl (struct covariance *cov)
513 int i, j;
514 gsl_matrix *m = gsl_matrix_calloc (cov->dim, cov->dim);
516 /* Copy the non-diagonal elements from cov->cm */
517 for ( j = 0 ; j < cov->dim - 1; ++j)
519 for (i = j+1 ; i < cov->dim; ++i)
521 double x = cov->cm [cm_idx (cov, i, j)];
522 gsl_matrix_set (m, i, j, x);
523 gsl_matrix_set (m, j, i, x);
527 /* Copy the diagonal elements from cov->moments[2] */
528 for (j = 0 ; j < cov->dim ; ++j)
530 double sigma = gsl_matrix_get (cov->moments[2], j, j);
531 gsl_matrix_set (m, j, j, sigma);
534 return m;
538 static gsl_matrix *
539 covariance_calculate_double_pass (struct covariance *cov)
541 size_t i, j;
542 for (i = 0 ; i < cov->dim; ++i)
544 for (j = 0 ; j < cov->dim; ++j)
546 int idx;
547 double *x = gsl_matrix_ptr (cov->moments[MOMENT_VARIANCE], i, j);
548 *x /= gsl_matrix_get (cov->moments[MOMENT_NONE], i, j);
550 idx = cm_idx (cov, i, j);
551 if ( idx >= 0)
553 x = &cov->cm [idx];
554 *x /= gsl_matrix_get (cov->moments[MOMENT_NONE], i, j);
559 return cm_to_gsl (cov);
562 static gsl_matrix *
563 covariance_calculate_single_pass (struct covariance *cov)
565 size_t i, j;
566 size_t m;
568 for (m = 0; m < n_MOMENTS; ++m)
570 /* Divide the moments by the number of samples */
571 if ( m > 0)
573 for (i = 0 ; i < cov->dim; ++i)
575 for (j = 0 ; j < cov->dim; ++j)
577 double *x = gsl_matrix_ptr (cov->moments[m], i, j);
578 *x /= gsl_matrix_get (cov->moments[0], i, j);
580 if ( m == MOMENT_VARIANCE)
581 *x -= pow2 (gsl_matrix_get (cov->moments[1], i, j));
587 /* Centre the moments */
588 for ( j = 0 ; j < cov->dim - 1; ++j)
590 for (i = j + 1 ; i < cov->dim; ++i)
592 double *x = &cov->cm [cm_idx (cov, i, j)];
594 *x /= gsl_matrix_get (cov->moments[0], i, j);
596 *x -=
597 gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j)
599 gsl_matrix_get (cov->moments[MOMENT_MEAN], j, i);
603 return cm_to_gsl (cov);
607 /* Return a pointer to gsl_matrix containing the pairwise covariances. The
608 caller owns the returned matrix and must free it when it is no longer
609 needed.
611 Call this function only after all data have been accumulated. */
612 gsl_matrix *
613 covariance_calculate (struct covariance *cov)
615 if ( cov->state <= 0 )
616 return NULL;
618 switch (cov->passes)
620 case 1:
621 return covariance_calculate_single_pass (cov);
622 break;
623 case 2:
624 return covariance_calculate_double_pass (cov);
625 break;
626 default:
627 NOT_REACHED ();
632 Covariance computed without dividing by the sample size.
634 static gsl_matrix *
635 covariance_calculate_double_pass_unnormalized (struct covariance *cov)
637 return cm_to_gsl (cov);
640 static gsl_matrix *
641 covariance_calculate_single_pass_unnormalized (struct covariance *cov)
643 size_t i, j;
645 for (i = 0 ; i < cov->dim; ++i)
647 for (j = 0 ; j < cov->dim; ++j)
649 double *x = gsl_matrix_ptr (cov->moments[MOMENT_VARIANCE], i, j);
650 *x -= pow2 (gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j))
651 / gsl_matrix_get (cov->moments[MOMENT_NONE], i, j);
654 for ( j = 0 ; j < cov->dim - 1; ++j)
656 for (i = j + 1 ; i < cov->dim; ++i)
658 double *x = &cov->cm [cm_idx (cov, i, j)];
660 *x -=
661 gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j)
663 gsl_matrix_get (cov->moments[MOMENT_MEAN], j, i)
664 / gsl_matrix_get (cov->moments[MOMENT_NONE], i, j);
668 return cm_to_gsl (cov);
672 /* Return a pointer to gsl_matrix containing the pairwise covariances. The
673 returned matrix is owned by the structure, and must not be freed.
675 Call this function only after all data have been accumulated. */
676 const gsl_matrix *
677 covariance_calculate_unnormalized (struct covariance *cov)
679 if ( cov->state <= 0 )
680 return NULL;
682 if (cov->unnormalised != NULL)
683 return cov->unnormalised;
685 switch (cov->passes)
687 case 1:
688 cov->unnormalised = covariance_calculate_single_pass_unnormalized (cov);
689 break;
690 case 2:
691 cov->unnormalised = covariance_calculate_double_pass_unnormalized (cov);
692 break;
693 default:
694 NOT_REACHED ();
697 return cov->unnormalised;
700 /* Function to access the categoricals used by COV
701 The return value is owned by the COV
703 const struct categoricals *
704 covariance_get_categoricals (const struct covariance *cov)
706 return cov->categoricals;
710 /* Destroy the COV object */
711 void
712 covariance_destroy (struct covariance *cov)
714 size_t i;
716 categoricals_destroy (cov->categoricals);
718 for (i = 0; i < n_MOMENTS; ++i)
719 gsl_matrix_free (cov->moments[i]);
721 gsl_matrix_free (cov->unnormalised);
722 free (cov->moments);
723 free (cov->cm);
724 free (cov);
727 size_t
728 covariance_dim (const struct covariance * cov)
730 return (cov->dim);
736 Routines to assist debugging.
737 The following are not thoroughly tested and in certain respects
738 unreliable. They should only be
739 used for aids to development. Not as user accessible code.
742 #include "libpspp/str.h"
743 #include "output/tab.h"
744 #include "data/format.h"
747 /* Create a table which can be populated with the encodings for
748 the covariance matrix COV */
749 struct tab_table *
750 covariance_dump_enc_header (const struct covariance *cov, int length)
752 struct tab_table *t = tab_create (cov->dim, length);
753 int n;
754 int i;
756 tab_title (t, "Covariance Encoding");
758 tab_box (t,
759 TAL_2, TAL_2, 0, 0,
760 0, 0, tab_nc (t) - 1, tab_nr (t) - 1);
762 tab_hline (t, TAL_2, 0, tab_nc (t) - 1, 1);
765 for (i = 0 ; i < cov->n_vars; ++i)
767 tab_text (t, i, 0, TAT_TITLE, var_get_name (cov->vars[i]));
768 tab_vline (t, TAL_1, i + 1, 0, tab_nr (t) - 1);
771 n = 0;
772 while (i < cov->dim)
774 struct string str;
775 int idx = i - cov->n_vars;
776 const struct interaction *iact =
777 categoricals_get_interaction_by_subscript (cov->categoricals, idx);
778 int df;
780 ds_init_empty (&str);
781 interaction_to_string (iact, &str);
783 df = categoricals_df (cov->categoricals, n);
785 tab_joint_text (t,
786 i, 0,
787 i + df - 1, 0,
788 TAT_TITLE, ds_cstr (&str));
790 if (i + df < tab_nr (t) - 1)
791 tab_vline (t, TAL_1, i + df, 0, tab_nr (t) - 1);
793 i += df;
794 n++;
795 ds_destroy (&str);
798 return t;
803 Append table T, which should have been returned by covariance_dump_enc_header
804 with an entry corresponding to case C for the covariance matrix COV
806 void
807 covariance_dump_enc (const struct covariance *cov, const struct ccase *c,
808 struct tab_table *t)
810 static int row = 0;
811 int i;
812 ++row;
813 for (i = 0 ; i < cov->dim; ++i)
815 double v = get_val (cov, i, c);
816 tab_double (t, i, row, 0, v, i < cov->n_vars ? NULL : &F_8_0, RC_OTHER);