pivot-table: Define numeric formats of categories as well as their cells.
[pspp.git] / src / math / linreg.c
blobd417599babd94f9fa9afc9030b457ca8e03bfcb4
1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2005, 2010, 2011, 2017 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
17 #include <config.h>
19 #include "math/linreg.h"
21 #include <gsl/gsl_blas.h>
22 #include <gsl/gsl_cblas.h>
23 #include <gsl/gsl_errno.h>
24 #include <gsl/gsl_fit.h>
25 #include <gsl/gsl_linalg.h>
26 #include <gsl/gsl_multifit.h>
28 #include "data/value.h"
29 #include "data/variable.h"
30 #include "linreg/sweep.h"
32 #include "gl/xalloc.h"
35 Find the least-squares estimate of b for the linear model:
37 Y = Xb + Z
39 where Y is an n-by-1 column vector, X is an n-by-p matrix of
40 independent variables, b is a p-by-1 vector of regression coefficients,
41 and Z is an n-by-1 normally-distributed random vector with independent
42 identically distributed components with mean 0.
44 This estimate is found via the sweep operator or singular-value
45 decomposition with gsl.
48 References:
50 1. Matrix Computations, third edition. GH Golub and CF Van Loan.
51 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
53 2. Numerical Analysis for Statisticians. K Lange. Springer. 1999.
54 ISBN 0-387-94979-8.
56 3. Numerical Linear Algebra for Applications in Statistics. JE Gentle.
57 Springer. 1998. ISBN 0-387-98542-5.
60 struct linreg
62 double n_obs; /* Number of observations. */
63 int n_indeps; /* Number of independent variables. */
64 int n_coeffs; /* The intercept is not considered a
65 coefficient here. */
68 Pointers to the variables.
70 const struct variable *depvar;
71 const struct variable **indep_vars;
73 double *coeff;
74 double intercept;
76 Means and standard deviations of the variables.
77 If these pointers are null when pspp_linreg() is
78 called, pspp_linreg() will compute their values.
80 Entry i of indep_means is the mean of independent
81 variable i, whose observations are stored in the ith
82 column of the design matrix.
84 double depvar_mean;
85 gsl_vector *indep_means;
86 gsl_vector *indep_std;
89 Sums of squares.
91 double ssm; /* Sums of squares for the overall model. */
92 double sst; /* Sum of squares total. */
93 double sse; /* Sum of squares error. */
94 double mse; /* Mean squared error. This is just sse /
95 dfe, but since it is the best unbiased
96 estimate of the population variance, it
97 has its own entry here. */
99 Covariance matrix of the parameter estimates.
101 gsl_matrix *cov;
103 Degrees of freedom.
105 double dft;
106 double dfe;
107 double dfm;
109 int dependent_column; /* Column containing the dependent variable. Defaults to last column. */
110 int refcnt;
112 bool origin;
115 const struct variable **
116 linreg_get_vars (const struct linreg *c)
118 return c->indep_vars;
122 Allocate a linreg and return a pointer to it. n is the number of
123 cases, p is the number of independent variables.
125 struct linreg *
126 linreg_alloc (const struct variable *depvar, const struct variable **indep_vars,
127 double n, size_t p, bool origin)
129 struct linreg *c;
130 size_t i;
132 c = xmalloc (sizeof (*c));
133 c->depvar = depvar;
134 c->indep_vars = xnmalloc (p, sizeof (*indep_vars));
135 c->dependent_column = p;
136 for (i = 0; i < p; i++)
138 c->indep_vars[i] = indep_vars[i];
140 c->indep_means = gsl_vector_alloc (p);
141 c->indep_std = gsl_vector_alloc (p);
143 c->n_obs = n;
144 c->n_indeps = p;
145 c->n_coeffs = p;
146 c->coeff = xnmalloc (p, sizeof (*c->coeff));
147 c->cov = gsl_matrix_calloc (c->n_coeffs + 1, c->n_coeffs + 1);
148 c->dft = n;
149 if (!origin)
150 c->dft--;
152 c->dfm = p;
153 c->dfe = c->dft - c->dfm;
154 c->intercept = 0.0;
155 c->depvar_mean = 0.0;
157 Default settings.
160 c->refcnt = 1;
162 c->origin = origin;
164 return c;
168 void
169 linreg_ref (struct linreg *c)
171 c->refcnt++;
174 void
175 linreg_unref (struct linreg *c)
177 if (--c->refcnt == 0)
179 gsl_vector_free (c->indep_means);
180 gsl_vector_free (c->indep_std);
181 gsl_matrix_free (c->cov);
182 free (c->indep_vars);
183 free (c->coeff);
184 free (c);
188 static void
189 post_sweep_computations (struct linreg *l, gsl_matrix *sw)
191 double m;
192 size_t i;
193 size_t j;
194 int rc;
196 assert (sw != NULL);
197 assert (l != NULL);
199 l->sse = gsl_matrix_get (sw, l->n_indeps, l->n_indeps);
200 l->mse = l->sse / l->dfe;
202 Get the intercept.
204 m = l->depvar_mean;
205 for (i = 0; i < l->n_indeps; i++)
207 double tmp = gsl_matrix_get (sw, i, l->n_indeps);
208 l->coeff[i] = tmp;
209 m -= tmp * linreg_get_indep_variable_mean (l, i);
212 Get the covariance matrix of the parameter estimates.
213 Only the upper triangle is necessary.
217 The loops below do not compute the entries related
218 to the estimated intercept.
220 for (i = 0; i < l->n_indeps; i++)
221 for (j = i; j < l->n_indeps; j++)
223 double tmp = -1.0 * l->mse * gsl_matrix_get (sw, i, j);
224 gsl_matrix_set (l->cov, i + 1, j + 1, tmp);
227 if (! l->origin)
229 gsl_matrix *xm;
230 gsl_matrix_view xtx;
231 gsl_matrix_view xmxtx;
233 Get the covariances related to the intercept.
235 xtx = gsl_matrix_submatrix (sw, 0, 0, l->n_indeps, l->n_indeps);
236 xmxtx = gsl_matrix_submatrix (l->cov, 0, 1, 1, l->n_indeps);
237 xm = gsl_matrix_calloc (1, l->n_indeps);
238 for (i = 0; i < xm->size2; i++)
240 gsl_matrix_set (xm, 0, i,
241 linreg_get_indep_variable_mean (l, i));
243 rc = gsl_blas_dsymm (CblasRight, CblasUpper, l->mse,
244 &xtx.matrix, xm, 0.0, &xmxtx.matrix);
245 gsl_matrix_free (xm);
246 if (rc == GSL_SUCCESS)
248 double tmp = l->mse / l->n_obs;
249 for (i = 1; i < 1 + l->n_indeps; i++)
251 tmp -= gsl_matrix_get (l->cov, 0, i)
252 * linreg_get_indep_variable_mean (l, i - 1);
254 gsl_matrix_set (l->cov, 0, 0, tmp);
256 l->intercept = m;
258 else
260 fprintf (stderr, "%s:%d:gsl_blas_dsymm: %s\n",
261 __FILE__, __LINE__, gsl_strerror (rc));
262 exit (rc);
268 Predict the value of the dependent variable with the new set of
269 predictors. VALS are assumed to be in the order corresponding to the
270 order of the coefficients in the linreg struct.
272 double
273 linreg_predict (const struct linreg *c, const double *vals, size_t n_vals)
275 size_t j;
276 double result;
278 if (vals == NULL || c == NULL)
280 return GSL_NAN;
282 assert (n_vals == c->n_coeffs);
283 if (c->coeff == NULL)
285 /* The stupid model: just guess the mean. */
286 return c->depvar_mean;
288 result = c->intercept;
290 for (j = 0; j < n_vals; j++)
292 result += linreg_coeff (c, j) * vals[j];
295 return result;
298 double
299 linreg_residual (const struct linreg *c, double obs, const double *vals, size_t n_vals)
301 if (vals == NULL || c == NULL)
303 return GSL_NAN;
305 return (obs - linreg_predict (c, vals, n_vals));
309 Mean of the independent variable.
311 double
312 linreg_get_indep_variable_mean (const struct linreg *c, size_t j)
314 assert (c != NULL);
315 return gsl_vector_get (c->indep_means, j);
318 void
319 linreg_set_indep_variable_mean (struct linreg *c, size_t j, double m)
321 assert (c != NULL);
322 gsl_vector_set (c->indep_means, j, m);
325 #if 0
326 static void
327 linreg_fit_qr (const gsl_matrix *cov, struct linreg *l)
329 double intcpt_coef = 0.0;
330 double intercept_variance = 0.0;
331 gsl_matrix *xtx;
332 gsl_matrix *q;
333 gsl_matrix *r;
334 gsl_vector *xty;
335 gsl_vector *tau;
336 gsl_vector *params;
337 size_t i;
338 size_t j;
340 xtx = gsl_matrix_alloc (cov->size1 - 1, cov->size2 - 1);
341 xty = gsl_vector_alloc (cov->size1 - 1);
342 tau = gsl_vector_alloc (cov->size1 - 1);
343 params = gsl_vector_alloc (cov->size1 - 1);
345 for (i = 0; i < xtx->size1; i++)
347 gsl_vector_set (xty, i, gsl_matrix_get (cov, cov->size2 - 1, i));
348 for (j = 0; j < xtx->size2; j++)
350 gsl_matrix_set (xtx, i, j, gsl_matrix_get (cov, i, j));
353 gsl_linalg_QR_decomp (xtx, tau);
354 q = gsl_matrix_alloc (xtx->size1, xtx->size2);
355 r = gsl_matrix_alloc (xtx->size1, xtx->size2);
357 gsl_linalg_QR_unpack (xtx, tau, q, r);
358 gsl_linalg_QR_solve (xtx, tau, xty, params);
359 for (i = 0; i < params->size; i++)
361 l->coeff[i] = gsl_vector_get (params, i);
363 l->sst = gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1);
364 l->ssm = 0.0;
365 for (i = 0; i < l->n_indeps; i++)
367 l->ssm += gsl_vector_get (xty, i) * l->coeff[i];
369 l->sse = l->sst - l->ssm;
371 gsl_blas_dtrsm (CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, linreg_mse (l),
372 r, q);
373 /* Copy the lower triangle into the upper triangle. */
374 for (i = 0; i < q->size1; i++)
376 gsl_matrix_set (l->cov, i + 1, i + 1, gsl_matrix_get (q, i, i));
377 for (j = i + 1; j < q->size2; j++)
379 intercept_variance -= 2.0 * gsl_matrix_get (q, i, j) *
380 linreg_get_indep_variable_mean (l, i) *
381 linreg_get_indep_variable_mean (l, j);
382 gsl_matrix_set (q, i, j, gsl_matrix_get (q, j, i));
386 if (!l->origin)
388 l->intercept = linreg_get_depvar_mean (l);
389 for (i = 0; i < l->n_indeps; i++)
391 double tmp = linreg_get_indep_variable_mean (l, i);
392 l->intercept -= l->coeff[i] * tmp;
393 intercept_variance += tmp * tmp * gsl_matrix_get (q, i, i);
396 /* Covariances related to the intercept. */
397 intercept_variance += linreg_mse (l) / linreg_n_obs (l);
398 gsl_matrix_set (l->cov, 0, 0, intercept_variance);
399 for (i = 0; i < q->size1; i++)
401 for (j = 0; j < q->size2; j++)
403 intcpt_coef -= gsl_matrix_get (q, i, j)
404 * linreg_get_indep_variable_mean (l, j);
406 gsl_matrix_set (l->cov, 0, i + 1, intcpt_coef);
407 gsl_matrix_set (l->cov, i + 1, 0, intcpt_coef);
408 intcpt_coef = 0.0;
412 gsl_matrix_free (q);
413 gsl_matrix_free (r);
414 gsl_vector_free (xty);
415 gsl_vector_free (tau);
416 gsl_matrix_free (xtx);
417 gsl_vector_free (params);
419 #endif
421 #define REG_LARGE_DATA 1000
424 Estimate the model parameters from the covariance matrix. This
425 function assumes the covariance entries corresponding to the
426 dependent variable are in the final row and column of the covariance
427 matrix.
429 void
430 linreg_fit (const gsl_matrix *cov, struct linreg *l)
432 assert (l != NULL);
433 assert (cov != NULL);
435 l->sst = gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1);
437 #if 0
438 /* This QR decomposition path seems to produce the incorrect
439 values. See https://savannah.gnu.org/bugs/?51373 */
440 if ((l->n_obs * l->n_obs > l->n_indeps) && (l->n_obs > REG_LARGE_DATA))
443 For large data sets, use QR decomposition.
445 linreg_fit_qr (cov, l);
447 else
448 #endif
450 gsl_matrix *params = gsl_matrix_calloc (cov->size1, cov->size2);
451 gsl_matrix_memcpy (params, cov);
452 reg_sweep (params, l->dependent_column);
453 post_sweep_computations (l, params);
454 gsl_matrix_free (params);
458 double
459 linreg_mse (const struct linreg *c)
461 assert (c != NULL);
462 return (c->sse / c->dfe);
465 double
466 linreg_intercept (const struct linreg *c)
468 return c->intercept;
471 const gsl_matrix *
472 linreg_cov (const struct linreg *c)
474 return c->cov;
477 double
478 linreg_coeff (const struct linreg *c, size_t i)
480 return (c->coeff[i]);
483 const struct variable *
484 linreg_indep_var (const struct linreg *c, size_t i)
486 return (c->indep_vars[i]);
490 linreg_n_indeps (const struct linreg *c)
492 return c->n_indeps;
496 const struct variable *
497 linreg_dep_var (const struct linreg *c)
499 return c->depvar;
503 size_t
504 linreg_n_coeffs (const struct linreg *c)
506 return c->n_coeffs;
509 double
510 linreg_n_obs (const struct linreg *c)
512 return c->n_obs;
515 double
516 linreg_sse (const struct linreg *c)
518 return c->sse;
521 double
522 linreg_ssreg (const struct linreg *c)
524 return (c->sst - c->sse);
527 double linreg_sst (const struct linreg *c)
529 return c->sst;
532 double
533 linreg_dfmodel (const struct linreg *c)
535 return c->dfm;
538 double
539 linreg_dferror (const struct linreg *c)
541 return c->dfe;
544 double
545 linreg_dftotal (const struct linreg *c)
547 return c->dft;
550 void
551 linreg_set_depvar_mean (struct linreg *c, double x)
553 c->depvar_mean = x;
556 double
557 linreg_get_depvar_mean (const struct linreg *c)
559 return c->depvar_mean;