Change how checking for missing values works.
[pspp.git] / src / language / stats / descriptives.c
blobfdb454c2600af97bbffdb0a609d698d20885776c
1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-2000, 2009-2014 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
17 #include <config.h>
19 #include <float.h>
20 #include <limits.h>
21 #include <math.h>
22 #include <stdlib.h>
24 #include "data/casegrouper.h"
25 #include "data/casereader.h"
26 #include "data/casewriter.h"
27 #include "data/dataset.h"
28 #include "data/dictionary.h"
29 #include "data/transformations.h"
30 #include "data/variable.h"
31 #include "language/command.h"
32 #include "language/dictionary/split-file.h"
33 #include "language/lexer/lexer.h"
34 #include "language/lexer/variable-parser.h"
35 #include "libpspp/array.h"
36 #include "libpspp/assertion.h"
37 #include "libpspp/compiler.h"
38 #include "libpspp/i18n.h"
39 #include "libpspp/message.h"
40 #include "math/moments.h"
41 #include "output/pivot-table.h"
43 #include "gl/xalloc.h"
45 #include "gettext.h"
46 #define _(msgid) gettext (msgid)
47 #define N_(msgid) msgid
49 /* DESCRIPTIVES private data. */
51 struct dsc_proc;
53 /* Handling of missing values. */
54 enum dsc_missing_type
56 DSC_VARIABLE, /* Handle missing values on a per-variable basis. */
57 DSC_LISTWISE /* Discard entire case if any variable is missing. */
60 /* Describes properties of a distribution for the purpose of
61 calculating a Z-score. */
62 struct dsc_z_score
64 const struct variable *src_var; /* Variable on which z-score is based. */
65 struct variable *z_var; /* New z-score variable. */
66 double mean; /* Distribution mean. */
67 double std_dev; /* Distribution standard deviation. */
70 /* DESCRIPTIVES transformation (for calculating Z-scores). */
71 struct dsc_trns
73 struct dsc_z_score *z_scores; /* Array of Z-scores. */
74 int n_z_scores; /* Number of Z-scores. */
75 const struct variable **vars; /* Variables for listwise missing checks. */
76 size_t n_vars; /* Number of variables. */
77 enum dsc_missing_type missing_type; /* Treatment of missing values. */
78 enum mv_class exclude; /* Classes of missing values to exclude. */
79 const struct variable *filter; /* Dictionary FILTER BY variable. */
80 struct casereader *z_reader; /* Reader for count, mean, stddev. */
81 casenumber count; /* Number left in this SPLIT FILE group.*/
82 bool ok;
85 /* Statistics. Used as bit indexes, so must be 32 or fewer. */
86 enum dsc_statistic
88 DSC_MEAN = 0, DSC_SEMEAN, DSC_STDDEV, DSC_VARIANCE, DSC_KURTOSIS,
89 DSC_SEKURT, DSC_SKEWNESS, DSC_SESKEW, DSC_RANGE, DSC_MIN,
90 DSC_MAX, DSC_SUM, DSC_N_STATS,
92 /* Only valid as sort criteria. */
93 DSC_NAME = -2, /* Sort by name. */
94 DSC_NONE = -1 /* Unsorted. */
97 /* Describes one statistic. */
98 struct dsc_statistic_info
100 const char *identifier; /* Identifier. */
101 const char *name; /* Full name. */
102 enum moment moment; /* Highest moment needed to calculate. */
105 /* Table of statistics, indexed by DSC_*. */
106 static const struct dsc_statistic_info dsc_info[DSC_N_STATS] =
108 {"MEAN", N_("Mean"), MOMENT_MEAN},
109 {"SEMEAN", N_("S.E. Mean"), MOMENT_VARIANCE},
110 {"STDDEV", N_("Std Dev"), MOMENT_VARIANCE},
111 {"VARIANCE", N_("Variance"), MOMENT_VARIANCE},
112 {"KURTOSIS", N_("Kurtosis"), MOMENT_KURTOSIS},
113 {"SEKURTOSIS", N_("S.E. Kurt"), MOMENT_NONE},
114 {"SKEWNESS", N_("Skewness"), MOMENT_SKEWNESS},
115 {"SESKEWNESS", N_("S.E. Skew"), MOMENT_NONE},
116 {"RANGE", N_("Range"), MOMENT_NONE},
117 {"MINIMUM", N_("Minimum"), MOMENT_NONE},
118 {"MAXIMUM", N_("Maximum"), MOMENT_NONE},
119 {"SUM", N_("Sum"), MOMENT_MEAN},
122 /* Statistics calculated by default if none are explicitly
123 requested. */
124 #define DEFAULT_STATS \
125 ((1ul << DSC_MEAN) | (1ul << DSC_STDDEV) | (1ul << DSC_MIN) \
126 | (1ul << DSC_MAX))
128 /* A variable specified on DESCRIPTIVES. */
129 struct dsc_var
131 const struct variable *v; /* Variable to calculate on. */
132 char *z_name; /* Name for z-score variable. */
133 double valid, missing; /* Valid, missing counts. */
134 struct moments *moments; /* Moments. */
135 double min, max; /* Maximum and mimimum values. */
136 double stats[DSC_N_STATS]; /* All the stats' values. */
139 /* A DESCRIPTIVES procedure. */
140 struct dsc_proc
142 /* Per-variable info. */
143 struct dictionary *dict; /* Dictionary. */
144 struct dsc_var *vars; /* Variables. */
145 size_t n_vars; /* Number of variables. */
147 /* User options. */
148 enum dsc_missing_type missing_type; /* Treatment of missing values. */
149 enum mv_class exclude; /* Classes of missing values to exclude. */
151 /* Accumulated results. */
152 double missing_listwise; /* Sum of weights of cases missing listwise. */
153 double valid; /* Sum of weights of valid cases. */
154 bool bad_warn; /* Warn if bad weight found. */
155 enum dsc_statistic sort_by_stat; /* Statistic to sort by; -1: name. */
156 int sort_ascending; /* !0: ascending order; 0: descending. */
157 unsigned long show_stats; /* Statistics to display. */
158 unsigned long calc_stats; /* Statistics to calculate. */
159 enum moment max_moment; /* Highest moment needed for stats. */
161 /* Z scores. */
162 struct casewriter *z_writer; /* Mean and stddev per SPLIT FILE group. */
165 /* Parsing. */
166 static enum dsc_statistic match_statistic (struct lexer *);
167 static void free_dsc_proc (struct dsc_proc *);
169 /* Z-score functions. */
170 static bool try_name (const struct dictionary *dict,
171 struct dsc_proc *dsc, const char *name);
172 static char *generate_z_varname (const struct dictionary *dict,
173 struct dsc_proc *dsc,
174 const char *name, int *n_zs);
175 static void dump_z_table (struct dsc_proc *);
176 static void setup_z_trns (struct dsc_proc *, struct dataset *);
178 /* Procedure execution functions. */
179 static void calc_descriptives (struct dsc_proc *, struct casereader *,
180 struct dataset *);
181 static void display (struct dsc_proc *dsc);
183 /* Parser and outline. */
185 /* Handles DESCRIPTIVES. */
187 cmd_descriptives (struct lexer *lexer, struct dataset *ds)
189 struct dictionary *dict = dataset_dict (ds);
190 struct dsc_proc *dsc;
191 const struct variable **vars = NULL;
192 size_t n_vars = 0;
193 int save_z_scores = 0;
194 int n_zs = 0;
195 size_t i;
196 bool ok;
198 struct casegrouper *grouper;
199 struct casereader *group;
201 /* Create and initialize dsc. */
202 dsc = xmalloc (sizeof *dsc);
203 dsc->dict = dict;
204 dsc->vars = NULL;
205 dsc->n_vars = 0;
206 dsc->missing_type = DSC_VARIABLE;
207 dsc->exclude = MV_ANY;
208 dsc->missing_listwise = 0.;
209 dsc->valid = 0.;
210 dsc->bad_warn = 1;
211 dsc->sort_by_stat = DSC_NONE;
212 dsc->sort_ascending = 1;
213 dsc->show_stats = dsc->calc_stats = DEFAULT_STATS;
214 dsc->z_writer = NULL;
216 /* Parse DESCRIPTIVES. */
217 while (lex_token (lexer) != T_ENDCMD)
219 if (lex_match_id (lexer, "MISSING"))
221 lex_match (lexer, T_EQUALS);
222 while (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_SLASH)
224 if (lex_match_id (lexer, "VARIABLE"))
225 dsc->missing_type = DSC_VARIABLE;
226 else if (lex_match_id (lexer, "LISTWISE"))
227 dsc->missing_type = DSC_LISTWISE;
228 else if (lex_match_id (lexer, "INCLUDE"))
229 dsc->exclude = MV_SYSTEM;
230 else
232 lex_error (lexer, NULL);
233 goto error;
235 lex_match (lexer, T_COMMA);
238 else if (lex_match_id (lexer, "SAVE"))
239 save_z_scores = 1;
240 else if (lex_match_id (lexer, "FORMAT"))
242 lex_match (lexer, T_EQUALS);
243 while (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_SLASH)
245 if (lex_match_id (lexer, "LABELS")
246 || lex_match_id (lexer, "NOLABELS")
247 || lex_match_id (lexer, "INDEX")
248 || lex_match_id (lexer, "NOINDEX")
249 || lex_match_id (lexer, "LINE")
250 || lex_match_id (lexer, "SERIAL"))
252 /* Ignore. */
254 else
256 lex_error (lexer, NULL);
257 goto error;
259 lex_match (lexer, T_COMMA);
262 else if (lex_match_id (lexer, "STATISTICS"))
264 lex_match (lexer, T_EQUALS);
265 dsc->show_stats = 0;
266 while (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_SLASH)
268 if (lex_match (lexer, T_ALL))
269 dsc->show_stats |= (1ul << DSC_N_STATS) - 1;
270 else if (lex_match_id (lexer, "DEFAULT"))
271 dsc->show_stats |= DEFAULT_STATS;
272 else
274 enum dsc_statistic s = match_statistic (lexer);
275 if (s == DSC_NONE)
277 lex_error (lexer, NULL);
278 goto error;
280 dsc->show_stats |= 1ul << s;
282 lex_match (lexer, T_COMMA);
284 if (dsc->show_stats == 0)
285 dsc->show_stats = DEFAULT_STATS;
287 else if (lex_match_id (lexer, "SORT"))
289 lex_match (lexer, T_EQUALS);
290 if (lex_match_id (lexer, "NAME"))
291 dsc->sort_by_stat = DSC_NAME;
292 else
294 dsc->sort_by_stat = match_statistic (lexer);
295 if (dsc->sort_by_stat == DSC_NONE)
296 dsc->sort_by_stat = DSC_MEAN;
298 if (lex_match (lexer, T_LPAREN))
300 if (lex_match_id (lexer, "A"))
301 dsc->sort_ascending = 1;
302 else if (lex_match_id (lexer, "D"))
303 dsc->sort_ascending = 0;
304 else
305 lex_error (lexer, NULL);
306 if (! lex_force_match (lexer, T_RPAREN))
307 goto error;
310 else if (n_vars == 0)
312 if (lex_next_token (lexer, 1) == T_EQUALS)
314 lex_match_id (lexer, "VARIABLES");
315 lex_match (lexer, T_EQUALS);
318 while (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_SLASH)
320 int i;
322 if (!parse_variables_const (lexer, dict, &vars, &n_vars,
323 PV_APPEND | PV_NO_DUPLICATE | PV_NUMERIC))
324 goto error;
326 dsc->vars = xnrealloc ((void *)dsc->vars, n_vars, sizeof *dsc->vars);
327 for (i = dsc->n_vars; i < n_vars; i++)
329 struct dsc_var *dv = &dsc->vars[i];
330 dv->v = vars[i];
331 dv->z_name = NULL;
332 dv->moments = NULL;
334 dsc->n_vars = n_vars;
336 if (lex_match (lexer, T_LPAREN))
338 if (lex_token (lexer) != T_ID)
340 lex_error (lexer, NULL);
341 goto error;
343 if (try_name (dict, dsc, lex_tokcstr (lexer)))
345 struct dsc_var *dsc_var = &dsc->vars[dsc->n_vars - 1];
346 dsc_var->z_name = xstrdup (lex_tokcstr (lexer));
347 n_zs++;
349 else
350 msg (SE, _("Z-score variable name %s would be"
351 " a duplicate variable name."), lex_tokcstr (lexer));
352 lex_get (lexer);
353 if (!lex_force_match (lexer, T_RPAREN))
354 goto error;
358 else
360 lex_error (lexer, NULL);
361 goto error;
364 lex_match (lexer, T_SLASH);
366 if (n_vars == 0)
368 msg (SE, _("No variables specified."));
369 goto error;
372 /* Construct z-score varnames, show translation table. */
373 if (n_zs || save_z_scores)
375 struct caseproto *proto;
377 if (save_z_scores)
379 int n_gens = 0;
381 for (i = 0; i < dsc->n_vars; i++)
383 struct dsc_var *dsc_var = &dsc->vars[i];
384 if (dsc_var->z_name == NULL)
386 const char *name = var_get_name (dsc_var->v);
387 dsc_var->z_name = generate_z_varname (dict, dsc, name,
388 &n_gens);
389 if (dsc_var->z_name == NULL)
390 goto error;
392 n_zs++;
397 /* It would be better to handle Z scores correctly (however we define
398 that) when TEMPORARY is in effect, but in the meantime this at least
399 prevents a use-after-free error. See bug #38786. */
400 if (proc_make_temporary_transformations_permanent (ds))
401 msg (SW, _("DESCRIPTIVES with Z scores ignores TEMPORARY. "
402 "Temporary transformations will be made permanent."));
404 proto = caseproto_create ();
405 for (i = 0; i < 1 + 2 * n_zs; i++)
406 proto = caseproto_add_width (proto, 0);
407 dsc->z_writer = autopaging_writer_create (proto);
408 caseproto_unref (proto);
410 dump_z_table (dsc);
413 /* Figure out statistics to display. */
414 if (dsc->show_stats & (1ul << DSC_SKEWNESS))
415 dsc->show_stats |= 1ul << DSC_SESKEW;
416 if (dsc->show_stats & (1ul << DSC_KURTOSIS))
417 dsc->show_stats |= 1ul << DSC_SEKURT;
419 /* Figure out which statistics to calculate. */
420 dsc->calc_stats = dsc->show_stats;
421 if (n_zs > 0)
422 dsc->calc_stats |= (1ul << DSC_MEAN) | (1ul << DSC_STDDEV);
423 if (dsc->sort_by_stat >= 0)
424 dsc->calc_stats |= 1ul << dsc->sort_by_stat;
425 if (dsc->show_stats & (1ul << DSC_SESKEW))
426 dsc->calc_stats |= 1ul << DSC_SKEWNESS;
427 if (dsc->show_stats & (1ul << DSC_SEKURT))
428 dsc->calc_stats |= 1ul << DSC_KURTOSIS;
430 /* Figure out maximum moment needed and allocate moments for
431 the variables. */
432 dsc->max_moment = MOMENT_NONE;
433 for (i = 0; i < DSC_N_STATS; i++)
434 if (dsc->calc_stats & (1ul << i) && dsc_info[i].moment > dsc->max_moment)
435 dsc->max_moment = dsc_info[i].moment;
436 if (dsc->max_moment != MOMENT_NONE)
437 for (i = 0; i < dsc->n_vars; i++)
438 dsc->vars[i].moments = moments_create (dsc->max_moment);
440 /* Data pass. */
441 grouper = casegrouper_create_splits (proc_open_filtering (ds, false), dict);
442 while (casegrouper_get_next_group (grouper, &group))
443 calc_descriptives (dsc, group, ds);
444 ok = casegrouper_destroy (grouper);
445 ok = proc_commit (ds) && ok;
447 /* Z-scoring! */
448 if (ok && n_zs)
449 setup_z_trns (dsc, ds);
451 /* Done. */
452 free (vars);
453 free_dsc_proc (dsc);
454 return ok ? CMD_SUCCESS : CMD_CASCADING_FAILURE;
456 error:
457 free (vars);
458 free_dsc_proc (dsc);
459 return CMD_FAILURE;
462 /* Returns the statistic named by the current token and skips past the token.
463 Returns DSC_NONE if no statistic is given (e.g., subcommand with no
464 specifiers). Emits an error if the current token ID does not name a
465 statistic. */
466 static enum dsc_statistic
467 match_statistic (struct lexer *lexer)
469 if (lex_token (lexer) == T_ID)
471 enum dsc_statistic stat;
473 for (stat = 0; stat < DSC_N_STATS; stat++)
474 if (lex_match_id (lexer, dsc_info[stat].identifier))
475 return stat;
477 lex_get (lexer);
478 lex_error (lexer, _("expecting statistic name: reverting to default"));
481 return DSC_NONE;
484 /* Frees DSC. */
485 static void
486 free_dsc_proc (struct dsc_proc *dsc)
488 size_t i;
490 if (dsc == NULL)
491 return;
493 for (i = 0; i < dsc->n_vars; i++)
495 struct dsc_var *dsc_var = &dsc->vars[i];
496 free (dsc_var->z_name);
497 moments_destroy (dsc_var->moments);
499 casewriter_destroy (dsc->z_writer);
500 free (dsc->vars);
501 free (dsc);
504 /* Z scores. */
506 /* Returns false if NAME is a duplicate of any existing variable name or
507 of any previously-declared z-var name; otherwise returns true. */
508 static bool
509 try_name (const struct dictionary *dict, struct dsc_proc *dsc,
510 const char *name)
512 size_t i;
514 if (dict_lookup_var (dict, name) != NULL)
515 return false;
516 for (i = 0; i < dsc->n_vars; i++)
518 struct dsc_var *dsc_var = &dsc->vars[i];
519 if (dsc_var->z_name != NULL && !utf8_strcasecmp (dsc_var->z_name, name))
520 return false;
522 return true;
525 /* Generates a name for a Z-score variable based on a variable
526 named VAR_NAME, given that *Z_CNT generated variable names are
527 known to already exist. If successful, returns the new name
528 as a dynamically allocated string. On failure, returns NULL. */
529 static char *
530 generate_z_varname (const struct dictionary *dict, struct dsc_proc *dsc,
531 const char *var_name, int *n_zs)
533 char *z_name, *trunc_name;
535 /* Try a name based on the original variable name. */
536 z_name = xasprintf ("Z%s", var_name);
537 trunc_name = utf8_encoding_trunc (z_name, dict_get_encoding (dict),
538 ID_MAX_LEN);
539 free (z_name);
540 if (try_name (dict, dsc, trunc_name))
541 return trunc_name;
542 free (trunc_name);
544 /* Generate a synthetic name. */
545 for (;;)
547 char name[16];
549 (*n_zs)++;
551 if (*n_zs <= 99)
552 sprintf (name, "ZSC%03d", *n_zs);
553 else if (*n_zs <= 108)
554 sprintf (name, "STDZ%02d", *n_zs - 99);
555 else if (*n_zs <= 117)
556 sprintf (name, "ZZZZ%02d", *n_zs - 108);
557 else if (*n_zs <= 126)
558 sprintf (name, "ZQZQ%02d", *n_zs - 117);
559 else
561 msg (SE, _("Ran out of generic names for Z-score variables. "
562 "There are only 126 generic names: ZSC001-ZSC0999, "
563 "STDZ01-STDZ09, ZZZZ01-ZZZZ09, ZQZQ01-ZQZQ09."));
564 return NULL;
567 if (try_name (dict, dsc, name))
568 return xstrdup (name);
570 NOT_REACHED();
573 /* Outputs a table describing the mapping between source
574 variables and Z-score variables. */
575 static void
576 dump_z_table (struct dsc_proc *dsc)
578 struct pivot_table *table = pivot_table_create (
579 N_("Mapping of Variables to Z-scores"));
581 pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Names"),
582 N_("Source"), N_("Target"));
584 struct pivot_dimension *names = pivot_dimension_create (
585 table, PIVOT_AXIS_ROW, N_("Variables"));
586 names->hide_all_labels = true;
588 for (size_t i = 0; i < dsc->n_vars; i++)
589 if (dsc->vars[i].z_name != NULL)
591 int row = pivot_category_create_leaf (names->root,
592 pivot_value_new_number (i));
594 pivot_table_put2 (table, 0, row,
595 pivot_value_new_variable (dsc->vars[i].v));
596 pivot_table_put2 (table, 1, row,
597 pivot_value_new_user_text (dsc->vars[i].z_name, -1));
600 pivot_table_submit (table);
603 static void
604 descriptives_set_all_sysmis_zscores (const struct dsc_trns *t, struct ccase *c)
606 const struct dsc_z_score *z;
608 for (z = t->z_scores; z < t->z_scores + t->n_z_scores; z++)
609 *case_num_rw (c, z->z_var) = SYSMIS;
612 /* Transformation function to calculate Z-scores. Will return SYSMIS if any of
613 the following are true: 1) mean or standard deviation is SYSMIS 2) score is
614 SYSMIS 3) score is user missing and they were not included in the original
615 analyis. 4) any of the variables in the original analysis were missing
616 (either system or user-missing values that weren't included).
618 static enum trns_result
619 descriptives_trns_proc (void *trns_, struct ccase **c,
620 casenumber case_idx UNUSED)
622 struct dsc_trns *t = trns_;
623 struct dsc_z_score *z;
624 const struct variable **vars;
626 *c = case_unshare (*c);
628 if (t->filter)
630 double f = case_num (*c, t->filter);
631 if (f == 0.0 || var_is_num_missing (t->filter, f))
633 descriptives_set_all_sysmis_zscores (t, *c);
634 return TRNS_CONTINUE;
638 if (t->count <= 0)
640 struct ccase *z_case;
642 z_case = casereader_read (t->z_reader);
643 if (z_case)
645 size_t z_idx = 0;
647 t->count = case_num_idx (z_case, z_idx++);
648 for (z = t->z_scores; z < t->z_scores + t->n_z_scores; z++)
650 z->mean = case_num_idx (z_case, z_idx++);
651 z->std_dev = case_num_idx (z_case, z_idx++);
653 case_unref (z_case);
655 else
657 if (t->ok)
659 msg (SE, _("Internal error processing Z scores. "
660 "Please report this to %s."),
661 PACKAGE_BUGREPORT);
662 t->ok = false;
664 descriptives_set_all_sysmis_zscores (t, *c);
665 return TRNS_CONTINUE;
668 t->count--;
670 if (t->missing_type == DSC_LISTWISE)
672 assert(t->vars);
673 for (vars = t->vars; vars < t->vars + t->n_vars; vars++)
675 double score = case_num (*c, *vars);
676 if (var_is_num_missing (*vars, score) & t->exclude)
678 descriptives_set_all_sysmis_zscores (t, *c);
679 return TRNS_CONTINUE;
684 for (z = t->z_scores; z < t->z_scores + t->n_z_scores; z++)
686 double input = case_num (*c, z->src_var);
687 double *output = case_num_rw (*c, z->z_var);
689 if (z->mean == SYSMIS || z->std_dev == SYSMIS
690 || var_is_num_missing (z->src_var, input) & t->exclude)
691 *output = SYSMIS;
692 else
693 *output = (input - z->mean) / z->std_dev;
695 return TRNS_CONTINUE;
698 /* Frees a descriptives_trns struct. */
699 static bool
700 descriptives_trns_free (void *trns_)
702 struct dsc_trns *t = trns_;
703 bool ok = t->ok && !casereader_error (t->z_reader);
705 free (t->z_scores);
706 casereader_destroy (t->z_reader);
707 assert((t->missing_type != DSC_LISTWISE) ^ (t->vars != NULL));
708 free (t->vars);
709 free (t);
711 return ok;
714 static const struct trns_class descriptives_trns_class = {
715 .name = "DESCRIPTIVES (Z scores)",
716 .execute = descriptives_trns_proc,
717 .destroy = descriptives_trns_free,
720 /* Sets up a transformation to calculate Z scores. */
721 static void
722 setup_z_trns (struct dsc_proc *dsc, struct dataset *ds)
724 struct dsc_trns *t;
725 size_t cnt, i;
727 for (cnt = i = 0; i < dsc->n_vars; i++)
728 if (dsc->vars[i].z_name != NULL)
729 cnt++;
731 t = xmalloc (sizeof *t);
732 t->z_scores = xnmalloc (cnt, sizeof *t->z_scores);
733 t->n_z_scores = cnt;
734 t->missing_type = dsc->missing_type;
735 t->exclude = dsc->exclude;
736 if (t->missing_type == DSC_LISTWISE)
738 t->n_vars = dsc->n_vars;
739 t->vars = xnmalloc (t->n_vars, sizeof *t->vars);
740 for (i = 0; i < t->n_vars; i++)
741 t->vars[i] = dsc->vars[i].v;
743 else
745 t->n_vars = 0;
746 t->vars = NULL;
748 t->filter = dict_get_filter (dataset_dict (ds));
749 t->z_reader = casewriter_make_reader (dsc->z_writer);
750 t->count = 0;
751 t->ok = true;
752 dsc->z_writer = NULL;
754 for (cnt = i = 0; i < dsc->n_vars; i++)
756 struct dsc_var *dv = &dsc->vars[i];
757 if (dv->z_name != NULL)
759 struct dsc_z_score *z;
760 struct variable *dst_var;
761 char *label;
763 dst_var = dict_create_var_assert (dataset_dict (ds), dv->z_name, 0);
765 label = xasprintf (_("Z-score of %s"),var_to_string (dv->v));
766 var_set_label (dst_var, label);
767 free (label);
769 z = &t->z_scores[cnt++];
770 z->src_var = dv->v;
771 z->z_var = dst_var;
775 add_transformation (ds, &descriptives_trns_class, t);
778 /* Statistical calculation. */
780 static bool listwise_missing (struct dsc_proc *dsc, const struct ccase *c);
782 /* Calculates and displays descriptive statistics for the cases
783 in CF. */
784 static void
785 calc_descriptives (struct dsc_proc *dsc, struct casereader *group,
786 struct dataset *ds)
788 const struct variable *filter = dict_get_filter (dataset_dict (ds));
789 struct casereader *pass1, *pass2;
790 casenumber count;
791 struct ccase *c;
792 size_t z_idx;
793 size_t i;
795 c = casereader_peek (group, 0);
796 if (c == NULL)
798 casereader_destroy (group);
799 return;
801 output_split_file_values (ds, c);
802 case_unref (c);
804 group = casereader_create_filter_weight (group, dataset_dict (ds),
805 NULL, NULL);
807 pass1 = group;
808 pass2 = dsc->max_moment <= MOMENT_MEAN ? NULL : casereader_clone (pass1);
810 for (i = 0; i < dsc->n_vars; i++)
812 struct dsc_var *dv = &dsc->vars[i];
814 dv->valid = dv->missing = 0.0;
815 if (dv->moments != NULL)
816 moments_clear (dv->moments);
817 dv->min = DBL_MAX;
818 dv->max = -DBL_MAX;
820 dsc->missing_listwise = 0.;
821 dsc->valid = 0.;
823 /* First pass to handle most of the work. */
824 count = 0;
825 for (; (c = casereader_read (pass1)) != NULL; case_unref (c))
827 double weight = dict_get_case_weight (dataset_dict (ds), c, NULL);
829 if (filter)
831 double f = case_num (c, filter);
832 if (f == 0.0 || var_is_num_missing (filter, f))
833 continue;
836 /* Check for missing values. */
837 if (listwise_missing (dsc, c))
839 dsc->missing_listwise += weight;
840 if (dsc->missing_type == DSC_LISTWISE)
841 continue;
843 dsc->valid += weight;
845 for (i = 0; i < dsc->n_vars; i++)
847 struct dsc_var *dv = &dsc->vars[i];
848 double x = case_num (c, dv->v);
850 if (var_is_num_missing (dv->v, x) & dsc->exclude)
852 dv->missing += weight;
853 continue;
856 if (dv->moments != NULL)
857 moments_pass_one (dv->moments, x, weight);
859 if (x < dv->min)
860 dv->min = x;
861 if (x > dv->max)
862 dv->max = x;
865 count++;
867 if (!casereader_destroy (pass1))
869 casereader_destroy (pass2);
870 return;
873 /* Second pass for higher-order moments. */
874 if (dsc->max_moment > MOMENT_MEAN)
876 for (; (c = casereader_read (pass2)) != NULL; case_unref (c))
878 double weight = dict_get_case_weight (dataset_dict (ds), c, NULL);
880 if (filter)
882 double f = case_num (c, filter);
883 if (f == 0.0 || var_is_num_missing (filter, f))
884 continue;
887 /* Check for missing values. */
888 if (dsc->missing_type == DSC_LISTWISE && listwise_missing (dsc, c))
889 continue;
891 for (i = 0; i < dsc->n_vars; i++)
893 struct dsc_var *dv = &dsc->vars[i];
894 double x = case_num (c, dv->v);
896 if (var_is_num_missing (dv->v, x) & dsc->exclude)
897 continue;
899 if (dv->moments != NULL)
900 moments_pass_two (dv->moments, x, weight);
903 if (!casereader_destroy (pass2))
904 return;
907 /* Calculate results. */
908 if (dsc->z_writer && count > 0)
910 c = case_create (casewriter_get_proto (dsc->z_writer));
911 z_idx = 0;
912 *case_num_rw_idx (c, z_idx++) = count;
914 else
915 c = NULL;
917 for (i = 0; i < dsc->n_vars; i++)
919 struct dsc_var *dv = &dsc->vars[i];
920 double W;
921 int j;
923 for (j = 0; j < DSC_N_STATS; j++)
924 dv->stats[j] = SYSMIS;
926 dv->valid = W = dsc->valid - dv->missing;
928 if (dv->moments != NULL)
929 moments_calculate (dv->moments, NULL,
930 &dv->stats[DSC_MEAN], &dv->stats[DSC_VARIANCE],
931 &dv->stats[DSC_SKEWNESS], &dv->stats[DSC_KURTOSIS]);
932 if (dsc->calc_stats & (1ul << DSC_SEMEAN)
933 && dv->stats[DSC_VARIANCE] != SYSMIS && W > 0.)
934 dv->stats[DSC_SEMEAN] = sqrt (dv->stats[DSC_VARIANCE]) / sqrt (W);
935 if (dsc->calc_stats & (1ul << DSC_STDDEV)
936 && dv->stats[DSC_VARIANCE] != SYSMIS)
937 dv->stats[DSC_STDDEV] = sqrt (dv->stats[DSC_VARIANCE]);
938 if (dsc->calc_stats & (1ul << DSC_SEKURT))
939 if (dv->stats[DSC_KURTOSIS] != SYSMIS)
940 dv->stats[DSC_SEKURT] = calc_sekurt (W);
941 if (dsc->calc_stats & (1ul << DSC_SESKEW)
942 && dv->stats[DSC_SKEWNESS] != SYSMIS)
943 dv->stats[DSC_SESKEW] = calc_seskew (W);
944 dv->stats[DSC_RANGE] = ((dv->min == DBL_MAX || dv->max == -DBL_MAX)
945 ? SYSMIS : dv->max - dv->min);
946 dv->stats[DSC_MIN] = dv->min == DBL_MAX ? SYSMIS : dv->min;
947 dv->stats[DSC_MAX] = dv->max == -DBL_MAX ? SYSMIS : dv->max;
948 if (dsc->calc_stats & (1ul << DSC_SUM))
949 dv->stats[DSC_SUM] = W * dv->stats[DSC_MEAN];
951 if (dv->z_name && c != NULL)
953 *case_num_rw_idx (c, z_idx++) = dv->stats[DSC_MEAN];
954 *case_num_rw_idx (c, z_idx++) = dv->stats[DSC_STDDEV];
958 if (c != NULL)
959 casewriter_write (dsc->z_writer, c);
961 /* Output results. */
962 display (dsc);
965 /* Returns true if any of the descriptives variables in DSC's
966 variable list have missing values in case C, false otherwise. */
967 static bool
968 listwise_missing (struct dsc_proc *dsc, const struct ccase *c)
970 size_t i;
972 for (i = 0; i < dsc->n_vars; i++)
974 struct dsc_var *dv = &dsc->vars[i];
975 double x = case_num (c, dv->v);
977 if (var_is_num_missing (dv->v, x) & dsc->exclude)
978 return true;
980 return false;
983 /* Statistical display. */
985 static algo_compare_func descriptives_compare_dsc_vars;
987 /* Displays a table of descriptive statistics for DSC. */
988 static void
989 display (struct dsc_proc *dsc)
991 struct pivot_table *table = pivot_table_create (
992 N_("Descriptive Statistics"));
993 pivot_table_set_weight_var (table, dict_get_weight (dsc->dict));
995 struct pivot_dimension *statistics = pivot_dimension_create (
996 table, PIVOT_AXIS_COLUMN, N_("Statistics"));
997 pivot_category_create_leaf_rc (
998 statistics->root, pivot_value_new_text (N_("N")), PIVOT_RC_COUNT);
999 for (int i = 0; i < DSC_N_STATS; i++)
1000 if (dsc->show_stats & (1ul << i))
1001 pivot_category_create_leaf (statistics->root,
1002 pivot_value_new_text (dsc_info[i].name));
1004 if (dsc->sort_by_stat != DSC_NONE)
1005 sort (dsc->vars, dsc->n_vars, sizeof *dsc->vars,
1006 descriptives_compare_dsc_vars, dsc);
1008 struct pivot_dimension *variables = pivot_dimension_create (
1009 table, PIVOT_AXIS_ROW, N_("Variable"));
1010 for (size_t i = 0; i < dsc->n_vars; i++)
1012 const struct dsc_var *dv = &dsc->vars[i];
1014 int row = pivot_category_create_leaf (variables->root,
1015 pivot_value_new_variable (dv->v));
1017 int column = 0;
1018 pivot_table_put2 (table, column++, row,
1019 pivot_value_new_number (dv->valid));
1021 for (int j = 0; j < DSC_N_STATS; j++)
1022 if (dsc->show_stats & (1ul << j))
1024 union value v = { .f = dv->stats[j] };
1025 struct pivot_value *pv = (j == DSC_MIN || j == DSC_MAX
1026 ? pivot_value_new_var_value (dv->v, &v)
1027 : pivot_value_new_number (dv->stats[j]));
1028 pivot_table_put2 (table, column++, row, pv);
1032 int row = pivot_category_create_leaves (
1033 variables->root, N_("Valid N (listwise)"), N_("Missing N (listwise)"));
1034 pivot_table_put2 (table, 0, row, pivot_value_new_number (dsc->valid));
1035 pivot_table_put2 (table, 0, row + 1,
1036 pivot_value_new_number (dsc->missing_listwise));
1037 pivot_table_submit (table);
1040 /* Compares `struct dsc_var's A and B according to the ordering
1041 specified by CMD. */
1042 static int
1043 descriptives_compare_dsc_vars (const void *a_, const void *b_, const void *dsc_)
1045 const struct dsc_var *a = a_;
1046 const struct dsc_var *b = b_;
1047 const struct dsc_proc *dsc = dsc_;
1049 int result;
1051 if (dsc->sort_by_stat == DSC_NAME)
1052 result = utf8_strcasecmp (var_get_name (a->v), var_get_name (b->v));
1053 else
1055 double as = a->stats[dsc->sort_by_stat];
1056 double bs = b->stats[dsc->sort_by_stat];
1058 result = as < bs ? -1 : as > bs;
1061 if (!dsc->sort_ascending)
1062 result = -result;
1064 return result;