Change how checking for missing values works.
[pspp.git] / src / language / stats / median.c
blob93ba08e21033ab35a2a81a6adb6938bf56e35dae
1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>.
18 #include <config.h>
19 #include "median.h"
21 #include <gsl/gsl_cdf.h>
23 #include "data/format.h"
26 #include "data/variable.h"
27 #include "data/case.h"
28 #include "data/dictionary.h"
29 #include "data/dataset.h"
30 #include "data/casereader.h"
31 #include "data/casewriter.h"
32 #include "data/subcase.h"
33 #include "data/value.h"
35 #include "math/percentiles.h"
36 #include "math/sort.h"
38 #include "libpspp/cast.h"
39 #include "libpspp/hmap.h"
40 #include "libpspp/array.h"
41 #include "libpspp/str.h"
42 #include "libpspp/misc.h"
44 #include "output/pivot-table.h"
46 #include "gettext.h"
47 #define N_(msgid) msgid
48 #define _(msgid) gettext (msgid)
51 struct val_node
53 struct hmap_node node;
54 union value val;
55 casenumber le;
56 casenumber gt;
59 struct results
61 const struct variable *var;
62 struct val_node **sorted_array;
63 double n;
64 double median;
65 double chisq;
70 static int
71 val_node_cmp_3way (const void *a_, const void *b_, const void *aux)
73 const struct variable *indep_var = aux;
74 const struct val_node *const *a = a_;
75 const struct val_node *const *b = b_;
77 return value_compare_3way (&(*a)->val, &(*b)->val, var_get_width (indep_var));
80 static void
81 show_frequencies (const struct n_sample_test *nst, const struct results *results, int n_vals, const struct dictionary *);
83 static void
84 show_test_statistics (const struct n_sample_test *nst, const struct results *results, int, const struct dictionary *);
87 static struct val_node *
88 find_value (const struct hmap *map, const union value *val,
89 const struct variable *var)
91 struct val_node *foo = NULL;
92 size_t hash = value_hash (val, var_get_width (var), 0);
93 HMAP_FOR_EACH_WITH_HASH (foo, struct val_node, node, hash, map)
94 if (value_equal (val, &foo->val, var_get_width (var)))
95 break;
97 return foo;
100 void
101 median_execute (const struct dataset *ds,
102 struct casereader *input,
103 enum mv_class exclude,
104 const struct npar_test *test,
105 bool exact UNUSED,
106 double timer UNUSED)
108 const struct dictionary *dict = dataset_dict (ds);
109 const struct variable *wvar = dict_get_weight (dict);
110 bool warn = true;
111 int v;
112 const struct median_test *mt = UP_CAST (test, const struct median_test,
113 parent.parent);
115 const struct n_sample_test *nst = UP_CAST (test, const struct n_sample_test,
116 parent);
118 const bool n_sample_test = (value_compare_3way (&nst->val2, &nst->val1,
119 var_get_width (nst->indep_var)) > 0);
121 struct results *results = XCALLOC (nst->n_vars, struct results);
122 int n_vals = 0;
123 for (v = 0; v < nst->n_vars; ++v)
125 double count = 0;
126 double cc = 0;
127 double median = mt->median;
128 const struct variable *var = nst->vars[v];
129 struct ccase *c;
130 struct hmap map = HMAP_INITIALIZER (map);
131 struct casereader *r = casereader_clone (input);
135 if (n_sample_test == false)
137 struct val_node *vn = XZALLOC (struct val_node);
138 value_clone (&vn->val, &nst->val1, var_get_width (nst->indep_var));
139 hmap_insert (&map, &vn->node, value_hash (&nst->val1,
140 var_get_width (nst->indep_var), 0));
142 vn = xzalloc (sizeof *vn);
143 value_clone (&vn->val, &nst->val2, var_get_width (nst->indep_var));
144 hmap_insert (&map, &vn->node, value_hash (&nst->val2,
145 var_get_width (nst->indep_var), 0));
148 if (median == SYSMIS)
150 struct percentile *ptl;
151 struct order_stats *os;
153 struct casereader *rr;
154 struct subcase sc;
155 struct casewriter *writer;
156 subcase_init_var (&sc, var, SC_ASCEND);
157 rr = casereader_clone (r);
158 writer = sort_create_writer (&sc, casereader_get_proto (rr));
160 for (; (c = casereader_read (rr)) != NULL;)
162 if (var_is_value_missing (var, case_data (c, var)) & exclude)
164 case_unref (c);
165 continue;
168 cc += dict_get_case_weight (dict, c, &warn);
169 casewriter_write (writer, c);
171 subcase_destroy (&sc);
172 casereader_destroy (rr);
174 rr = casewriter_make_reader (writer);
176 ptl = percentile_create (0.5, cc);
177 os = &ptl->parent;
179 order_stats_accumulate (&os, 1,
181 wvar,
182 var,
183 exclude);
185 median = percentile_calculate (ptl, PC_HAVERAGE);
186 statistic_destroy (&ptl->parent.parent);
189 results[v].median = median;
192 for (; (c = casereader_read (r)) != NULL; case_unref (c))
194 struct val_node *vn ;
195 const double weight = dict_get_case_weight (dict, c, &warn);
196 const union value *val = case_data (c, var);
197 const union value *indep_val = case_data (c, nst->indep_var);
199 if (var_is_value_missing (var, case_data (c, var)) & exclude)
201 continue;
204 if (n_sample_test)
206 int width = var_get_width (nst->indep_var);
207 /* Ignore out of range values */
208 if (
209 value_compare_3way (indep_val, &nst->val1, width) < 0
211 value_compare_3way (indep_val, &nst->val2, width) > 0
214 continue;
218 vn = find_value (&map, indep_val, nst->indep_var);
219 if (vn == NULL)
221 if (n_sample_test == true)
223 int width = var_get_width (nst->indep_var);
224 vn = xzalloc (sizeof *vn);
225 value_clone (&vn->val, indep_val, width);
227 hmap_insert (&map, &vn->node, value_hash (indep_val, width, 0));
229 else
231 continue;
235 if (val->f <= median)
236 vn->le += weight;
237 else
238 vn->gt += weight;
240 count += weight;
242 casereader_destroy (r);
245 int x = 0;
246 struct val_node *vn = NULL;
247 double r_0 = 0;
248 double r_1 = 0;
249 HMAP_FOR_EACH (vn, struct val_node, node, &map)
251 r_0 += vn->le;
252 r_1 += vn->gt;
255 results[v].n = count;
256 results[v].sorted_array = XCALLOC (hmap_count (&map), struct val_node *);
257 results[v].var = var;
259 HMAP_FOR_EACH (vn, struct val_node, node, &map)
261 double e_0j = r_0 * (vn->le + vn->gt) / count;
262 double e_1j = r_1 * (vn->le + vn->gt) / count;
264 results[v].chisq += pow2 (vn->le - e_0j) / e_0j;
265 results[v].chisq += pow2 (vn->gt - e_1j) / e_1j;
267 results[v].sorted_array[x++] = vn;
270 n_vals = x;
271 hmap_destroy (&map);
273 sort (results[v].sorted_array, x, sizeof (*results[v].sorted_array),
274 val_node_cmp_3way, nst->indep_var);
279 casereader_destroy (input);
281 show_frequencies (nst, results, n_vals, dict);
282 show_test_statistics (nst, results, n_vals, dict);
284 for (v = 0; v < nst->n_vars; ++v)
286 int i;
287 const struct results *rs = results + v;
289 for (i = 0; i < n_vals; ++i)
291 struct val_node *vn = rs->sorted_array[i];
292 value_destroy (&vn->val, var_get_width (nst->indep_var));
293 free (vn);
295 free (rs->sorted_array);
297 free (results);
302 static void
303 show_frequencies (const struct n_sample_test *nst, const struct results *results, int n_vals, const struct dictionary *dict)
305 struct pivot_table *table = pivot_table_create (N_("Frequencies"));
306 pivot_table_set_weight_var (table, dict_get_weight (dict));
308 struct pivot_dimension *indep = pivot_dimension_create__ (
309 table, PIVOT_AXIS_COLUMN, pivot_value_new_variable (nst->indep_var));
310 indep->root->show_label = true;
311 for (int i = 0; i < n_vals; ++i)
312 pivot_category_create_leaf_rc (
313 indep->root, pivot_value_new_var_value (
314 nst->indep_var, &results->sorted_array[i]->val), PIVOT_RC_COUNT);
315 pivot_dimension_create (table, PIVOT_AXIS_ROW, N_("Statistics"),
316 N_("> Median"), N_("≤ Median"));
318 struct pivot_dimension *dep = pivot_dimension_create (
319 table, PIVOT_AXIS_ROW, N_("Dependent Variables"));
321 for (int v = 0; v < nst->n_vars; ++v)
323 const struct results *rs = &results[v];
325 int dep_idx = pivot_category_create_leaf (
326 dep->root, pivot_value_new_variable (rs->var));
328 for (int indep_idx = 0; indep_idx < n_vals; indep_idx++)
330 const struct val_node *vn = rs->sorted_array[indep_idx];
331 pivot_table_put3 (table, indep_idx, 0, dep_idx,
332 pivot_value_new_number (vn->gt));
333 pivot_table_put3 (table, indep_idx, 1, dep_idx,
334 pivot_value_new_number (vn->le));
338 pivot_table_submit (table);
341 static void
342 show_test_statistics (const struct n_sample_test *nst,
343 const struct results *results,
344 int n_vals,
345 const struct dictionary *dict)
347 struct pivot_table *table = pivot_table_create (N_("Test Statistics"));
348 pivot_table_set_weight_var (table, dict_get_weight (dict));
350 pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Statistics"),
351 N_("N"), PIVOT_RC_COUNT,
352 N_("Median"),
353 N_("Chi-Square"), PIVOT_RC_OTHER,
354 N_("df"), PIVOT_RC_COUNT,
355 N_("Asymp. Sig."), PIVOT_RC_SIGNIFICANCE);
357 struct pivot_dimension *variables = pivot_dimension_create (
358 table, PIVOT_AXIS_ROW, N_("Variables"));
360 for (int v = 0; v < nst->n_vars; ++v)
362 double df = n_vals - 1;
363 const struct results *rs = &results[v];
365 int var_idx = pivot_category_create_leaf (
366 variables->root, pivot_value_new_variable (rs->var));
368 double entries[] = {
369 rs->n,
370 rs->median,
371 rs->chisq,
373 gsl_cdf_chisq_Q (rs->chisq, df),
375 for (size_t i = 0; i < sizeof entries / sizeof *entries; i++)
377 struct pivot_value *value
378 = pivot_value_new_number (entries[i]);
379 if (i == 1)
380 value->numeric.format = *var_get_print_format (rs->var);
381 pivot_table_put2 (table, i, var_idx, value);
385 pivot_table_submit (table);