src/math/wilcoxon-sig.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2009 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 /* Thanks to Rob van Son for writing the original version of this
  18    file.  This version has been completely rewritten; only the
  19    name of the function has been retained.  In the process of
  20    rewriting, it was sped up from O(2**N) to O(N**3). */
  21
  22 #include <config.h>
  23 #include "wilcoxon-sig.h"
  24 #include <assert.h>
  25 #include <limits.h>
  26 #include <math.h>
  27 #include <stdlib.h>
  28 #include "xalloc.h"
  29
  30 /* For integers N and W, with 0 <= N < CHAR_BIT*sizeof(long),
  31    calculates and returns the value of the function S(N,W),
  32    defined as the number of subsets of 1, 2, 3, ..., N that sum
  33    to at least W.  There are 2**N subsets of N items, so S(N,W)
  34    is in the range 0...2**N.
  35
  36    There are a few trivial cases:
  37
  38            * For W <= 0, S(N,W) = 2**N.
  39
  40            * For W > N*(N+1)/2, S(N,W) = 0.
  41
  42            * S(1,1) = 1.
  43
  44    Notably, these trivial cases include all values of W for N = 1.
  45
  46    Now consider the remaining, nontrivial cases, that is, N > 1 and
  47    1 <= W <= N*(N+1)/2.  In this case, apply the following identity:
  48
  49            S(N,W) = S(N-1, W) + S(N-1, W-N).
  50
  51    The first term on the right hand is the number of subsets that do
  52    not include N that sum to at least W; the second term is the
  53    number of subsets that do include N that sum to at least W.
  54
  55    Then we repeatedly apply the identity to the result, reducing the
  56    value of N by 1 each time until we reach N=1.  Some expansions
  57    yield trivial cases, e.g. if W - N <= 0 (in which case we add a
  58    2**N term to the final result) or if W is greater than the new N.
  59
  60    Here is an example:
  61
  62    S(7,7) = S(6,7) + S(6,0)
  63           = S(6,7) + 64
  64
  65           = (S(5,7) + S(5,1)) + 64
  66
  67           = (S(4,7) + S(4,2)) + (S(4,1) + S(4,0)) + 64
  68           = S(4,7) + S(4,2) + S(4,1) + 80
  69
  70           = (S(3,7) + S(3,3)) + (S(3,2) + S(3,2)) + (S(3,1) + S(3,0)) + 80
  71           = S(3,3) + 2*S(3,2) + S(3,1) + 88
  72
  73           = (S(2,3) + S(2,0)) + 2*(S(2,2) + S(2,0)) + (S(2,1) + S(2,0)) + 88
  74           = S(2,3) + 2*S(2,2) + S(2,1) + 104
  75
  76           = (S(1,3) + S(1,1)) + 2*(S(1,2) + S(1,0)) + (S(1,1) + S(2,0)) + 104
  77           = 2*S(1,1) + 112
  78
  79           = 114
  80
  81    This function runs in O(N*W) = O(N**3) time.  It seems very
  82    likely that it can be made to run in O(N**2) time or perhaps
  83    even better, but N is, practically speaking, quite small.
  84    Plus, the return value may be as large as N**2, so N must not
  85    be larger than 31 on 32-bit systems anyhow, and even 63**3 is
  86    only 250,047.
  87 */
  88 static unsigned long int
  89 count_sums_to_W (unsigned long int n, unsigned long int w)
  90 {
  91   /* The array contain ints even though everything else is long,
  92      but no element in the array can have a value bigger than N,
  93      and using int will save some memory on 64-bit systems. */
  94   unsigned long int total;
  95   unsigned long int max;
  96   int *array;
  97
  98   assert (n < CHAR_BIT * sizeof (unsigned long int));
  99   if (n == 0)
 100     return 0;
 101   else if (w <= 0)
 102     return 1 << n;
 103   else if (w > n * (n + 1) / 2)
 104     return 0;
 105   else if (n == 1)
 106     return 1;
 107
 108   array = xcalloc (w + 1, sizeof *array);
 109   array[w] = 1;
 110
 111   max = w;
 112   total = 0;
 113   for (; n > 1; n--)
 114     {
 115       unsigned long int max_sum = n * (n + 1) / 2;
 116       int i;
 117
 118       if (max_sum < max)
 119         max = max_sum;
 120
 121       for (i = 1; i <= max; i++)
 122         if (array[i] != 0)
 123           {
 124             int new_w = i - n;
 125             if (new_w <= 0)
 126                total += array[i] * (1 << (n - 1));
 127             else
 128               array[new_w] += array[i];
 129           }
 130     }
 131   total += array[1];
 132   free (array);
 133   return total;
 134 }
 135
 136 /* Returns the exact, two-tailed level of significance for the
 137    Wilcoxon Matched-Pairs Signed-Ranks test, given sum of ranks
 138    of positive (or negative samples) W and sample size N.
 139
 140    Returns -1 if the exact significance level cannot be
 141    calculated because W is out of the supported range. */
 142 double
 143 LevelOfSignificanceWXMPSR (double w, long int n)
 144 {
 145   unsigned long int max_w;
 146
 147   /* Limit N to valid range that won't cause integer overflow. */
 148   if (n < 0 || n >= CHAR_BIT * sizeof (unsigned long int))
 149     return -1;
 150
 151   max_w = n * (n + 1) / 2;
 152   if (w < max_w / 2)
 153     w = max_w - w;
 154
 155   return count_sums_to_W (n, ceil (w)) / (double) (1 << n) * 2;
 156 }