Initial commit of newLISP.
[newlisp.git] / modules / stat.lsp
blob5a710b5d2be10ae939515d3974417e909ade03a0
1 ;; @module stat.lsp
2 ;; @description Basic statistics library
3 ;; @version 1.3 - comments redone for automatic documentation
4 ;; @author Lutz Mueller, 2001
5 ;;
6 ;; <h2>Functions for statistics and plotting with GNU plot</h2>
7 ;; To use this module it has to be loaded at the beginning of the
8 ;; program file:
9 ;; <pre>
10 ;; (load "/usr/share/newlisp/stat.lsp")
11 ;; </pre>
12 ;; All functions work on integers and floats or a mix of both. <lists> are normal
13 ;; LISP lists. <matrices> are lists of lists, one list for each row in the
14 ;; two dimensional data matrix. See the function 'stat:matrix' on how to make matrices
15 ;; from lists.
17 ;; This file also shows how to use some of the built in matrix math functions
18 ;; like 'multiply', 'transpose', 'invert' and 'fft'.
20 ;; Note, that the plot functions need 'gnuplot' installed, a free graphing
21 ;; package available for most operating systems, see: @link http://www.gnuplot.org www.gnuplot.org
23 ;; This is the oldest module written in newLISP. Some parts could be rewritten for faster
24 ;; performance on big data sets using arrays instead of lists.
25 ;; <br/>
26 ;; <center><h2>Plot functions (requires 'gnuplot')</h2></center>
28 ;; The current directory for newLISP should be writable to use these functions.
30 ;; @syntax (stat:plot <p1> <p2> ... <pN>)
31 ;; @param <p1> First value to plot.
32 ;; @param <p2> Second value to plot.
33 ;; @param <pN> Nth value to plot.
34 ;; @return The process ID of the gnuplot process.
36 ;; @syntax (stat:plotXY X Y)
37 ;; @param <X> List of x-coordinates to plot.
38 ;; @param <Y> List of y-coordinates to plot.
39 ;; @return The process ID of the gnuplot process.
41 ;; <br/>
42 ;; <center><h2>General uni- and bi- variate statistics</h2></center>
44 ;; @syntax (stat:sum <X>)
45 ;; @param <X> A list of numbers,
46 ;; @return Sum of data in list <X>.
48 ;; @syntax (stat:mean <X>)
49 ;; @param <X> A list of numbers.
50 ;; @return The mean of data in list <X>.
52 ;; @syntax (stat:var <X>)
53 ;; @param <X> A list of numbers.
54 ;; @return The variance of the data in list <X>.
56 ;; @syntax (stat:sdev <X>)
57 ;; @param <X> A list of numbers.
58 ;; @return Standard deviation of data in list <X>.
60 ;; @syntax (stat:sum-sq <X>)
61 ;; @param <X> A list of numbers.
62 ;; @return Sum of <x*x> data elements in list <X>.
64 ;; @syntax (stat:sum-xy <X> <Y>)
65 ;; @param <X> A list of numbers.
66 ;; @param <Y> A list of numbers.
67 ;; @return Sum of products <x*y> data elements in lists <X> and <Y>.
69 ;; @syntax (stat:cov <X> <Y>)
70 ;; @param <X> A list of numbers.
71 ;; @param <Y> A list of numbers.
72 ;; @return Covariance of data in lists <X> and <Y>
74 ;; @syntax (stat:sum-d2 <X>)
75 ;; @param <X> A list of numbers.
76 ;; @return Sum of squared diffs <(x - mean(X))^2> in list <X>.
78 ;; @syntax (stat:corr <X> <Y>)
79 ;; @param <X> A list of numbers.
80 ;; @param <Y> A list of numbers.
81 ;; @return Correlation coefficient of lists <X> and <Y>.
83 ;; @syntax (stat:regression <X> <Y>)
84 ;; @param <X> A list of numbers.
85 ;; @param <Y> A list of numbers.
86 ;; returns <(b0 b1)> coefficients of regression <Y = b0 + b1*X>.
88 ;; @syntax (stat:fit <X> <Y>)
89 ;; @param <X> A list of numbers.
90 ;; @param <Y> A list of numbers.
91 ;; @return fitted line based on '(stat:regression X Y)'.
93 ;; @syntax (stat:sum-d2xy <X> <Y>)
94 ;; @return Sum of squared differences <(x - y)^2> of elements in lists <X> and <Y>.
96 ;; @syntax (stat:moments <X>)
97 ;; @param <X> A list of numbers.
98 ;; @return Calculates all moments of list <X>.
100 ;; @syntax (stat:f-prob <F> <df1> <df2>)
101 ;; @param <F> The variance ratio.
102 ;; @param <df1> Degrees of freedom.
103 ;; @param <df2> Degrees of freedom.
104 ;; @return Probablity of F variance ratio for <df1>, <df2> degress of freedom.
106 ;; <br/>
107 ;; <center><h2>Multi variate statistics</h2></center>
109 ;; @syntax (stat:multiple-reg <X> <offY>)
110 ;; @param <X> A list of numbers.
111 ;; @param <offY> Zero based offset into <Y>.
112 ;; @return Multiple regression of vars in <X> onto <Y> at <offsetY>.
114 ;; @syntax (stat:cov-matrix <X>)
115 ;; @param <X> A list of numbers.
116 ;; @return Covariance matrix of <X> with <N> rows and <k> columns.
118 ;; @syntax (stat:corr-matrix <X>)
119 ;; @param <X> A list of numbers.
120 ;; @return Correlation matrix of <X> with <N> rows and <k> columns.
122 ;; <br/>
123 ;; <center><h2>Time series</h2></center>
125 ;; @syntax (stat:smooth <X> <alpha>)
126 ;; @param <X> A list of numbers.
127 ;; @param <alpha> Smoothing coefficient <0 &lt; alpha &lt; 1>.
128 ;; @return Exponentially smoothed sequence in <X>.
130 ;; @syntax (stat:lag <X> <n>)
131 ;; @param <X> A list of numbers.
132 ;; @param <n> Lag n.
133 ;; @return A differenced list of <X> with a lag of <n>.
134 ;; If the length of list <X> is <l> then the length of the resulting
135 ;; differenced list is <l - n>.
137 ;; @syntax (stat:cumulate <X>)
138 ;; @param <X> A list of numbers.
139 ;; @return The cumulated list of <X>.
141 ;; @syntax (stat:power <TS>)
142 ;; @param <TS> A time series of numbers.
143 ;; @return The power spectrum of a time series
145 ;; <br/>
146 ;; <center><h2>Matrix and list utilities</h2></center>
148 ;; @syntax (stat:matrix <C1> .... <CN>)
149 ;; @param <C1> The first column list of values.
150 ;; @param <CN> The Nth column list of values.
151 ;; @return A matrix off <1> to <N> columns <C>.
153 ;; @syntax (stat:diagonal <item> <N>)
154 ;; @param <item> The diagonal element.
155 ;; @return A diagonal matrix of length <N> with <item> in the diagonal.
157 ;; @syntax (stat:get-diagonal <X>)
158 ;; @param <X> An matrix filled with numbers.
159 ;; @return A list from the diagonal elements of <X>.
161 ;; @syntax (stat:mat-map <op> <A> <B>)
162 ;; @return Matrix map, e.g. '(stat:mat-map + A B)'.
163 ;; Used for adding and subtracting matrices.
165 (context 'stat)
167 ;---------------------------- gnuplot interface -------------------------------
169 ; These plot functions need gnuplot installed and works fine under UNIX
170 ; or WIN32.
172 ; The plot functions produce data files for the plot data and acommand files
173 ; for gnuplot, then execute gnuplot.
175 ; The routines need write access to the current directory to produce temporary files
176 ; used by gnuplot:
178 ; The file 'plot' contains the generated plot commands for gnuplot
179 ; data is saved in files with the names of symbols containing the data
180 ; or in files named 'series-1', 'series-2' etc.
182 ; e.g.: (plot '(1 3 2 5 4)) will produce a 'series-1' ascii file
183 ; but (plot age height) will produce 'age' and 'height' data files
188 ; plot one or more lists of numbers 'args' is used to access the list of
189 ; input parameters to 'plot'
191 (define (plot)
192 (let ( fileList '()
193 cnt 0
194 unix (< (& 0xF (last (sys-info))) 5)
195 fileName "")
196 ; for each list write an ascii file
197 (dolist (elmnt (args))
198 (if (list? elmnt)
199 (begin
200 (inc 'cnt)
201 (if (symbol? elmnt)
202 (set 'fileName (string elmnt))
203 (if unix
204 (set 'fileName (append (env "HOME") "/tmp/series-" (string cnt)))
205 (set 'fileName (append "series-" (string cnt)))))
207 (write-file fileName (list2ascii elmnt))
208 (push (append "'" fileName "'") fileList))))
210 (set 'fileList (join (reverse fileList) ","))
211 ; write file with plot commands depending on OS - Windows doesn't have PWD
212 (if unix
213 (write-file (append (env "HOME") "/tmp/plot") (append "set data style lines; plot " fileList "\r\n"))
214 (write-file "plot" (append "set data style lines; plot " fileList ";pause -1;\r\n")))
216 (if unix
217 (process (append "gnuplot -persist " (env "HOME") "/tmp/plot"))
218 (process "gnuplot plot"))))
222 ; plot to lists of numbers in XY-fashion
224 ; optionally define a style e.g: "line"
225 ; if style is unused dots are plotted
227 (define (plotXY X Y style , lst st fle unix tempDir)
228 (set 'tempDir (append (env "HOME") "/tmp"))
229 (set 'unix (< (& 0xF (last (sys-info))) 5 ))
230 (set 'lst (map list (map string X) (map string Y)))
231 (if unix
232 (set 'fle (open (append (env "HOME") "/tmp/plot-XY") "write"))
233 (set 'fle (open "plot-XY" "write")))
234 (dolist (x lst)
235 (write-line (join x " ") fle))
236 (close fle)
237 (if (not style)
238 (set 'st "")
239 (set 'st (append "set data style " style "; ")))
241 (if unix
242 (write-file (append tempDir "/plot") (append st "plot '" tempDir "/plot-XY';"))
243 (write-file "plot" (append st "plot 'plot-XY'; pause -1;")))
244 (if unix
245 (process (append "gnuplot -persist " tempDir "/plot"))
246 (process "gnuplot plot")))
249 ;------------------- General uni and bi-variate statistics --------------------
251 ; sum of a data vector X
252 (define (sum X)
253 (apply add X))
255 ; mean of a data vector X
256 (define (mean X)
257 (div (sum X) (length X)))
259 ; variance of a data vecor X
260 (define (var X)
261 (div (sum-d2 X) (sub (length X) 1)))
263 ; standard deviation of a data vector X
264 (define (sdev X)
265 (sqrt (var X)))
267 ; sum of squares of a data vector X
268 (define (sum-sq X)
269 (apply add (map mul X X)))
271 ; sum of the product of data vectors X*Y
272 (define (sum-xy X Y)
273 (apply add (map mul X Y)))
275 ; covariance of data vectors X Y
276 (define (cov X Y)
277 (sub (sum-xy X Y) (div (mul (sum X) (sum Y)) (length X))))
279 ; sum of sqared differenses of X to mean of X
280 (define (sum-d2 X)
281 (sub (sum-sq X) (div (mul (sum X) (sum X)) (length X))))
283 ; Pearson r, product moment correlation of data vectors X and Y
284 (define (corr X Y)
285 (div (cov X Y) (sqrt (mul (sum-d2 X) (sum-d2 Y)))))
287 ; regression Yest = b0 + b1*X calculates intercept b0 and slope b1
288 (define (regression X Y)
289 (set 'b1 (div (cov X Y) (sum-d2 X)))
290 (set 'b0 (sub (mean Y) (mul b1 (mean X))))
291 (list b0 b1))
293 ; fitted line using regression Y on X
294 (define (fit X Y, coeffs b0 b1)
295 (set 'coeffs (regression X Y))
296 (set 'b0 (first coeffs))
297 (set 'b1 (last coeffs))
298 (map (lambda (x) (add b0 (mul x b1))) X))
300 ; sum of squared differences of X and Y
301 (define (sum-d2xy X Y)
302 (apply add (map (lambda (x y) (mul (sub x y) (sub x y))) X Y)))
306 ; moments of a vector of numbers
308 (define (moments vector, n median mean avg-dev std-dev var skew kurtosis dev sum)
309 (set 'n (length vector))
311 (set 'sum (apply add vector))
312 (set 'mean (div sum n))
314 (set 'avg-dev 0 'std-dev 0 'var 0 'skew 0 'kurtosis 0)
316 (set 'dev (map sub vector (dup mean n)))
317 (set 'avg-dev (div (apply add (map abs dev)) n))
318 (set 'var (div (apply add (map mul dev dev)) (- n 1)))
319 (set 'skew (apply add (map mul dev dev dev)))
320 (set 'kurtosis (apply add (map mul dev dev dev dev)))
322 (set 'std-dev (sqrt var))
324 (if (> var 0.0)
325 (begin
326 (set 'skew (div skew (mul n var std-dev)))
327 (set 'kurtosis (sub (div kurtosis (mul n var var)) 3.0))))
329 (sort vector)
330 (set 'mid (/ n 2))
332 (if (= 0 (% n 2))
333 (set 'median (div (add (nth mid vector) (nth (- mid 1) vector)) 2))
334 (set 'median (nth mid vector)))
336 (list n median mean avg-dev std-dev var skew kurtosis)
338 ; (println (format "n: %d" n))
339 ; (println (format "median: %f" median))
340 ; (println (format "mean: %f" mean))
341 ; (println (format "average_deviation: %f" avg-dev))
342 ; (println (format "standard_deviation: %f" std-dev))
343 ; (println (format "variance: %f" var))
344 ; (println (format "skew: %f" skew))
345 ; (println (format "kurtosis: %f" kurtosis))
348 ;-------------------------------- Time Series ----------------------------------
350 ; expontial smoothing with 0 < alpha <= 1
351 (define (smooth lst alpha , previous slist)
352 (set 'previous (first lst))
353 (set 'slist '())
354 (dolist (elmnt lst)
355 (set 'previous (add (mul alpha elmnt) (mul (sub 1 alpha) previous)))
356 (push previous slist))
357 (reverse slist))
361 ; seasonal difference list with variable lag
362 ; the resulting list is lag shorterm than the original
364 (define (lag lst n , sLst)
365 (set 'sLst lst)
366 (dotimes (i n) (pop lst))
367 (set 'sLst (slice sLst 0 (length lst)))
368 (map sub lst sLst))
372 ; cumulate of a list
374 (define (cumulate lst , sc cum)
375 (set 'sc 0.0)
376 (set 'cum '())
377 (dolist (x lst)
378 (push (inc 'sc x) cum))
379 (reverse cum))
382 ; power spectrum
384 ; takes a rows by 2 columns matrix with real part in the first and
385 ; imagenary part the in the second column. If all numbers are real
386 ; then the second column is just 0's.
388 ; returns a matrix with two rows. First row contains power numbers
389 ; and second row contains the respective frequencies
392 (define (power ts , lenOrg fts n n2 ps mid frqs)
393 ; remember original length
394 (set 'lenOrg (length ts))
395 ; do discrete fourier transform
396 (set 'fts (transpose (fft ts)))
397 ; calc power spectrum
398 (set 'n (length (transpose fts)))
399 (set 'n2 (mul n n))
400 (set 'ps (map (lambda (x y) (add (mul x x) (mul y y))) (nth 0 fts) (nth 1 fts)))
401 (set 'ps (map (lambda (x) (mul (div x n2) 2)) ps))
402 ; the first and last are not multiplied by 2, divide them back
403 (nth-set (ps 0) (div (first ps) 2))
404 (set 'mid (sub (div n 2) 1))
405 (replace mid ps (div (nth mid ps) 2))
406 ; calc a vector with frequencies, adjusted for the new power-2 length
407 ; which came back from the FFT
408 (set 'frqs (sequence 1 n))
409 (set 'frqs (map (lambda (x) (mul (div x n) lenOrg)) frqs))
410 (transpose (matrix ps frqs)))
414 ;------------------------- multivariate statistics -----------------------------
417 ; multiple regression of variables in X onto one of varsiables in X, Y
418 ; indicated by column offset offY
420 ; X is N rows by k columns, the column at offset offY is Y
422 ; returns a matrix with two rows:
423 ; first row is regression coefficients and multiple R: b0, b1, b2 ....., R
424 ; second row is sum of squares: regression-SQ, error-SQ, total-SQ
425 ; (the unused part of the second row is zero padded)
427 ; the SQs can be used to calculate mean sqares for regression and error:
429 ; regression-MSQ = regression-SQ / (k - 1)
430 ; error-MSQ = error-sq / (n - k - 1)
432 ; F-ratio = regression-MSQ / error-MSQ
433 ; with k and (n - k - 1) df degreees of freedom
438 (define (multiple-reg X offY , Y Ycoffs b b0 R2 Yest sqErr sqTotal sqReg sq d)
439 (set 'covX (cov-matrix X))
440 (set 'Y (extract-col X offY))
441 ; covX is the covariance matrix
442 (pop covX offY)
443 (set 'cvX (transpose covX))
444 ; the covariance matrix is reduced to cvX and the
445 ; extracted values put in Ycoffs
446 (set 'Ycoffs (matrix (pop cvX offY)))
447 ; b contains the regression coefficients except for b0
448 (set 'b (multiply (invert cvX) Ycoffs))
449 ; calculate multiple R2 as b'*b / sqTotal
450 (set 'sqTotal (sum-d2 Y))
451 (set 'R2 (div (first (first (multiply (transpose b) Ycoffs))) sqTotal))
452 ; estimate Y without b0
453 (set 'Yest (multiply (reduce-col X offY) b))
454 ; calculate b0, d is the difference between Y and the Y estimate
455 ; b0 is the mean of differences between Y and Yest
456 (set 'd (mat-map sub (matrix Y) Yest))
457 (set 'b0 (mean (first (transpose d))))
458 ; estimate Y including b0
459 (set 'Yest (mat-map add Yest (matrix (dup b0 (length Yest)))))
460 ; error sum of squares
461 (set 'sqErr (sum-d2xy Y (first (transpose Yest))))
462 ; regression sum of squares
463 (set 'sqReg (sub sqTotal sqErr))
464 ; make list b out of b0, b1, b2 ... sqrt(R2)
465 (set 'b (append (list b0) (first (transpose b)) (list (sqrt R2))))
466 ; make list sq out of sqReg, sqErr and sqTotal
467 (set 'sq (list sqReg sqErr sqTotal))
468 ; return matrix with two rows:
469 (transpose (matrix b sq)))
474 ; covariance matrix cov
476 ; matrix x with N rows and k columns
479 (define (cov-matrix X , XtX N I sumX sumX2)
480 (set 'XtX (multiply (transpose X) X))
481 (set 'N (length X))
482 (set 'I (matrix (dup t1 N)))
483 (set 'sumX (multiply (transpose X) I))
484 (set 'sumX2 (multiply sumX (transpose sumX)))
485 (set 'sumX2 (multiply sumX2 (diagonal (div 1 N) (length sumX2))))
486 (mat-map sub XtX sumX2))
490 ; correlation matrix
492 ; matrix X with N rows and k columns
495 (define (corr-matrix X , covX N d dd)
496 (set 'covX (cov-matrix X))
497 (set 'd (matrix (get-diagonal covX)))
498 (set 'dd (multiply d (transpose d)))
499 (set 'dd (map (lambda (z) (map sqrt z)) dd))
500 (mat-map div covX dd))
504 ; probablity of F variance ratio with degrees of freedom df1 df2
507 (define (f-prob F df1 df2)
508 (let (prob (mul 2 (betai (div df2 (add df2 (mul df1 F)))
509 (mul 0.5 df2)
510 (mul 0.5 df1))))
511 (div (if (> prob 1) (sub 2 prob) prob) 2)))
514 ;----------------------------- utility functions -------------------------------
517 ; make a matrix from 1 up to 16 lists
519 (define (matrix)
520 (transpose (args)))
524 ; make a diagonal matrix n by n and elmnt in the diagonal
527 (define (diagonal elmnt n, m lst)
528 (set 'm '())
529 (dotimes (i n)
530 (set 'lst (dup 0 n))
531 (nth-set (lst i) elmnt)
532 (push lst m))
533 (reverse m))
536 ; get the diagonal from a square matrix
538 (define (get-diagonal X , d x)
539 (set 'd '())
540 (dotimes (idx (length X))
541 (push (nth idx (nth idx X)) d))
542 (reverse d))
545 ; matrix map
547 ; e.g.: (mat-map sub A B) ;; for matrix subtraction
549 (define (mat-map op A B)
550 (map (lambda (x y) (map op x y)) A B))
553 ; reduce matrix by a column at offset
555 ; returns the reduced matrix
557 (define (reduce-col mat off, X)
558 (set 'mat (transpose mat))
559 (pop mat off)
560 (transpose mat))
563 ; extract a column from a matrix
565 ; returns the extracted column
567 (define (extract-col mat off, X)
568 (pop (transpose mat) off))
571 ; convert list to ascii lines terminated by CR-LF
572 ; for storage in files usable by Gnuplot, R, Excel etc.
574 ; example:
576 ; (write-file "MyData.txt" (list2ascii mydata-list))
578 (define (list2ascii lst)
579 (append (join (map string lst) "\r\n") "\r\n"))
581 ; eof