factoring out the make-dataframe2 methods into relevant files. We don-t know about...
[CommonLispStat.git] / src / stat-models / regression-sheeple.lisp
blob38ed45761ea8802a48133dd68173f3a03f59e76c
1 ;;; -*- mode: lisp -*-
3 ;;;;
4 ;;;; regression.lsp XLISP-STAT regression model proto and methods
5 ;;;; XLISP-STAT 2.1 Copyright (c) 1990, by Luke Tierney
6 ;;;; Additions to Xlisp 2.1, Copyright (c) 1989 by David Michael Betz
7 ;;;; You may give out copies of this software; for conditions see the file
8 ;;;; COPYING included with this distribution.
9 ;;;;
10 ;;;;
11 ;;;; Incorporates modifications suggested by Sandy Weisberg.
13 ;;; Sheeple is another prototype-based system.
14 (asdf:oos 'asdf:load-op 'sheeple)
16 (defpackage :regression-sheeple
17 (:use common-lisp
18 sheeple
19 lisp-matrix
20 lisp-stat-data-examples)
21 (:export regression-model regression-model-proto x y intercept
22 sweep-matrix basis weights included total-sum-of-squares
23 residual-sum-of-squares predictor-names response-name
24 case-labels))
26 (in-package :regression-clon)
28 ;;; Support functions
30 (defun flatten-list (lst)
31 "Flattens a list of lists into a single list. Only useful when
32 we've mucked up data. Sign of usage means poor coding!"
33 (cond ((null lst) ;; endp?
34 nil)
35 ((listp lst)
36 (append (flatten-list (car lst)) (flatten-list (cdr lst))))
38 (list lst))))
40 (defun repeat (a b)
41 "Args: (vals times)
42 Repeats VALS. If TIMES is a number and VALS is a non-null, non-array atom,
43 a list of length TIMES with all elements eq to VALS is returned. If VALS
44 is a list and TIMES is a number then VALS is appended TIMES times. If
45 TIMES is a list of numbers then VALS must be a list of equal length and
46 the simpler version of repeat is mapped down the two lists.
47 Examples: (repeat 2 5) returns (2 2 2 2 2)
48 (repeat '(1 2) 3) returns (1 2 1 2 1 2)
49 but not:
50 (repeat '(4 5 6) '(1 2 3)) returns (4 5 5 6 6 6)
51 (repeat '((4) (5 6)) '(2 3)) returns (4 4 5 6 5 6 5 6)"
52 (if (> b 2)
53 (flatten-list (append (repeat (list a) (- b 1)) (list a)))
54 a))
56 (cond ((compound-data-p b)
57 (let* ((reps (coerce (compound-data-seq (map-elements #'repeat a b))
58 'list))
59 (result (first reps))
60 (tail (last (first reps))))
61 (dolist (next (rest reps) result)
62 (when next
63 (setf (rest tail) next)
64 (setf tail (last next))))))
65 (t (let* ((a (if (compound-data-p a)
66 (coerce (compound-data-seq a) 'list)
67 (list a)))
68 (result nil))
69 (dotimes (i b result)
70 (let ((next (copy-list a)))
71 (if result (setf (rest (last next)) result))
72 (setf result next)))))))
77 ;;;; Regresion Model Prototype
79 (defsheep =regression-model-proto= ()
80 (( x y intercept sweep-matrix basis weights
81 included total-sum-of-squares residual-sum-of-squares
82 predictor-names response-name case-labels)))
83 ;; (:documentation "port of proto to SHEEPLE for Normal Linear Regression Model.")
85 ;; The doc for this function string is at the limit of XLISP's string
86 ;; constant size - making it longer may cause problems
87 (defun regression-model (x y &key
88 (intercept T)
89 (print T)
90 weights
91 (included (repeat t (length y)))
92 predictor-names
93 response-name
94 case-labels)
95 "Args: (x y &key (intercept T) (print T) weights
96 included predictor-names response-name case-labels)
97 X - list of independent variables or X matrix
98 Y - dependent variable.
99 INTERCEPT - T to include (default), NIL for no intercept
100 PRINT - if not NIL print summary information
101 WEIGHTS - if supplied should be the same length as Y; error variances are
102 assumed to be inversely proportional to WEIGHTS
103 PREDICTOR-NAMES
104 RESPONSE-NAME
105 CASE-LABELS - sequences of strings or symbols.
106 INCLUDED - if supplied should be the same length as Y, with elements nil
107 to skip a in computing estimates (but not in residual analysis).
108 Returns a regression model object. To examine the model further assign the
109 result to a variable and send it messages.
110 Example (data are in file absorbtion.lsp in the sample data directory/folder):
111 (def m (regression-model (list iron aluminum) absorbtion))
112 (send m :help)
113 (send m :plot-residuals)"
114 (check-type x matrix-like)
115 (let ((m (clone =regression-model-proto=)))
116 (setf (field-value x m) x)
117 (setf (field-value y m) y)
118 (send nil :intercept m intercept)
119 (send nil :weights m weights)
120 (send nil :included m included)
121 (send nil :predictor-names m predictor-names)
122 (send nil :response-name m response-name)
123 (send nil :case-labels m case-labels)
124 (if print (send nil :display m))
127 (define-method
128 is-new regression-model-proto ()
129 (send nil self :needs-computing t))
131 (define-method
132 save regression-model-proto ()
133 "Message args: ()
134 Returns an expression that will reconstruct the regression model."
135 `(regression-model ',(send nil :x self)
136 ',(send nil :y self)
137 :intercept ',(send nil :intercept self)
138 :weights ',(send nil :weights self)
139 :included ',(send nil :included self)
140 :predictor-names ',(send nil :predictor-names self)
141 :response-name ',(send nil :response-name self)
142 :case-labels ',(send nil :case-labels self)))
145 ;;; Computing and Display Methods
148 (define-method
149 compute regression-model-proto ()
150 "Message args: (). Recomputes the estimates. For internal use by other messages"
151 (let* ((included (if-else (send nil :included) 1 0))
152 (x (send nil :x self))
153 (y (send nil :y self))
154 (intercept (send nil :intercept self))
155 (weights (send nil :weights self))
156 (w (if weights (* included weights) included))
157 ;; (m (make-sweep-matrix x y w))
158 (n (array-dimension x 1))
159 (p (- (array-dimension m 0) 1))
160 (tss (aref m p p))
161 (tol (* .0001 (mapcar #'standard-deviation (column-list x))))
162 (sweep-result
163 (if intercept
164 (sweep-operator m (iseq 1 n) tol)
165 (sweep-operator m (iseq 0 n) (cons 0.0 tol)))))
166 (setf (field-value 'sweep-matrix self) (first sweep-result))
167 (setf (field-value 'total-sum-of-squares self) tss)
168 (setf (field-value 'residual-sum-of-squares self)
169 (aref (first sweep-result) p p))
170 (setf (field-value 'basis self)
171 (let ((b (remove 0 (second sweep-result))))
172 (if b
173 (- (reverse b) 1)
174 (error "no columns could be swept"))))))
176 (define-method
177 needs-computing regression-model-proto (&optional set)
178 "Toggle switch as needed to figure out if we recompute."
179 (if set (setf (field-value 'sweep-matrix self) nil))
180 (null (field-value 'sweep-matrix self)))
182 (define-method
183 display regression-model-proto ()
184 "Message args: ()
185 Prints the least squares regression summary. Variables not used in the fit
186 are marked as aliased."
187 (let ((coefs (coerce (send nil :coef-estimates self) 'list))
188 (se-s (send nil :coef-standard-errors self))
189 (x (send nil :x self))
190 (p-names (send nil :predictor-names self)))
191 (if (send nil :weights self)
192 (format t "~%Weighted Least Squares Estimates:~2%")
193 (format t "~%Least Squares Estimates:~2%"))
194 (when (send nil :intercept self)
195 (format t "Constant ~10f ~A~%"
196 (car coefs) (list (car se-s)))
197 (setf coefs (cdr coefs))
198 (setf se-s (cdr se-s)))
199 (dotimes (i (array-dimension x 1))
200 (cond
201 ((member i (send nil :basis self))
202 (format t "~22a ~10f ~A~%"
203 (select p-names i) (car coefs) (list (car se-s)))
204 (setf coefs (cdr coefs) se-s (cdr se-s)))
205 (t (format t "~22a aliased~%" (select p-names i)))))
206 (format t "~%")
207 (format t "R Squared: ~10f~%" (send nil :r-squared self))
208 (format t "Sigma hat: ~10f~%" (send nil :sigma-hat self))
209 (format t "Number of cases: ~10d~%" (send nil :num-cases self))
210 (if (/= (send nil :num-cases self) (send nil :num-included self))
211 (format t "Number of cases used: ~10d~%" (send nil :num-included self)))
212 (format t "Degrees of freedom: ~10d~%" (send nil :df self))
213 (format t "~%")))
216 ;;; Slot accessors and mutators
219 (define-method
220 x regression-model-proto (&optional new-x)
221 "Message args: (&optional new-x)
222 With no argument returns the x matrix as supplied to m. With an argument
223 NEW-X sets the x matrix to NEW-X and recomputes the estimates."
224 (when (and new-x (typep new-x 'matrix-like))
225 (setf (field-value 'x self) new-x)
226 (send nil :needs-computing self t))
227 (field-value 'x self))
229 (define-method
230 y regression-model-proto (&optional new-y)
231 "Message args: (&optional new-y)
232 With no argument returns the y sequence as supplied to m. With an argument
233 NEW-Y sets the y sequence to NEW-Y and recomputes the estimates."
234 (when (and new-y (or (typep new-y 'matrix-like)
235 (typep new-y 'sequence)))
236 (setf (field-value 'y self) new-y)
237 (send nil :needs-computing self t))
238 (field-value 'y self))
240 (define-method
241 intercept regression-model-proto (&optional (val nil set))
242 "Message args: (&optional new-intercept)
243 With no argument returns T if the model includes an intercept term, nil if
244 not. With an argument NEW-INTERCEPT the model is changed to include or
245 exclude an intercept, according to the value of NEW-INTERCEPT."
246 (when set
247 (setf (field-value 'intercept self ) val)
248 (send nil :needs-computing self t))
249 (field-value 'intercept self))
251 (define-method
252 weights regression-model-proto (&optional (new-w nil set))
253 "Message args: (&optional new-w)
254 With no argument returns the weight sequence as supplied to m; NIL means
255 an unweighted model. NEW-W sets the weights sequence to NEW-W and
256 recomputes the estimates."
257 (when set
258 (setf (field-value 'weights self) new-w)
259 (send nil :needs-computing self t))
260 (field-value 'weights self))
262 (define-method
263 total-sum-of-squares regression-model-proto ()
264 "Message args: ()
265 Returns the total sum of squares around the mean."
266 (if (send nil self :needs-computing) (send nil :compute self))
267 (field-value 'total-sum-of-squares self))
269 (define-method
270 residual-sum-of-squares regression-model-proto ()
271 "Message args: ()
272 Returns the residual sum of squares for the model."
273 (if (send nil :needs-computing self) (send nil self :compute self))
274 (field-value 'residual-sum-of-squares self))
276 (define-method
277 basis regression-model-proto ()
278 "Message args: ()
279 Returns the indices of the variables used in fitting the model."
280 (if (send nil :needs-computing self) (send nil :compute self))
281 (field-value 'basis self))
283 (define-method
284 sweep-matrix regression-model-proto ()
285 "Message args: ()
286 Returns the swept sweep matrix. For internal use"
287 (if (send nil :needs-computing self) (send nil :compute self))
288 (field-value 'sweep-matrix self))
290 (define-method
291 included regression-model-proto (&optional new-included)
292 "Message args: (&optional new-included)
293 With no argument, NIL means a case is not used in calculating
294 estimates, and non-nil means it is used. NEW-INCLUDED is a sequence
295 of length of y of nil and t to select cases. Estimates are
296 recomputed."
297 (when (and new-included
298 (= (length new-included) (send nil :num-cases self)))
299 (setf (field-value 'included self) (copy-seq new-included))
300 (send nil :needs-computing self t))
301 (if (field-value 'included self)
302 (field-value 'included self)
303 (repeat t (send nil :num-cases self))))
305 (define-method
306 predictor-names regression-model-proto (&optional (names nil set))
307 "Message args: (&optional (names nil set))
308 With no argument returns the predictor names. NAMES sets the names."
309 (if set (setf (field-value 'predictor-names self) (mapcar #'string names)))
310 (let ((p (array-dimension (send nil :x self) 1))
311 (p-names (field-value 'predictor-names self)))
312 (if (not (and p-names (= (length p-names) p)))
313 (setf (field-value 'predictor-names self)
314 (mapcar #'(lambda (a) (format nil "Variable ~a" a))
315 (iseq 0 (- p 1))))))
316 (field-value 'predictor-names self))
318 (define-method
319 response-name regression-model-proto (&optional (name "Y" set))
320 "Message args: (&optional name)
321 With no argument returns the response name. NAME sets the name."
322 (if set (setf (field-value 'response-name self) (if name (string name) "Y")))
323 (field-value 'response-name self))
325 (define-method
326 case-labels regression-model-proto (&optional (labels nil set))
327 "Message args: (&optional labels)
328 With no argument returns the case-labels. LABELS sets the labels."
329 (if set (setf (field-value 'case-labels self)
330 (if labels
331 (mapcar #'string labels)
332 (mapcar #'(lambda (x) (format nil "~d" x))
333 (iseq 0 (- (send nil :num-cases self) 1))))))
334 (field-value 'case-labels self))
337 ;;; Other Methods
338 ;;; None of these methods access any slots directly.
341 (define-method
342 num-cases regression-model-proto ()
343 "Message args: ()
344 Returns the number of cases in the model."
345 (length (send nil :y self)))
347 (define-method
348 num-included regression-model-proto ()
349 "Message args: ()
350 Returns the number of cases used in the computations."
351 (sum (if-else (send nil :included self) 1 0)))
353 (define-method
354 num-coefs regression-model-proto ()
355 "Message args: ()
356 Returns the number of coefficients in the fit model (including the
357 intercept if the model includes one)."
358 (if (send nil :intercept self)
359 (+ 1 (length (send nil :basis self)))
360 (length (send nil :basis self))))
362 (define-method
363 df regression-model-proto ()
364 "Message args: ()
365 Returns the number of degrees of freedom in the model."
366 (- (send nil :num-included self) (send nil :num-coefs self)))
368 (define-method
369 x-matrix regression-model-proto ()
370 "Message args: ()
371 Returns the X matrix for the model, including a column of 1's, if
372 appropriate. Columns of X matrix correspond to entries in basis."
373 (field-value :x self))
375 (define-method
376 leverages regression-model-proto ()
377 "Message args: ()
378 Returns the diagonal elements of the hat matrix."
379 (let* ((weights (send nil :weights self))
380 (x (send nil :x-matrix self))
381 (raw-levs (m* (* (m* x (send nil :xtxinv self)) x)
382 (repeat 1 (send nil :num-coefs self)))))
383 (if weights (* weights raw-levs) raw-levs)))
385 (define-method
386 fit-values regression-model-proto ()
387 "Message args: ()
388 Returns the fitted values for the model."
389 (m* (send nil :x-matrix self) (send nil :coef-estimates self)))
391 (define-method
392 raw-residuals regression-model-proto ()
393 "Message args: ()
394 Returns the raw residuals for a model."
395 (- (send nil :y self) (send nil :fit-values self)))
397 (define-method
398 residuals regression-model-proto ()
399 "Message args: ()
400 Returns the raw residuals for a model without weights. If the model
401 includes weights the raw residuals times the square roots of the
402 weights are returned."
403 (let ((raw-residuals (send nil :raw-residuals self))
404 (weights (send nil :weights self)))
405 (if weights
406 (* (sqrt weights) raw-residuals)
407 raw-residuals)))
409 (define-method
410 sum-of-squares regression-model-proto ()
411 "Message args: ()
412 Returns the error sum of squares for the model."
413 (send nil :residual-sum-of-squares self))
415 (define-method
416 sigma-hat regression-model-proto ()
417 "Message args: ()
418 Returns the estimated standard deviation of the deviations about the
419 regression line."
420 (let ((ss (send nil :sum-of-squares self))
421 (df (send nil :df self)))
422 (if (/= df 0) (sqrt (/ ss df)))))
424 ;; for models without an intercept the 'usual' formula for R^2 can
425 ;; give negative results; hence the max.
426 (define-method
427 r-squared regression-model-proto ()
428 "Message args: ()
429 Returns the sample squared multiple correlation coefficient, R
430 squared, for the regression."
431 (max (- 1 (/ (send nil :sum-of-squares self)
432 (send nil :total-sum-of-squares self)))
435 (define-method
436 coef-estimates regression-model-proto ()
437 "Message args: ()
438 Returns the OLS (ordinary least squares) estimates of the regression
439 coefficients. Entries beyond the intercept correspond to entries in
440 basis."
441 (let ((n (array-dimension (send nil :x self) 1))
442 (indices (if (send nil :intercept self)
443 (cons 0 (+ 1 (send nil :basis self)))
444 (+ 1 (send nil :basis self))))
445 (m (send nil :sweep-matrix self)))
446 (coerce (compound-data-seq (select m (+ 1 n) indices)) 'list)))
448 (define-method
449 xtxinv regression-model-proto ()
450 "Message args: ()
451 Returns ((X^T) X)^(-1) or ((X^T) W X)^(-1)."
452 (let ((indices (if (send nil :intercept self)
453 (cons 0 (1+ (send nil :basis self)))
454 (1+ (send nil :basis self)))))
455 (select (send nil :sweep-matrix self) indices indices)))
457 (define-method
458 coef-standard-errors regression-model-proto ()
459 "Message args: ()
460 Returns estimated standard errors of coefficients. Entries beyond the
461 intercept correspond to entries in basis."
462 (let ((s (send nil :sigma-hat self)))
463 (if s (* (send nil :sigma-hat self)
464 (sqrt (diagonal (send nil :xtxinv self)))))))
466 (define-method
467 studentized-residuals regression-model-proto ()
468 "Message args: ()
470 Computes the internally studentized residuals for included cases and
471 externally studentized residuals for excluded cases."
472 (let ((res (send nil :residuals self))
473 (lev (send nil :leverages self))
474 (sig (send nil :sigma-hat self))
475 (inc (send nil :included self)))
476 (if-else inc
477 (/ res (* sig (sqrt (pmax .00001 (- 1 lev)))))
478 (/ res (* sig (sqrt (+ 1 lev)))))))
480 (define-method
481 externally-studentized-residuals regression-model-proto ()
482 "Message args: ()
483 Computes the externally studentized residuals."
484 (let* ((res (send nil :studentized-residuals self))
485 (df (send nil :df self)))
486 (if-else (send nil :included self)
487 (* res (sqrt (/ (- df 1) (- df (^ res 2)))))
488 res)))
490 (define-method
491 cooks-distances regression-model-proto ()
492 "Message args: ()
493 Computes Cook's distances."
494 (let ((lev (send nil :leverages self))
495 (res (/ (^ (send nil :studentized-residuals self) 2)
496 (send nil :num-coefs self))))
497 (if-else (send nil :included self)
498 (* res (/ lev (- 1 lev)))
499 (* res lev))))
502 (define-method
503 plot-residuals regression-model-proto (&optional x-values)
504 "Message args: (&optional x-values)
506 Opens a window with a plot of the residuals. If X-VALUES are not
507 supplied the fitted values are used. The plot can be linked to other
508 plots with the link-views function. Returns a plot object."
509 (plot-points (if x-values
510 x-values
511 (send nil :fit-values self))
512 (send nil :residuals self)
513 :title "Residual Plot"
514 :point-labels (send nil :case-labels self)))
516 (define-method
517 plot-bayes-residuals regression-model-proto (&optional x-values)
518 "Message args: (&optional x-values)
520 Opens a window with a plot of the standardized residuals and two
521 standard error bars for the posterior distribution of the actual
522 deviations from the line. See Chaloner and Brant. If X-VALUES are not
523 supplied the fitted values are used. The plot can be linked to other
524 plots with the link-views function. Returns a plot object."
525 (let* ((r (/ (send nil :residuals self)
526 (send nil :sigma-hat self)))
527 (d (* 2 (sqrt (send nil :leverages self))))
528 (low (- r d))
529 (high (+ r d))
530 (x-values (if x-values
531 x-values
532 (send nil :fit-values self)))
533 (p (plot-points x-values r
534 :title "Bayes Residual Plot"
535 :point-labels (send nil :case-labels self))))
536 (map 'list
537 #'(lambda (a b c d)
538 (send nil :plotline p a b c d nil))
539 x-values low x-values high)
540 (send nil :adjust-to-data p)
544 ;;; examples
547 (defparameter *lm-ex-obj*
548 (regression-model absorbtion iron))