don't record PDF files in repo
[CommonLispStat.git] / src / stat-models / regression.lisp
blob64e141d032edca87b957ab3403ca7588ed5adb80
1 ;;; -*- mode: lisp -*-
2 ;;;
3 ;;; Copyright (c) 2008--, by A.J. Rossini <blindglobe@gmail.com>
4 ;;; See COPYRIGHT file for any additional restrictions (BSD license).
5 ;;; Since 1991, ANSI was finally finished. Modified to match ANSI
6 ;;; Common Lisp.
8 ;;;; Originally from:
9 ;;;; regression.lsp XLISP-STAT regression model proto and methods
10 ;;;; XLISP-STAT 2.1 Copyright (c) 1990, by Luke Tierney
11 ;;;; Additions to Xlisp 2.1, Copyright (c) 1989 by David Michael Betz
12 ;;;; You may give out copies of this software; for conditions see the file
13 ;;;; COPYING included with this distribution.
14 ;;;;
15 ;;;; Incorporates modifications suggested by Sandy Weisberg.
17 ;;; This version uses lisp-matrix for underlying numerics.
19 (in-package :lisp-stat-regression-linear)
21 ;;; Regresion Model Prototype
23 ;; The general strategy behind the fitting of models using prototypes
24 ;; is that we need to think about want the actual fits are, and then
25 ;; the fits can be used to recompute as components are changes. One
26 ;; catch here is that we'd like some notion of trace-ability, in
27 ;; particular, there is not necessarily a fixed way to take care of the
28 ;; audit trail. save-and-die might be a means of recording the final
29 ;; approach, but we are challenged by the problem of using advice and
30 ;; other such features to capture stages and steps that are considered
31 ;; along the goals of estimating a model.
33 (defclass regression-model-class () ; (statistical-model)
34 (;; Data
36 :initform nil
37 :initarg :y
38 :accessor response-variable
39 :type vector-like
40 :documentation "vector-like containing univariate response values."
43 :initform nil
44 :initarg :x
45 :accessor design-matrix
46 :type matrix-like
47 :documentation "matrix-like containing design/data matrix. If
48 interceptp is T, assume first column is intercept (ones), and hence
49 a bit special.")
50 (interceptp
51 :initform nil
52 :initarg :interceptp
53 :accessor interceptp
54 :type boolean
55 :documentation "When T, treat first column of design/data matrix
56 special as intercept fit.")
58 ;; Metadata
59 (covariate-names
60 :initform nil
61 :initarg :covariate-names
62 :accessor covariate-names
63 :type list
64 :documentation "List of strings representing covariate names.
65 Might include symbols at some point")
66 (response-name
67 :initform nil
68 :initarg :response-name
69 :accessor response-name
70 :type list)
71 (case-labels
72 :initform nil
73 :initarg :case-labels
74 :accessor case-labels
75 :type list)
76 (doc-string
77 :initform ""
78 :initarg :docs
79 :accessor doc-string
80 :type string))
81 (:documentation "Normal Linear Regression Model with CLOS.
82 Historical design based on LispStat."))
85 (defclass regression-model-fit-class ()
86 (;; data/design/state
87 (data-model
88 :initform nil
89 :initarg :data-model
90 :accessor data-model
91 :type regression-model-class)
94 (needs-computing
95 :initform T
96 :initarg :needs-computing
97 :accessor needs-computing)
98 ;; above modified to TRUE when the following are changed
99 (included
100 :initform nil
101 :initarg :included
102 :accessor included
103 :type matrix-like)
104 (weights
105 :initform nil
106 :initarg :weights
107 :accessor weights
108 :type matrix-like)
109 (weights-types
110 :initform nil
111 :initarg :weight-types
112 :accessor weight-types
113 :type matrix-like)
115 ;; computational artifacts for storage; when NEEDS-COMPUTING is
116 ;; NIL, these are up-to-date.
117 (basis
118 :initform nil
119 :accessor basis
120 :type matrix-like)
121 (estimates
122 :initform nil
123 :initarg :estimates
124 :accessor estimates
125 :type vector-like)
126 (estimates-covariance-matrix
127 :initform nil
128 :initarg :estimates-covariance-matrix
129 :accessor covariance-matrix
130 :type matrix-like)
131 (total-sum-of-squares
132 :initform 0d0
133 :accessor tss
134 :type number)
135 (residual-sum-of-squares
136 :initform 0d0
137 :accessor rss
138 :type number)
140 (doc-string
141 :initform "No info given."
142 :initarg :doc
143 :accessor doc-string
144 :type string))
145 (:documentation "Normal Linear Regression Model _FIT_ through CLOS."))
147 ;;;;;;;; Helper functions
149 (defun xtxinv (x &key (intercept T))
150 "In: X Out: (XtX)^-1
152 X is NxP, resulting in PxP. Represents Var[\hat\beta], the varest for
153 \hat \beta from Y = X \beta + \eps. Done by Cholesky decomposition,
154 with LAPACK's dpotri routine, factorizing with dpotrf.
156 <example>
157 (let ((m1 (rand 7 5)))
158 (xtxinv m1))
159 </example>"
160 (check-type x matrix-like)
161 (let ((myx (if intercept
162 (bind2 (ones (matrix-dimension x 0) 1)
163 x :by :column)
164 x)))
165 (minv-cholesky (m* (transpose myx) myx))))
167 ;; might add args: (method 'gelsy), or do we want to put a more
168 ;; general front end, linear-least-square, across the range of
169 ;; LAPACK solvers?
170 (defun lm (x y &key (intercept T) rcond)
171 "fit the linear model:
172 y = x \beta + e
174 and estimate \beta. X,Y should be in cases-by-vars form, i.e. X
175 should be n x p, Y should be n x 1. Returns estimates, n and p.
176 Probably should return a form providing the call, as well.
178 R's lm object returns: coefficients, residuals, effects, rank, fitted,
179 qr-results for numerical considerations, DF_resid. Need to
180 encapsulate into a class or struct."
181 (check-type x matrix-like)
182 (check-type y vector-like) ; vector-like might be too strict?
183 (assert (= (nrows y) (nrows x)) ; same number of obsns/cases
184 (x y) "Can not multiply x:~S by y:~S" x y)
185 (let ((x1 (if intercept
186 (bind2 (ones (matrix-dimension x 0) 1)
187 x :by :column)
188 x)))
189 (let ((betahat (gelsy (m* (transpose x1) x1)
190 (m* (transpose x1) y)
191 (if rcond rcond
192 (* (coerce (expt 2 -52) 'double-float)
193 (max (nrows x1)
194 (ncols y))))))
195 (betahat1 (gelsy x1
197 (if rcond rcond
198 (* (coerce (expt 2 -52) 'double-float)
199 (max (nrows x1)
200 (ncols y)))))))
201 (format t "")
202 (list betahat ; LA-SIMPLE-VECTOR-DOUBLE
203 betahat1 ; LA-SLICE-VECVIEW-DOUBLE
204 (xtxinv x1); (sebetahat betahat x y) ; TODO: write me!
205 (nrows x) ; surrogate for n
206 (ncols x1) ; surrogate for p
207 (v- (first betahat) (first betahat1)) ))))
209 (defun regression-model
210 (x y &key (intercept T)
211 (predictor-names nil) (response-name nil) (case-labels nil)
212 (doc "Undocumented Regression Model Instance"))
213 "Args: (x y &key (intercept T) (print T) (weights nil)
214 included predictor-names response-name case-labels)
215 X - list of independent variables or X matrix
216 Y - dependent variable.
217 INTERCEPT - T to include (default), NIL for no intercept
218 PRINT - if not NIL print summary information
219 WEIGHTS - if supplied should be the same length as Y; error
220 variances are
221 assumed to be inversely proportional to WEIGHTS
222 PREDICTOR-NAMES, RESPONSE-NAME, CASE-LABELS
223 - sequences of strings or symbols.
224 INCLUDED - if supplied, should be length Y or 1, with
225 elements nil to skip or T to include for computing estimates
226 (always included in residual analysis).
227 Returns a regression model object."
228 (check-type x matrix-like)
229 (check-type y vector-like)
230 (let ((newmodel
231 (make-instance 'regression-model-class
232 :y y
233 :x x
234 :interceptp intercept
235 :case-labels case-labels
236 :covariate-names predictor-names
237 :response-name response-name
238 :docs doc )))
239 newmodel))
241 (defun fit-model (model &key (included T) (wgts nil) (docs "No Docs"))
242 (let ((result (make-instance 'regression-model-fit-class
243 :data-model model
244 :needs-computing T
245 :included included
246 :weights wgts
247 :estimates (first (lm (design-matrix model)
248 (response-variable model)
249 :intercept (interceptp model)))
250 :estimates-covariance-matrix
251 (xtxinv (design-matrix model)
252 :intercept (interceptp model))
253 :doc docs)))
254 result))
256 (defmethod print-object ((obj regression-model-fit-class) stream)
257 (print-unreadable-object (obj stream :type t)
258 (format stream "Estimates: ~%~A~%" (estimates obj))
259 (format stream "Covariance: ~%~A~%" (covariance-matrix obj))
260 (format stream "Doc: ~%~A~%" (doc-string obj))))
262 (defmethod print-object ((obj regression-model-class) stream)
263 "Need better formatting and output -- clearly this is a WRONG EXAMPLE."
264 (print-unreadable-object (obj stream :type t)
265 (format stream "Response: ~%~A~%" (response-variable obj))
266 (format stream "Design: ~%~A~%" (design-matrix obj))
267 (format stream "Covariates: ~%~A~%" (covariate-names obj))
268 (format stream "Doc: ~%~A~%" (doc-string obj))))
270 ;;; Computing and Display Methods
272 ;; [X|Y]t [X|Y]
273 ;; = XtX XtY
274 ;; YtX YtY
275 ;; so with (= (dim X) (list n p))
276 ;; we end up with p x p p x 1
277 ;; 1 x p 1 x 1
279 ;; and this can be implemented by
281 (setf XY (bind2 X Y :by :row))
282 (setf XYtXY (m* (transpose XY) XY))
284 ;; which is too procedural. Sigh, I meant
286 (setf XYtXY (let ((XY (bind2 X Y :by :row)))
287 (m* (transpose XY) XY)))
289 ;; which at least looks lispy.