document some tasks in dataframe.lisp that need resolution.
[CommonLispStat.git] / src / stat-models / regression.lisp
blob5c81605321113e8e91468372492418d482aa7547
1 ;;; -*- mode: lisp -*-
2 ;;;
3 ;;; Copyright (c) 2008--, by A.J. Rossini <blindglobe@gmail.com>
4 ;;; See COPYRIGHT file for any additional restrictions (BSD license).
5 ;;; Since 1991, ANSI was finally finished. Modified to match ANSI
6 ;;; Common Lisp.
8 ;;; This version uses LAPACK and lisp-matrix for underlying numerics.
9 ;;; No longer contains code from XLispStat.
11 (in-package :lisp-stat-regression-linear)
13 ;; The general strategy behind the fitting of models using prototypes
14 ;; is that we need to think about want the actual fits are, and then
15 ;; the fits can be used to recompute as components are changes. One
16 ;; catch here is that we'd like some notion of trace-ability, in
17 ;; particular, there is not necessarily a fixed way to take care of the
18 ;; audit trail. save-and-die might be a means of recording the final
19 ;; approach, but we are challenged by the problem of using advice and
20 ;; other such features to capture stages and steps that are considered
21 ;; along the goals of estimating a model.
23 ;; In migrating the thought process to R, we use more of a cross
24 ;; between the S3 and prototype approaches, but some of the thinking
25 ;; is CLOSy.
27 ;; statistical-model shuld contain some of the ingredients for
28 ;; traceability and embodiment of statistical characteristics.
30 (defclass regression-model-class () ; (statistical-model)
31 ((dependent-response
32 :initform nil
33 :initarg :y
34 :accessor dependent-response
35 :type vector-like
36 :documentation "univariate responses/dependent var.")
37 (predictor-design
38 :initform nil
39 :initarg :x
40 :accessor design-matrix
41 :type matrix-like
42 :documentation "design/data matrix, complete. when interceptp T,
43 first column is intercept (ones), and hence special.")
44 (interceptp
45 :initform nil
46 :initarg :interceptp
47 :accessor interceptp
48 :type boolean
49 :documentation "special indicator of first column status.")
51 ;; Metadata
52 (covariate-names
53 :initform nil
54 :initarg :covariate-names
55 :accessor covariate-names
56 :type list
57 :documentation "List of strings representing covariate names.
58 Might include symbols at some point")
59 (response-name
60 :initform nil
61 :initarg :response-name
62 :accessor response-name
63 :type list)
64 (case-labels
65 :initform nil
66 :initarg :case-labels
67 :accessor case-labels
68 :type list)
69 (doc-string
70 :initform "no doc"
71 :initarg :docs
72 :accessor doc-string
73 :type string))
74 (:documentation "Normal Linear Regression Model with CLOS."))
76 (defclass regression-model-fit-class ()
77 ((data-model
78 :initform nil
79 :initarg :data-model
80 :accessor data-model
81 :type regression-model-class
82 :documentation "data/design")
83 (needs-computing
84 :initform T
85 :initarg :needs-computing
86 :accessor needs-computing
87 :documentation "ensure that this is set to TRUE when numerically
88 dependent pieces are modified (not metadata, for example). Check
89 before access of results and printouts, so as to recompute if
90 necessary.")
91 (included
92 :initform nil
93 :initarg :included
94 :accessor included
95 :type matrix-like)
96 (weights
97 :initform nil
98 :initarg :weights
99 :accessor weights
100 :type matrix-like)
101 (weights-types
102 :initform nil
103 :initarg :weight-types
104 :accessor weight-types
105 :type matrix-like)
107 ;; computational artifacts for storage; when NEEDS-COMPUTING is
108 ;; NIL, these are up-to-date.
109 (basis
110 :initform nil
111 :accessor basis
112 :type matrix-like)
113 (estimates
114 :initform nil
115 :initarg :estimates
116 :accessor estimates
117 :type vector-like)
118 (estimates-covariance-matrix
119 :initform nil
120 :initarg :estimates-covariance-matrix
121 :accessor covariance-matrix
122 :type matrix-like)
123 (total-sum-of-squares
124 :initform 0d0
125 :accessor tss
126 :type number)
127 (residual-sum-of-squares
128 :initform 0d0
129 :accessor rss
130 :type number)
131 (doc-string
132 :initform "No info given."
133 :initarg :doc
134 :accessor doc-string
135 :type string))
136 (:documentation "Normal Linear Regression Model _FIT_ through CLOS."))
138 ;;;;;;;; Helper functions
140 ;;(declaim (ftype (function (matrix-like) matrix-like) xtxinv))
142 (defun xtxinv (x)
143 "In: X Out: (XtX)^-1
145 X is NxP, resulting in PxP. Represents Var[\hat\beta], the varest for
146 \hat \beta from Y = X \beta + \eps. Done by Cholesky decomposition,
147 with LAPACK's dpotri routine, factorizing with dpotrf.
149 <example>
150 (let ((m1 (rand 7 5)))
151 (xtxinv m1))
152 </example>"
153 (check-type x matrix-like)
154 (minv-cholesky (m* (transpose x) x)))
157 (let ((myx (if intercept
158 (bind2 (ones (matrix-dimension x 0) 1)
159 x :by :column)
160 x)))
163 ;; might add args: (method 'gelsy), or do we want to put a more
164 ;; general front end, linear-least-square, across the range of
165 ;; LAPACK solvers?
166 (defun lm (x y &key (intercept T) rcond)
167 "fit the linear model:
168 y = x \beta + e
170 and estimate \beta. X,Y should be in cases-by-vars form, i.e. X
171 should be n x p, Y should be n x 1. Returns estimates, n and p.
172 Probably should return a form providing the call, as well.
174 R's lm object returns: coefficients, residuals, effects, rank, fitted,
175 qr-results for numerical considerations, DF_resid. Need to
176 encapsulate into a class or struct."
177 (check-type x matrix-like)
178 (check-type y vector-like) ; vector-like might be too strict?
179 (assert (= (nrows y) (nrows x)) ; same number of obsns/cases
180 (x y) "Can not multiply x:~S by y:~S" x y)
181 (let ((x1 (if intercept
182 (bind2 (ones (matrix-dimension x 0) 1)
183 x :by :column)
184 x)))
185 (let ((betahat (gelsy (m* (transpose x1) x1)
186 (m* (transpose x1) y)
187 (if rcond rcond
188 (* (coerce (expt 2 -52) 'double-float)
189 (max (nrows x1)
190 (ncols y))))))
191 (betahat1 (gelsy x1
193 (if rcond rcond
194 (* (coerce (expt 2 -52) 'double-float)
195 (max (nrows x1)
196 (ncols y)))))))
197 (format t "")
198 (list betahat ; LA-SIMPLE-VECTOR-DOUBLE
199 betahat1 ; LA-SLICE-VECVIEW-DOUBLE
200 (xtxinv x1); (sebetahat betahat x y) ; TODO: write me!
201 (nrows x) ; surrogate for n
202 (ncols x1) ; surrogate for p
203 (v- (first betahat) (first betahat1)) ))))
205 (defun regression-model
206 (x y &key (intercept T)
207 (predictor-names nil) (response-name nil) (case-labels nil)
208 (doc "Undocumented Regression Model Instance"))
209 "Args: (x y &key (intercept T) (print T) (weights nil)
210 included predictor-names response-name case-labels)
211 X - list of independent variables or X matrix
212 Y - dependent variable.
213 INTERCEPT - T to include (default), NIL for no intercept
214 PRINT - if not NIL print summary information
215 WEIGHTS - if supplied should be the same length as Y; error
216 variances are
217 assumed to be inversely proportional to WEIGHTS
218 PREDICTOR-NAMES, RESPONSE-NAME, CASE-LABELS
219 - sequences of strings or symbols.
220 INCLUDED - if supplied, should be length Y or 1, with
221 elements nil to skip or T to include for computing estimates
222 (always included in residual analysis).
223 Returns a regression model object."
224 (check-type x matrix-like)
225 (check-type y vector-like)
226 (let ((newmodel
227 (make-instance 'regression-model-class
228 :y y
229 :x x
230 :interceptp intercept
231 :case-labels case-labels
232 :covariate-names predictor-names
233 :response-name response-name
234 :docs doc )))
235 newmodel))
237 (defun fit-model (model &key (included T) (wgts nil) (docs "No Docs"))
238 (let ((result (make-instance 'regression-model-fit-class
239 :data-model model
240 :needs-computing T
241 :included included
242 :weights wgts
243 :estimates (first (lm (design-matrix model)
244 (dependent-response model)
245 :intercept (interceptp model)))
246 :estimates-covariance-matrix
247 (xtxinv (design-matrix model))
248 :doc docs)))
249 result))
251 (defmethod print-object ((obj regression-model-fit-class) stream)
252 (print-unreadable-object (obj stream :type t)
253 (format stream "Estimates: ~%~A~%" (estimates obj))
254 (format stream "Covariance: ~%~A~%" (covariance-matrix obj))
255 (format stream "Doc: ~%~A~%" (doc-string obj))))
257 (defmethod print-object ((obj regression-model-class) stream)
258 "Need better formatting and output -- clearly this is a WRONG EXAMPLE."
259 (print-unreadable-object (obj stream :type t)
260 (format stream "Response: ~%~A~%" (dependent-response obj))
261 (format stream "Design: ~%~A~%" (design-matrix obj))
262 (format stream "Covariates: ~%~A~%" (covariate-names obj))
263 (format stream "Doc: ~%~A~%" (doc-string obj))))
265 ;;; Computing and Display Methods
267 ;; [X|Y]t [X|Y]
268 ;; = XtX XtY
269 ;; YtX YtY
270 ;; so with (= (dim X) (list n p))
271 ;; we end up with p x p p x 1
272 ;; 1 x p 1 x 1
274 ;; and this can be implemented by
276 (setf XY (bind2 X Y :by :row))
277 (setf XYtXY (m* (transpose XY) XY))
279 ;; which is too procedural. Sigh, I meant
281 (setf XYtXY (let ((XY (bind2 X Y :by :row)))
282 (m* (transpose XY) XY)))
284 ;; which at least looks lispy.