clean up of regression code/design/comments.
[CommonLispStat.git] / TODO.lisp
blob40726e1bd352702dc7b114cdd2814c2dceefd835
1 ;;; -*- mode: lisp -*-
3 ;;; Time-stamp: <2009-01-09 11:55:10 tony>
4 ;;; Creation: <2008-09-08 08:06:30 tony>
5 ;;; File: TODO.lisp
6 ;;; Author: AJ Rossini <blindglobe@gmail.com>
7 ;;; Copyright: (c) 2007-2008, AJ Rossini <blindglobe@gmail.com>. BSD.
8 ;;; Purpose: Stuff that needs to be made working sits inside the progns...
10 ;;; What is this talk of 'release'? Klingons do not make software
11 ;;; 'releases'. Our software 'escapes', leaving a bloody trail of
12 ;;; designers and quality assurance people in its wake.
14 ;;; This file contains the current challenges to solve, including a
15 ;;; description of the setup and the work to solve....
17 ;;; SET UP
19 (in-package :cl-user)
20 ;;(asdf:oos 'asdf:compile-op 'lispstat)
21 ;;(asdf:oos 'asdf:load-op 'lispstat)
24 (in-package :lisp-stat-unittests)
26 (describe (run-tests :suite 'lisp-stat-ut))
27 (run-tests :suite 'lisp-stat-ut)
29 ;; tests = 54, failures = 7, errors = 3
31 (in-package :ls-user)
33 ;;; FIXME: Example: currently not relevant, yet
35 (describe
36 (lift::run-test
37 :test-case 'lisp-stat-unittests::create-proto
38 :suite 'lisp-stat-unittests::lisp-stat-ut-proto))
41 :;; FIXME: data frames and structural inheritance
43 ;; Serious flaw -- need to consider that we are not really well
44 ;; working with the data structures, in that Luke created compound as
45 ;; a base class, which turns out to be slightly backward if we are to
46 ;; maintain the numerical structures as well as computational
47 ;; efficiency.
50 #+nil
51 (progn ;; FIXME: Regression modeling
53 (defparameter m nil
54 "holding variable.")
55 ;; need to make vectors and matrices from the lists...
57 (def m (regression-model (list->vector-like iron)
58 (list->vector-like absorbtion) :print nil))
59 ;;Good
60 (send m :print)
61 (send m :own-slots)
62 (send m :own-methods)
63 ;; (lsos::ls-objects-methods m)
64 (send m :show)
66 (def m (regression-model (list->vector-like iron)
67 (list->vector-like absorbtion)))
69 (def m (regression-model (listoflists->matrix-like (list iron aluminum))
70 (list->vector-like absorbtion) :print nil))
72 (defparameter *indep-vars-1-matrix*
73 (make-matrix 1 (length iron)
74 :initial-contents
75 (list (mapcar #'(lambda (x) (coerce x 'double-float))
76 iron))))
77 ;; *indep-vars-1-matrix*
79 (defparameter *indep-vars-2-matrix*
80 (make-matrix 2 (length iron)
81 :initial-contents
82 (list
83 (mapcar #'(lambda (x) (coerce x 'double-float))
84 iron)
85 (mapcar #'(lambda (x) (coerce x 'double-float))
86 aluminum))))
87 ;; *indep-vars-2-matrix*
90 ;; FAILS due to coercion issues; it just isn't lispy, it's R'y.
91 ;; (defparameter *dep-var* (make-vector (length absorbtion)
92 ;; :initial-contents (list absorbtion)))
93 ;; BUT this should be the right type.
94 (defparameter *dep-var*
95 (make-vector (length absorbtion)
96 :type :row
97 :initial-contents
98 (list
99 (mapcar #'(lambda (x) (coerce x 'double-float))
100 absorbtion))))
101 ;; *dep-var*
104 (defparameter *dep-var-int*
105 (make-vector (length absorbtion)
106 :type :row
107 :element-type 'integer
108 :initial-contents (list absorbtion)))
110 (typep *dep-var* 'matrix-like) ; => T
111 (typep *dep-var* 'vector-like) ; => T
113 (typep *indep-vars-1-matrix* 'matrix-like) ; => T
114 (typep *indep-vars-1-matrix* 'vector-like) ; => T
115 (typep *indep-vars-2-matrix* 'matrix-like) ; => T
116 (typep *indep-vars-2-matrix* 'vector-like) ; => F
118 (def m1 (regression-model-new *indep-vars-1-matrix* *dep-var* ))
119 (def m2 (regression-model-new *indep-vars-2-matrix* *dep-var* ))
121 iron
122 ;; following fails, need to ensure that we work on list elts, not just
123 ;; elts within a list:
124 ;; (coerce iron 'real)
126 ;; the following is a general list-conversion coercion approach -- is
127 ;; there a more efficient way?
128 (mapcar #'(lambda (x) (coerce x 'double-float)) iron)
130 (coerce 1 'real)
132 (send m :compute)
133 (send m :sweep-matrix)
134 (format t "~%~A~%" (send m :sweep-matrix))
136 ;; need to get multiple-linear regression working (simple linear regr
137 ;; works)... to do this, we need to redo the whole numeric structure,
138 ;; I'm keeping these in as example of brokenness...
140 (send m :basis) ;; this should be positive?
141 (send m :coef-estimates) )
143 #+nil
144 (progn ;; FIXME: Need to clean up data examples, licenses, attributions, etc.
145 ;; The following breaks because we should use a package to hold
146 ;; configuration details, and this would be the only package outside
147 ;; of packages.lisp, as it holds the overall defsystem structure.
148 (load-data "iris.lsp") ;; (the above partially fixed).
149 (variables)
150 diabetes )
152 #+nil
153 (progn ;; FIXME: Data.Frames probably deserve to be related to lists --
154 ;; either lists of cases, or lists of variables. We probably do not
155 ;; want to mix them, but want to be able to convert between such
156 ;; structures.
158 (defparameter *my-case-data*
159 '((:cases
160 (:case1 Y Med 3.4 5)
161 (:case2 N Low 3.2 3)
162 (:case3 Y High 3.1 4))
163 (:var-names (list "Response" "Level" "Pressure" "Size"))))
165 *my-case-data*
167 (elt *my-case-data* 1)
168 (elt *my-case-data* 0)
169 (elt *my-case-data* 2) ;; error
170 (elt (elt *my-case-data* 0) 1)
171 (elt (elt *my-case-data* 0) 0)
172 (elt (elt (elt *my-case-data* 0) 1) 0)
173 (elt (elt (elt *my-case-data* 0) 1) 1)
174 (elt (elt (elt *my-case-data* 0) 1) 2)
175 (elt (elt *my-case-data* 0) 3))
177 #+nil
178 (progn ;; FIXME: read data from CSV file. To do.
180 ;; challenge is to ensure that we get mixed arrays when we want them,
181 ;; and single-type (simple) arrays in other cases.
183 (defparameter *csv-num* (read-csv "Data/example-num.csv" :type 'numeric))
184 (defparameter *csv-mix* (read-csv "Data/example-mixed.csv" :type 'data))
186 ;; The handling of these types should be compariable to what we do for
187 ;; matrices, but without the numerical processing. i.e. mref, bind2,
188 ;; make-dataframe, and the class structure should be similar.
190 ;; With numerical data, there should be a straightforward mapping from
191 ;; the data.frame to a matrix. With categorical data (including
192 ;; dense categories such as doc-strings, as well as sparse categories
193 ;; such as binary data), we need to include metadata about ordering,
194 ;; coding, and such. So the structures should probably consider
196 ;; Using the CSV file:
198 (asdf:oos 'asdf:compile-op 'csv :force t)
199 (asdf:oos 'asdf:load-op 'parse-number)
200 (asdf:oos 'asdf:load-op 'csv)
201 (fare-csv:read-csv-file "Data/example-numeric.csv")
203 ;; but I think the cl-csv package is broken, need to use the dsv-style
204 ;; package.
206 ;; now we've got the DSV code in the codebase, auto-loaded I hope:
207 cybertiggyr-dsv:*field-separator*
208 (defparameter *example-numeric.csv*
209 (cybertiggyr-dsv:load-escaped "Data/example-numeric.csv"
210 :field-separator #\,))
211 *example-numeric.csv*
213 ;; the following fails because we've got a bit of string conversion
214 ;; to do. 2 thoughts: #1 modify dsv package, but mucking with
215 ;; encapsulation. #2 add a coercion tool (better, but potentially
216 ;; inefficient).
217 #+nil(coerce (nth 3 (nth 3 *example-numeric.csv*)) 'double-float)
219 ;; cases, simple to not so
220 (defparameter *test-string1* "1.2")
221 (defparameter *test-string2* " 1.2")
222 (defparameter *test-string3* " 1.2 ")
228 #+nil
229 (progn ;; experiments with GSL and the Lisp interface.
230 (asdf:oos 'asdf:load-op 'gsll)
231 (asdf:oos 'asdf:load-op 'gsll-tests)
233 ;; the following should be equivalent
234 (setf *t1* (LIST 6.18d0 6.647777777777779d0 6.18d0))
235 (setf *t2* (MULTIPLE-VALUE-LIST
236 (LET ((VEC
237 (gsll:make-marray 'DOUBLE-FLOAT
238 :INITIAL-CONTENTS '(-3.21d0 1.0d0 12.8d0)))
239 (WEIGHTS
240 (gsll:MAKE-MARRAY 'DOUBLE-FLOAT
241 :INITIAL-CONTENTS '(3.0d0 1.0d0 2.0d0))))
242 (LET ((MEAN (gsll:MEAN VEC)))
243 (LIST (gsll:ABSOLUTE-DEVIATION VEC)
244 (gsll:WEIGHTED-ABSOLUTE-DEVIATION VEC WEIGHTS)
245 (gsll:ABSOLUTE-DEVIATION VEC MEAN))))))
246 (eql *t1* *t2*)
248 ;; from (gsll:examples 'gsll::numerical-integration) ...
249 (gsll:integration-qng gsll::one-sine 0.0d0 PI)
252 (defun-single axpb (x) (+ (* 2 x) 3)) ;; a<-2, b<-3
253 (gsll:integration-qng axpb 1d0 2d0)
255 (let ((a 2)
256 (b 3))
257 (defun-single axpb2 (x) (+ (* a x) b)))
258 (gsll:integration-qng axpb2 1d0 2d0)
261 #| BAD
262 (gsll:integration-qng
263 (let ((a 2)
264 (b 3))
265 (defun-single axpb2 (x) (+ (* a x) b)))
266 1d0 2d0)
269 ;; right, but weird expansion...
270 (gsll:integration-qng
271 (let ((a 2)
272 (b 3))
273 (defun axpb2 (x) (+ (* a x) b))
274 (def-single-function axpb2)
275 axpb2)
276 1d0 2d0)
283 #+nil
284 (progn ;; philosophy time
286 (setf my-model (model :name "ex1"
287 :data-slots (list x y z)
288 :param-slots (list alpha beta gamma)
289 :math-form (regression-model :formula '(= y (+ (* beta x)
290 (* alpha y)
291 (* gamma z)
292 normal-error)))))
293 (setf my-dataset (statistical-table :table data-frame-contents
294 :metadata (list (:case-names (list ))
295 (:var-names (list ))
296 (:documentation "string of doc"))))
298 (setf my-analysis (analysis
299 :model my-model
300 :data my-dataset
301 :parameter-map (pairing (model-param-slots my-model)
302 (data-var-names my-dataset))))
304 ;; ontological implications -- the analysis is an abstract class of
305 ;; data, model, and mapping between the model and data. The fit is
306 ;; the instantiation of such. This provides a statistical object
307 ;; computation theory which can be realized as "executable
308 ;; statistics" or "computable statistics".
309 (setf my-analysis (analyze my-fit
310 :estimation-method 'linear-least-squares-regression))
312 ;; one of the tricks here is that one needs to provide the structure
313 ;; from which to consider estimation, and more importantly, the
314 ;; validity of the estimation.
317 (setf linear-least-squares-regression
318 (estimation-method-definition
319 :variable-defintions ((list
320 ;; from MachLearn: supervised,
321 ;; unsupervised
322 :data-response-vars list-drv ; nil if unsup
324 :param-vars list-pv
325 :data-predictor-vars list-dpv
326 ;; nil in this case. these
327 ;; describe "out-of-box" specs
328 :hyper-vars list-hv))
329 :form '(regression-additive-error
330 :central-form (linear-form drv pv dpv)
331 :error-form 'normal-error)
332 :resulting-decision '(point-estimation interval-estimation)
333 :philosophy 'frequentist
334 :documentation "use least squares to fit a linear regression
335 model to data."))
337 (defparameter *statistical-philosophies*
338 '(frequentist bayesian fiducial decision-analysis)
339 "can be combined to build decision-making approaches and
340 characterizations")
342 (defparameter *decisions*
343 '(estimation selection testing)
344 "possible results from a...")
345 ;; is this really true? One can embedded hypothesis testing within
346 ;; estimation, as the hypothesis estimated to select. And
347 ;; categorical/continuous rear their ugly heads, but not really in
348 ;; an essential way.
350 (defparameter *ontology-of-decision-procedures*
351 (list :decisions
352 (list :estimation
353 (list :point
354 (list :maximum-likelihood
355 :minimum-entropy
356 :least-squares
357 :method-of-moments)
358 :interval
359 (list :maximum-likelihood
361 :testing
362 (list :fisherian
363 :neyman-pearson
364 (list :traditional
365 :bioequivalence-inversion)
366 :selection
367 (list :ranking
368 :top-k-of-n-select))
369 :parametric
370 :partially-parametric))
371 "start of ontology")
381 #+nil
382 (progn ;;; QR factorization
383 ;; Need to incorporate the xGEQRF routines, to support linear
384 ;; regression work.
386 ;; Some issues exist in the LAPACK vs. LINPACK variants, hence R
387 ;; uses LINPACK primarily, rather than LAPACK. See comments in R
388 ;; source for issues.
390 ;; LAPACK suggests to use the xGELSY driver (GE general matrix, LS
391 ;; least squares, need to lookup Y intent (used to be an X alg, see
392 ;; release notes).
394 ;; Goal is to start from X, Y and then realize that if
395 ;; Y = X \beta, then, i.e. 8x1 = 8xp px1 + 8x1
396 ;; XtX \hat\beta = Xt Y
397 ;; so that we can solve the equation W \beta = Z where W and Z
398 ;; are known, to estimate \beta.
399 (defparameter *xv*
400 (make-vector
402 :initial-contents '((1d0 3d0 2d0 4d0 3d0 5d0 4d0 6d0))))
404 (defparameter *xv+1*
405 (make-matrix
407 :initial-contents '((1d0 1d0)
408 (1d0 3d0)
409 (1d0 2d0)
410 (1d0 4d0)
411 (1d0 3d0)
412 (1d0 5d0)
413 (1d0 4d0)
414 (1d0 6d0))))
416 (defparameter *xm*
417 (make-matrix
419 :initial-contents '((1d0 3d0 2d0 4d0 3d0 5d0 4d0 6d0)
420 (1d0 2d0 3d0 4d0 5d0 6d0 7d0 8d0))))
422 (defparameter *y*
423 (make-vector
425 :initial-contents '((1d0 2d0 3d0 4d0 5d0 6d0 7d0 8d0))))
427 ;; so something like (NOTE: matrices are transposed to begin with, hence the incongruety)
428 (defparameter *xtx* (m* *xv* (transpose *xv*)))
429 (defparameter *xty* (m* *xv* (transpose *y*)))
430 (defparameter *rcond* 1)
431 (defparameter *betahat* (gelsy *xtx* *xty* *rcond*))
432 *betahat*
435 (#<LA-SIMPLE-VECTOR-DOUBLE (1 x 1)
436 1.293103448275862>
439 ## Test case in R:
440 x <- c( 1.0, 3.0, 2.0, 4.0, 3.0, 5.0, 4.0, 6.0)
441 y <- c( 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0)
442 lm (y ~ x -1)
443 ## =>
444 Call:
445 lm(formula = y ~ x - 1)
447 Coefficients:
449 1.293
453 ;; so something like (NOTE: matrices are transposed to begin with, hence the incongruety)
454 (defparameter *xtx* (m* *xv+1* (transpose *xv+1*)))
455 (defparameter *xty* (m* *xv+1* (transpose *y*)))
456 (defparameter *rcond* 1)
457 (defparameter *betahat* (gelsy *xtx* *xty* *rcond*))
458 *betahat*
462 ;; which suggests one might do (modulo ensuring correct orientations)
463 (defun lm (x y)
464 (let ((betahat (gelsy (m* x (transpose x))
465 (m* x (transpose y)))))
467 (values betahat (sebetahat betahat x y))))
468 ;; to get a results list containing betahat and SEs
470 (values-list '(1 3 4))