;;; -*- mode: lisp -*-

;;; Time-stamp:  <2009-03-17 20:39:16 tony>
;;; Creation:    <2008-09-08 08:06:30 tony>
;;; File:        TODO.lisp
;;; Author:      AJ Rossini <blindglobe@gmail.com>
;;; Copyright:   (c) 2007-2008, AJ Rossini <blindglobe@gmail.com>.  BSD.
;;; Purpose:     Stuff that still needs to be made to work sits inside the progns...

;;; What is this talk of 'release'?  Klingons do not make software
;;; 'releases'.  Our software 'escapes', leaving a bloody trail of
;;; designers and quality assurance people in its wake.

;;; This file contains the current challenges to solve, including a
;;; description of the setup and the work needed to solve them.

;;; SET UP

(in-package :cl-user)
;;(asdf:oos 'asdf:compile-op 'lispstat)
;;(asdf:oos 'asdf:load-op 'lispstat)

(in-package :lisp-stat-unittests)

;; tests = 54, failures = 7, errors = 3
(describe (run-tests :suite 'lisp-stat-ut))
(run-tests :suite 'lisp-stat-ut)

;; FIXME: Example: currently not relevant, yet
(describe
 (lift::run-test
  :test-case 'lisp-stat-unittests::create-proto
  :suite 'lisp-stat-unittests::lisp-stat-ut-proto))

(in-package :ls-user)

(progn ;; Data setup

  (defparameter *indep-vars-2-matrix*
    (make-matrix (length iron) 2
                 :initial-contents
                 (mapcar #'(lambda (x y)
                             (list (coerce x 'double-float)
                                   (coerce y 'double-float)))
                         iron aluminum)))

  (defparameter *dep-var*
    (make-vector (length absorbtion)
                 :type :row
                 :initial-contents
                 (list
                  (mapcar #'(lambda (x) (coerce x 'double-float))
                          absorbtion))))

  (defparameter *dep-var-int*
    (make-vector (length absorbtion)
                 :type :row
                 :element-type 'integer
                 :initial-contents (list absorbtion)))

  (defparameter *xv+1a*
    (make-matrix
     8 2
     :initial-contents #2A((1d0 1d0)
                           (1d0 3d0)
                           (1d0 2d0)
                           (1d0 4d0)
                           (1d0 3d0)
                           (1d0 5d0)
                           (1d0 4d0)
                           (1d0 6d0))))

  (defparameter *xv+1b*
    (bind2
     (ones 8 1)
     (make-matrix
      8 1
      :initial-contents '((1d0)
                          (3d0)
                          (2d0)
                          (4d0)
                          (3d0)
                          (5d0)
                          (4d0)
                          (6d0)))
     :by :column))

  (m= *xv+1a* *xv+1b*) ; => T

  (princ "Data Set up"))

(progn
  ;; REVIEW: general Lisp use guidance

  (fdefinition 'make-matrix)
  (documentation 'make-matrix 'function)

  #| Examples from CLHS, a bit of guidance.

  ;; This function assumes its callers have checked the types of the
  ;; arguments, and authorizes the compiler to build in that assumption.
  (defun discriminant (a b c)
    (declare (number a b c))
    "Compute the discriminant for a quadratic equation."
    (- (* b b) (* 4 a c))) =>  DISCRIMINANT
  (discriminant 1 2/3 -2) =>  76/9

  ;; This function assumes its callers have not checked the types of the
  ;; arguments, and performs explicit type checks before making any assumptions.
  (defun careful-discriminant (a b c)
    "Compute the discriminant for a quadratic equation."
    (check-type a number)
    (check-type b number)
    (check-type c number)
    (locally (declare (number a b c))
      (- (* b b) (* 4 a c)))) =>  CAREFUL-DISCRIMINANT
  (careful-discriminant 1 2/3 -2) =>  76/9
  |#
  )
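
;; A minimal sketch applying the CAREFUL-DISCRIMINANT pattern above --
;; explicit CHECK-TYPEs at the boundary before coercing -- to the kind
;; of list->double-float conversion done in the data setup progn.  The
;; helper name here is ours, not an existing CLS function.
(defun coerce-to-doubles (xs)
  "Coerce a list of reals to a list of DOUBLE-FLOATs, checking types first."
  (check-type xs list)
  (mapcar #'(lambda (x)
              (check-type x real)
              (coerce x 'double-float))
          xs))
;; (coerce-to-doubles '(1 2/3 3.5)) => (1.0d0 0.6666666666666666d0 3.5d0)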

#+nil
(progn ;; FIXME: Regression modeling

  ;; data setup in previous FIXME

  (defparameter *m* nil
    "holding variable.")

  ;; need to make vectors and matrices from the lists...

  ;; BROKEN
  (def *m* (regression-model (list->vector-like iron)
                             (list->vector-like absorbtion)))

  (def m (regression-model (list->vector-like iron)
                           (list->vector-like absorbtion) :print nil))
  ;; Good
  (send m :print)
  (send m :own-slots)
  (send m :own-methods)
  ;; (lsos::ls-objects-methods m) ; bogus?
  (send m :show)

  (def m (regression-model (list->vector-like iron)
                           (list->vector-like absorbtion)))

  (def m (regression-model (listoflists->matrix-like (list iron aluminum))
                           (list->vector-like absorbtion) :print nil))

  (send m :compute)
  (send m :sweep-matrix)
  (format t "~%~A~%" (send m :sweep-matrix))

  ;; need to get multiple linear regression working (simple linear
  ;; regression works)... to do this, we need to redo the whole numeric
  ;; structure; I'm keeping these in as examples of brokenness...

  (send m :basis) ;; this should be positive?
  (send m :coef-estimates) )

#+nil
(progn ;; FIXME: Need to clean up data examples, licenses, attributions, etc.

  ;; The following breaks because we should use a package to hold
  ;; configuration details; it would be the only package outside of
  ;; packages.lisp, since it holds the overall defsystem structure.
  (load-data "iris.lsp")  ;; (the above partially fixed).
  (variables)
  diabetes )

(progn ;; FIXME: read data from CSV file.  To do.

  ;; The challenge is to ensure that we get mixed arrays when we want
  ;; them, and single-type (simple) arrays in other cases.
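
  ;; A quick illustration of the distinction above, in plain CL: a
  ;; specialized (single-type) array versus a general array that can
  ;; hold mixed field types as they come out of a CSV row.
  #+nil
  (list (make-array 3 :element-type 'double-float
                      :initial-contents '(1d0 2d0 3d0))
        (make-array 3 :initial-contents (list 1d0 "two" :three)))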

  (defparameter *csv-num*
    (cybertiggyr-dsv::load-escaped
     #p"/media/disk/Desktop/sandbox/CLS.git/Data/example-numeric.csv"
     :field-separator #\,
     :trace T))

  (nth 0 (nth 0 *csv-num*))

  (defparameter *csv-num*
    (cybertiggyr-dsv::load-escaped
     #p"/media/disk/Desktop/sandbox/CLS.git/Data/example-numeric2.dsv"
     :field-separator #\:))

  (nth 0 (nth 0 *csv-num*))

  ;; The handling of these types should be comparable to what we do for
  ;; matrices, but without the numerical processing; i.e. mref, bind2,
  ;; make-dataframe, and the class structure should be similar.

  ;; With numerical data, there should be a straightforward mapping from
  ;; the data.frame to a matrix.  With categorical data (including dense
  ;; categories such as doc-strings, as well as sparse categories such as
  ;; binary data), we need to include metadata about ordering, coding,
  ;; and such.  So the structures should probably carry that metadata
  ;; explicitly.
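
  ;; A sketch of what such a structure might hold (class and slot names
  ;; here are hypothetical, not existing CLS classes): the coded values
  ;; plus the ordering/coding metadata discussed above.
  #+nil
  (defclass categorical-column ()
    ((codes     :initarg :codes     :accessor column-codes
                :documentation "the observed codes, as stored")
     (levels    :initarg :levels    :accessor column-levels
                :documentation "ordered list of admissible codes")
     (ordered-p :initarg :ordered-p :accessor column-ordered-p
                :documentation "T if the coding carries an ordering"))
    (:documentation "Categorical data plus its coding metadata."))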

  ;; Using the CSV file:

  (defun parse-number (s)
    "Parse the string S as a number; return NIL if it does not read as one."
    (let* ((*read-eval* nil)
           (n (read-from-string s)))
      (when (numberp n) n)))

  (parse-number "34")
  (parse-number "34 ")
  (parse-number " 34")
  (parse-number " 34 ")

  (+ (parse-number "3.4") 3)
  (parse-number "3.4 ")
  (parse-number " 3.4")
  (+ (parse-number " 3.4 ") 3)

  (parse-number "a")

  ;; (coerce "2.3" 'number) => ERROR
  ;; (coerce "2" 'float)    => ERROR

  (defparameter *csv-num*
    (cybertiggyr-dsv::load-escaped
     #p"/media/disk/Desktop/sandbox/CLS.git/Data/example-numeric.csv"
     :field-separator #\,
     :filter #'parse-number
     :trace T))

  (nth 0 (nth 0 *csv-num*))

  (defparameter *csv-num*
    (cybertiggyr-dsv::load-escaped
     #p"/media/disk/Desktop/sandbox/CLS.git/Data/example-numeric2.dsv"
     :field-separator #\:
     :filter #'parse-number))

  (nth 0 (nth 0 *csv-num*))

  ;; now we've got the DSV code in the codebase, auto-loaded I hope:
  cybertiggyr-dsv:*field-separator*

  (defparameter *example-numeric.csv*
    (cybertiggyr-dsv:load-escaped "Data/example-numeric.csv"
                                  :field-separator #\,))

  *example-numeric.csv*

  ;; the following fails because we've got a bit of string conversion to
  ;; do.  Two thoughts: #1, modify the DSV package, but that mucks with
  ;; its encapsulation; #2, add a coercion tool (better, but potentially
  ;; inefficient).
  #+nil(coerce (nth 3 (nth 3 *example-numeric.csv*)) 'double-float)
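
  ;; Sketch of thought #2 above: do the coercion after loading, outside
  ;; the DSV package.  PARSE-NUMBER is the helper defined earlier in this
  ;; file; fields that do not read as numbers are left as strings.
  ;; (Like PARSE-NUMBER itself, this assumes non-empty fields.)
  #+nil
  (defun coerce-fields (rows)
    (mapcar #'(lambda (row)
                (mapcar #'(lambda (field)
                            (or (parse-number field) field))
                        row))
            rows))
  #+nil
  (coerce-fields *example-numeric.csv*)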

  ;; cases, from simple to not-so-simple:
  (defparameter *test-string1* "1.2")
  (defparameter *test-string2* " 1.2")
  (defparameter *test-string3* " 1.2 ")
  )

#+nil
(progn ;; experiments with GSL and the Lisp interface.

  (asdf:oos 'asdf:load-op 'gsll)
  (asdf:oos 'asdf:load-op 'gsll-tests)

  ;; the following should be equivalent
  (setf *t1* (list 6.18d0 6.647777777777779d0 6.18d0))
  (setf *t2* (multiple-value-list
              (let ((vec
                     (gsll:make-marray 'double-float
                                       :initial-contents '(-3.21d0 1.0d0 12.8d0)))
                    (weights
                     (gsll:make-marray 'double-float
                                       :initial-contents '(3.0d0 1.0d0 2.0d0))))
                (let ((mean (gsll:mean vec)))
                  (list (gsll:absolute-deviation vec)
                        (gsll:weighted-absolute-deviation vec weights)
                        (gsll:absolute-deviation vec mean))))))
  ;; note: EQL compares object identity, so it is always NIL here; also,
  ;; *t2* is the MULTIPLE-VALUE-LIST wrapping of the result.  See the
  ;; element-wise comparison sketched just below.
  (eql *t1* *t2*)
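
  ;; A minimal element-wise comparison instead (assumption: *t2* is the
  ;; MULTIPLE-VALUE-LIST wrapping of the result, so its FIRST is the list
  ;; we want), with a tolerance rather than exact float equality:
  #+nil
  (every #'(lambda (a b) (< (abs (- a b)) 1d-8))
         *t1* (first *t2*))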

  ;; from (gsll:examples 'gsll::numerical-integration) ...
  (gsll:integration-qng gsll::one-sine 0.0d0 PI)

  (gsll:defun-single axpb (x) (+ (* 2 x) 3))  ;; a<-2, b<-3
  (gsll:integration-qng axpb 1d0 2d0)

  (let ((a 2)
        (b 3))
    (gsll:defun-single axpb2 (x) (+ (* a x) b)))
  (gsll:integration-qng axpb2 1d0 2d0)

  ;; BAD
  ;; (gsll:integration-qng
  ;;  (let ((a 2)
  ;;        (b 3))
  ;;    (defun-single axpb2 (x) (+ (* a x) b)))
  ;;  1d0 2d0)

  ;; right, but weird expansion...
  (gsll:integration-qng
   (let ((a 2)
         (b 3))
     (defun axpb2 (x) (+ (* a x) b))
     (gsll:def-single-function axpb2)
     axpb2)
   1d0 2d0)

  ;; Linear least squares

  (gsll:gsl-lookup "gsl_linalg_LU_decomp") ; => gsll:lu-decomposition
  (gsll:gsl-lookup "gsl_linalg_LU_solve")  ; => gsll:lu-solve
  )

#+nil
(progn ;; philosophy time

  (setf my-model (model :name "ex1"
                        :data-slots (list w x y z)
                        :param-slots (list alpha beta gamma)
                        :math-form (regression-model
                                    :formula '(= w (+ (* beta x)
                                                      (* alpha y)
                                                      (* gamma z)
                                                      normal-error))
                                    :centrality 'median ; 'mean
                                    )))
  #| or:
  #R"W ~ x + y + z"
  |#

  (setf my-dataset (statistical-table :table data-frame-contents
                                      :metadata (list (:case-names (list ))
                                                      (:var-names (list ))
                                                      (:documentation "string of doc"))))

  (setf my-analysis (analysis
                     :model my-model
                     :data my-dataset
                     :parameter-map (pairing (model-param-slots my-model)
                                             (data-var-names my-dataset))))

  ;; ontological implications -- the analysis is an abstract class of
  ;; data, model, and mapping between the model and data.  The fit is
  ;; the instantiation of such.  This provides a statistical object
  ;; computation theory which can be realized as "executable
  ;; statistics" or "computable statistics".
  (setf my-analysis (analyze my-fit
                             :estimation-method 'linear-least-squares-regression))

  ;; one of the tricks here is that one needs to provide the structure
  ;; from which to consider estimation and, more importantly, the
  ;; validity of the estimation.
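
  ;; A sketch of the abstraction described above (all names here are
  ;; hypothetical): an analysis ties together a model, a dataset, and the
  ;; map between model parameter/variable slots and dataset columns; a
  ;; fit would then be what results from running an estimation method on it.
  #+nil
  (defclass analysis ()
    ((model         :initarg :model         :accessor analysis-model)
     (data          :initarg :data          :accessor analysis-data)
     (parameter-map :initarg :parameter-map :accessor analysis-parameter-map
                    :documentation "pairing of model slots with data variable names"))
    (:documentation "Abstract combination of model, data, and their mapping."))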

  (setf linear-least-squares-regression
        (estimation-method-definition
         :variable-definitions ((list
                                 ;; from MachLearn: supervised,
                                 ;; unsupervised
                                 :data-response-vars list-drv ; nil if unsup
                                 :param-vars list-pv
                                 :data-predictor-vars list-dpv
                                 ;; nil in this case.  these
                                 ;; describe "out-of-box" specs
                                 :hyper-vars list-hv))
         :form '(regression-additive-error
                 :central-form (linear-form drv pv dpv)
                 :error-form 'normal-error)
         :resulting-decision '(point-estimation interval-estimation)
         :philosophy 'frequentist
         :documentation "use least squares to fit a linear regression
model to data."))

  (defparameter *statistical-philosophies*
    '(frequentist bayesian fiducial decision-analysis)
    "can be combined to build decision-making approaches and
characterizations")

  (defparameter *decisions*
    '(estimation selection testing)
    "possible results from a...")
  ;; is this really true?  One can embed hypothesis testing within
  ;; estimation, with the selected hypothesis as the estimand.  And
  ;; categorical/continuous rear their ugly heads, but not really in
  ;; an essential way.

  (defparameter *ontology-of-decision-procedures*
    (list :decisions
          (list :estimation
                (list :point
                      (list :maximum-likelihood
                            :minimum-entropy
                            :least-squares
                            :method-of-moments)
                      :interval
                      (list :maximum-likelihood)))
          :testing
          (list :fisherian
                :neyman-pearson
                (list :traditional
                      :bioequivalence-inversion)
                :selection
                (list :ranking
                      :top-k-of-n-select))
          :parametric
          :partially-parametric)
    "start of ontology"))

;;;; LM

(progn

  (defparameter *y*
    (make-vector
     8
     :type :row
     :initial-contents '((1d0 2d0 3d0 4d0 5d0 6d0 7d0 8d0))))

  (defparameter *xv+1*
    (make-matrix
     8 2
     :initial-contents '((1d0 1d0)
                         (1d0 3d0)
                         (1d0 2d0)
                         (1d0 4d0)
                         (1d0 3d0)
                         (1d0 5d0)
                         (1d0 4d0)
                         (1d0 6d0))))

  ;; so something like (NOTE: matrices are transposed to begin with,
  ;; hence the incongruity):
  (defparameter *xtx-2* (m* (transpose *xv+1*) *xv+1*))
  ;; #<LA-SIMPLE-MATRIX-DOUBLE 2 x 2
  ;;   8.0d0 28.0d0
  ;;  28.0d0 116.0d0>

  (defparameter *xty-2* (m* (transpose *xv+1*) (transpose *y*)))
  ;; #<LA-SIMPLE-VECTOR-DOUBLE (2 x 1)
  ;;   36.0d0
  ;;  150.0d0>

  (defparameter *rcond-2* 0.000001)
  (defparameter *betahat-2* (gelsy *xtx-2* *xty-2* *rcond-2*))

  ;; *xtx-2* => "details of complete orthogonal factorization",
  ;; according to man page:
  ;; #<LA-SIMPLE-MATRIX-DOUBLE 2 x 2
  ;;  -119.33147112141039d0 -29.095426104883202d0
  ;;     0.7873402682880205d0 -1.20672274167718d0>

  ;; *xty-2* => output becomes solution:
  ;; #<LA-SIMPLE-VECTOR-DOUBLE (2 x 1)
  ;;  -0.16666666666668312d0
  ;;   1.333333333333337d0>
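
  ;; Since GELSY overwrites both of its matrix arguments (as the comments
  ;; above note), the call above could instead be written against copies
  ;; if *xtx-2* and *xty-2* are to be kept intact:
  #+nil
  (gelsy (copy *xtx-2*) (copy *xty-2*) *rcond-2*)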

  *betahat-2* ; which matches R, see below

  (documentation 'gelsy 'function)

  ;; (#<LA-SIMPLE-VECTOR-DOUBLE (2 x 1)
  ;;    -0.16666666666668312 1.333333333333337>
  ;;  2)
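
  ;; A quick sanity check using only operations already used above: the
  ;; fitted values X \hat\beta (an 8x1 vector) should be close to the
  ;; observed (transpose *y*); the differences are the residuals shown
  ;; in the R summary below.
  #+nil
  (m* *xv+1* (first *betahat-2*))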

  ;; ## Test case in R:
  ;; x <- c( 1.0, 3.0, 2.0, 4.0, 3.0, 5.0, 4.0, 6.0)
  ;; y <- c( 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0)
  ;; lm(y~x)
  ;; ## => Call:  lm(formula = y ~ x)
  ;;
  ;; Coefficients:  (Intercept)            x
  ;;                    -0.1667       1.3333
  ;;
  ;; summary(lm(y~x))
  ;; ## =>
  ;; Call:
  ;; lm(formula = y ~ x)
  ;;
  ;; Residuals:
  ;;        Min         1Q     Median         3Q        Max
  ;; -1.833e+00 -6.667e-01 -3.886e-16  6.667e-01  1.833e+00
  ;;
  ;; Coefficients:
  ;;             Estimate Std. Error t value Pr(>|t|)
  ;; (Intercept)  -0.1667     1.1587  -0.144  0.89034
  ;; x             1.3333     0.3043   4.382  0.00466 **
  ;; ---
  ;; Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
  ;;
  ;; Residual standard error: 1.291 on 6 degrees of freedom
  ;; Multiple R-squared: 0.7619,     Adjusted R-squared: 0.7222
  ;; F-statistic:  19.2 on 1 and 6 DF,  p-value: 0.004659

  ;; which suggests what one might do (modulo ensuring correct
  ;; orientations).  When this is finalized, it should migrate to CLS.

  (defparameter *n* 20)  ; # rows = # obsns
  (defparameter *p* 10)  ; # cols = # vars
  (defparameter *x-temp* (rand *n* *p*))
  (defparameter *b-temp* (rand *p* 1))
  (defparameter *y-temp* (m* *x-temp* *b-temp*))
  ;; so Y = X b + \eps
  (defparameter *rcond* (* (coerce (expt 2 -52) 'double-float)
                           (max (nrows *x-temp*) (ncols *y-temp*))))
  (defparameter *orig-x* (copy *x-temp*))
  (defparameter *orig-b* (copy *b-temp*))
  (defparameter *orig-y* (copy *y-temp*))

  (defparameter *lm-result* (lm *x-temp* *y-temp*))
  (princ (first *lm-result*))
  (princ (second *lm-result*))
  (princ (third *lm-result*))
  (v= (third *lm-result*)
      (v- (first (first *lm-result*))
          (first (second *lm-result*))))

  ;; Some issues exist in the LAPACK vs. LINPACK variants, hence R
  ;; uses LINPACK primarily, rather than LAPACK.  See comments in R
  ;; source for issues.

  ;; Goal is to start from X, Y and realize that if
  ;;     Y = X \beta + \eps        (dimensions: 8x1 = (8xp)(px1) + 8x1)
  ;; then the normal equations are
  ;;     Xt X \hat\beta = Xt Y
  ;; so that we can solve an equation of the form W \hat\beta = Z, where
  ;; W and Z are known, to estimate \beta.

  ;; Solving the normal equations directly is known to be numerically
  ;; unstable -- some processing of X (a QR or similar orthogonal
  ;; transformation) is preferred and should be done first; most of the
  ;; transformation-based work does precisely that.
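
  ;; In other words, rather than forming XtX explicitly as above, one
  ;; could hand X and Y straight to GELSY and let its complete orthogonal
  ;; factorization do the stabilizing transformation -- assuming the
  ;; wrapper accepts the overdetermined 8x2 / 8x1 system directly:
  #+nil
  (gelsy (copy *xv+1*) (copy (transpose *y*)) *rcond-2*)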

  ;; recall:  Var[Y] = E[(Y - E[Y])(Y - E[Y])t]
  ;;                 = E[Y Yt] - 2 \mu \mut + \mu \mut
  ;;                 = E[Y Yt] - \mu \mut
  ;;
  ;; and in the scalar case,  Var Y = E[Y^2] - \mu^2

  ;; For initial estimates of the covariance of \hat\beta:
  ;;     \hat\beta = (Xt X)^-1 Xt Y
  ;; with
  ;;     E[\hat\beta] = E[(Xt X)^-1 Xt Y]
  ;;                  = E[(Xt X)^-1 Xt (X \beta + \eps)]
  ;;                  = \beta
  ;;
  ;; So  Var[\hat\beta] = \sigma^2 (Xt X)^-1
  ;; and this gives  SE(\beta_i) = (* (sqrt (mref Var i i)) adjustment)
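
  ;; Sketch of the error-variance piece of the SE computation above,
  ;; using only operations already appearing in this file (m*, m-,
  ;; transpose, mref, nrows, ncols):  \hat\sigma^2 = RSS / (n - p).
  ;; The remaining factor, [(Xt X)^-1]_ii, still needs an inverse (or a
  ;; solve against the identity), which is part of the open work here.
  #+nil
  (let* ((betahat (first *betahat-2*))
         (fitted  (m* *xv+1* betahat))
         (resid   (m- (transpose *y*) fitted))
         (rss     (mref (m* (transpose resid) resid) 0 0))
         (n       (nrows *xv+1*))
         (p       (ncols *xv+1*)))
    (/ rss (- n p)))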

  ;; from docs:

  (setf *temp-result*
        (let ((*default-implementation* :foreign-array))
          (let* ((m 10)
                 (n 10)
                 (a (rand m n))
                 (x (rand n 1))
                 (b (m* a x))
                 (rcond (* (coerce (expt 2 -52) 'double-float)
                           (max (nrows a) (ncols a))))
                 (orig-a (copy a))
                 (orig-b (copy b))
                 (orig-x (copy x)))
            (list x (gelsy a b rcond))
            ;; no applicable conversion?
            ;; (m- (#<FA-SIMPLE-VECTOR-DOUBLE (10 x 1))
            ;;     (#<FA-SIMPLE-VECTOR-DOUBLE (10 x 1)) )
            (v- x (first (gelsy a b rcond))))))

  (princ *temp-result*)

  (setf *temp-result*
        (let ((*default-implementation* :lisp-array))
          (let* ((m 10)
                 (n 10)
                 (a (rand m n))
                 (x (rand n 1))
                 (b (m* a x))
                 (rcond (* (coerce (expt 2 -52) 'double-float)
                           (max (nrows a) (ncols a))))
                 (orig-a (copy a))
                 (orig-b (copy b))
                 (orig-x (copy x)))
            (list x (gelsy a b rcond))
            (m- x (first (gelsy a b rcond))))))

  (princ *temp-result*)

  (defparameter *xv*
    (make-vector
     8
     :type :row  ;; default, not usually needed!
     :initial-contents '((1d0 3d0 2d0 4d0 3d0 5d0 4d0 6d0))))

  (defparameter *y*
    (make-vector
     8
     :type :row
     :initial-contents '((1d0 2d0 3d0 4d0 5d0 6d0 7d0 8d0))))

  ;; so something like (NOTE: matrices are transposed to begin with,
  ;; hence the incongruity):
  (defparameter *xtx-1* (m* *xv* (transpose *xv*)))
  (defparameter *xty-1* (m* *xv* (transpose *y*)))
  (defparameter *rcond-in* (* (coerce (expt 2 -52) 'double-float)
                              (max (nrows *xtx-1*)
                                   (ncols *xty-1*))))

  (defparameter *betahat* (gelsy *xtx-1* *xty-1* *rcond-in*))

  ;; (#<LA-SIMPLE-VECTOR-DOUBLE (1 x 1)
  ;;    1.293103448275862>
  ;;  1)

  ;; ## Test case in R:
  ;; x <- c( 1.0, 3.0, 2.0, 4.0, 3.0, 5.0, 4.0, 6.0)
  ;; y <- c( 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0)
  ;; lm(y~x-1)
  ;; ## =>
  ;; Call:
  ;; lm(formula = y ~ x - 1)
  ;;
  ;; Coefficients:
  ;;     x
  ;; 1.293

  (first *betahat*))

#+nil
(progn ;; FIXME: dataframe-array experiments

  (let ((df (make-instance 'dataframe-array)))
    df)

  (let ((df2 (make-instance 'dataframe-array
                            :nrows 10
                            :ncols 5)))
    df2))

#+nil
(progn

  (plot-ex))