make it actually work on filename read-in. Obviously, we need a macro to do it right.
[CommonLispStat.git] / TODO.lisp
blob36abb7b14e5e06926e058e893ea7877c5929a279
1 ;;; -*- mode: lisp -*-
3 ;;; Time-stamp: <2009-04-20 08:08:46 tony>
4 ;;; Creation: <2008-09-08 08:06:30 tony>
5 ;;; File: TODO.lisp
6 ;;; Author: AJ Rossini <blindglobe@gmail.com>
7 ;;; Copyright: (c) 2007-2008, AJ Rossini <blindglobe@gmail.com>. BSD.
8 ;;; Purpose: Stuff that needs to be made working sits inside the
9 ;;; progns... This file contains the current challenges to
10 ;;; solve, including a description of the setup and the work
11 ;;; to solve....
13 ;;; What is this talk of 'release'? Klingons do not make software
14 ;;; 'releases'. Our software 'escapes', leaving a bloody trail of
15 ;;; designers and quality assurance people in its wake.
17 ;;; SET UP
19 (in-package :cl-user)
20 ;;(asdf:oos 'asdf:load-op 'lisp-matrix)
21 ;;(asdf:oos 'asdf:compile-op 'lispstat :force t)
22 ;;(asdf:oos 'asdf:load-op 'lispstat)
24 (in-package :lisp-stat-unittests)
26 ;; tests = 80, failures = 8, errors = 15
27 (run-tests :suite 'lisp-stat-ut)
28 (describe (run-tests :suite 'lisp-stat-ut))
30 ;; FIXME: Example: currently not relevant, yet
31 ;; (describe (lift::run-test :test-case 'lisp-stat-unittests::create-proto
32 ;; :suite 'lisp-stat-unittests::lisp-stat-ut-proto))
34 (describe (lift::run-tests :suite 'lisp-stat-ut-dataframe))
35 (lift::run-tests :suite 'lisp-stat-ut-dataframe)
37 (describe
38 (lift::run-test
39 :test-case 'lisp-stat-unittests::create-proto
40 :suite 'lisp-stat-unittests::lisp-stat-ut-proto))
42 (describe 'lisp-stat-ut)
44 (in-package :ls-user)
46 (progn ;; FIXME: Regression modeling (some data future-ish).
48 ;; TODO:
49 ;; - confirm estimates for multivariate case,
50 ;; - pretty-print output
51 ;; - fix up API -- what do we want this to look like?
53 (defparameter *m*
54 (regression-model (list->vector-like iron) ;; BROKEN
55 (list->vector-like absorbtion))
56 "holding variable.")
58 (defparameter *m-fit*
59 (fit-model *m*))
61 (princ *m*)
62 (princ *m-fit*)
64 (estimates *m-fit*)
65 (covariance-matrix *m-fit*)
67 (defparameter *m3*
68 (regression-model (transpose (listoflist->matrix-like (list iron aluminum)
69 :orientation :row-major))
70 (list->vector-like absorbtion) ))
71 (princ *m3*)
72 (defparameter *m3-fit*
73 (fit-model *m3*))
75 ;; Should the above look something like:
76 (defparameter *m3-fit*
77 (spec-and-fit-model '(absorbtion = iron aluminum)))
78 ;; in which case we split the list before/after the "=" character.
81 (estimates *m3-fit*)
82 (covariance-matrix *m3-fit*))
85 #+nil
86 (progn ;; FIXME: Need to clean up data examples, licenses, attributions, etc.
87 ;; The following breaks because we should use a package to hold
88 ;; configuration details, and this would be the only package outside
89 ;; of packages.lisp, as it holds the overall defsystem structure.
90 (load-data "iris.lsp") ;; (the above partially fixed).
91 (variables)
92 diabetes )
96 (progn ;; Importing data from DSV text files.
98 (defparameter *my-df-2*
99 (make-instance 'dataframe-array
100 :storage
101 (listoflist->array
102 (cybertiggyr-dsv::load-escaped
103 "/media/disk/Desktop/sandbox/CLS.git/Data/example-mixed.csv"))
104 :doc "This is an interesting dataframe-array"))
105 #| :case-labels (list "x" "y")
106 :var-labels (list "a" "b" "c" "d" "e")
109 ;; a better approach is:
110 (asdf:oos 'asdf:load-op 'rsm-string)
111 (rsm.string:file->string-table
112 "/media/disk/Desktop/sandbox/CLS.git/Data/example-mixed.csv")
114 (rsm.string:file->number-table
115 "/media/disk/Desktop/sandbox/CLS.git/Data/example-numeric.csv")
117 (defparameter *my-df-2*
118 (make-instance 'dataframe-array
119 :storage
120 (listoflist->array
121 (transpose-listoflist
122 (rsm.string:file->string-table
123 "/media/disk/Desktop/sandbox/CLS.git/Data/example-mixed.csv")))
124 :doc "This is an interesting dataframe-array"))
125 *my-df-2*
127 (defparameter *my-df-3*
128 (make-instance 'dataframe-array
129 :storage
130 (listoflist->array
131 (transpose-listoflist
132 (rsm.string:file->number-table
133 "/media/disk/Desktop/sandbox/CLS.git/Data/example-numeric.csv")))
134 :doc "This is an interesting dataframe-array"))
135 *my-df-3*
142 (progn
143 ;; Plotting
144 (asdf:oos 'asdf:load-op 'cl-plplot)
146 (plot-ex)
147 (defparameter *gdev* "xcairo")
148 ;; (defparameter *gdev* "xwin")
149 (cl-plplot::plsdev *gdev*)
151 ;; there is currently a loose pointer floating around that causes
152 ;; errors the 3rd time that we create a plot (and crashes SBCL the
153 ;; 4th time). Order independent.
154 (contour-plot-ex)
155 (fn-contour-plot-ex)
156 (shade-plot-ex)
157 (3D-plot-ex)
162 (in-package :ls-user)
165 (progn
166 ;; REVIEW: general Lisp use guidance
168 (fdefinition 'make-matrix)
169 (documentation 'make-matrix 'function)
171 #| Examples from CLHS, a bit of guidance.
173 ;; This function assumes its callers have checked the types of the
174 ;; arguments, and authorizes the compiler to build in that assumption.
175 (defun discriminant (a b c)
176 (declare (number a b c))
177 "Compute the discriminant for a quadratic equation."
178 (- (* b b) (* 4 a c))) => DISCRIMINANT
179 (discriminant 1 2/3 -2) => 76/9
181 ;; This function assumes its callers have not checked the types of the
182 ;; arguments, and performs explicit type checks before making any assumptions.
183 (defun careful-discriminant (a b c)
184 "Compute the discriminant for a quadratic equation."
185 (check-type a number)
186 (check-type b number)
187 (check-type c number)
188 (locally (declare (number a b c))
189 (- (* b b) (* 4 a c)))) => CAREFUL-DISCRIMINANT
190 (careful-discriminant 1 2/3 -2) => 76/9
195 #+nil
196 (progn ;; experiments with GSL and the Lisp interface.
197 (asdf:oos 'asdf:load-op 'gsll)
198 (asdf:oos 'asdf:load-op 'gsll-tests)
200 ;; the following should be equivalent
201 (setf *t1* (LIST 6.18d0 6.647777777777779d0 6.18d0))
202 (setf *t2* (MULTIPLE-VALUE-LIST
203 (LET ((VEC
204 (gsll:make-marray 'DOUBLE-FLOAT
205 :INITIAL-CONTENTS '(-3.21d0 1.0d0 12.8d0)))
206 (WEIGHTS
207 (gsll:MAKE-MARRAY 'DOUBLE-FLOAT
208 :INITIAL-CONTENTS '(3.0d0 1.0d0 2.0d0))))
209 (LET ((MEAN (gsll:MEAN VEC)))
210 (LIST (gsll:ABSOLUTE-DEVIATION VEC)
211 (gsll:WEIGHTED-ABSOLUTE-DEVIATION VEC WEIGHTS)
212 (gsll:ABSOLUTE-DEVIATION VEC MEAN))))))
213 (eql *t1* *t2*)
215 ;; from (gsll:examples 'gsll::numerical-integration) ...
216 (gsll:integration-qng gsll::one-sine 0.0d0 PI)
218 (gsll:defun-single axpb (x) (+ (* 2 x) 3)) ;; a<-2, b<-3
219 (gsll:integration-qng axpb 1d0 2d0)
221 (let ((a 2)
222 (b 3))
223 (defun-single axpb2 (x) (+ (* a x) b)))
224 (gsll:integration-qng axpb2 1d0 2d0)
226 ;; BAD
227 ;; (gsll:integration-qng
228 ;; (let ((a 2)
229 ;; (b 3))
230 ;; (defun-single axpb2 (x) (+ (* a x) b)))
231 ;; 1d0 2d0)
233 ;; right, but weird expansion...
234 (gsll:integration-qng
235 (let ((a 2)
236 (b 3))
237 (defun axpb2 (x) (+ (* a x) b))
238 (gsll:def-single-function axpb2)
239 axpb2)
240 1d0 2d0)
242 ;; Linear least squares
244 (gsll:gsl-lookup "gsl_linalg_LU_decomp") ; => gsll:lu-decomposition
245 (gsll:gsl-lookup "gsl_linalg_LU_solve") ; => gsll:lu-solve
250 #+nil
251 (progn ;; philosophy time
253 (setf my-model (model :name "ex1"
254 :data-slots (list w x y z)
255 :param-slots (list alpha beta gamma)
256 :math-form (regression-model :formula '(= w (+ (* beta x)
257 (* alpha y)
258 (* gamma z)
259 normal-error))
260 :centrality 'median ; 'mean
263 #| or:
264 #R"W ~ x+ y + z "
267 (setf my-dataset (statistical-table :table data-frame-contents
268 :metadata (list (:case-names (list ))
269 (:var-names (list ))
270 (:documentation "string of doc"))))
272 (setf my-analysis (analysis
273 :model my-model
274 :data my-dataset
275 :parameter-map (pairing (model-param-slots my-model)
276 (data-var-names my-dataset))))
278 ;; ontological implications -- the analysis is an abstract class of
279 ;; data, model, and mapping between the model and data. The fit is
280 ;; the instantiation of such. This provides a statistical object
281 ;; computation theory which can be realized as "executable
282 ;; statistics" or "computable statistics".
283 (setf my-analysis (analyze my-fit
284 :estimation-method 'linear-least-squares-regression))
286 ;; one of the tricks here is that one needs to provide the structure
287 ;; from which to consider estimation, and more importantly, the
288 ;; validity of the estimation.
291 (setf linear-least-squares-regression
292 (estimation-method-definition
293 :variable-defintions ((list
294 ;; from MachLearn: supervised,
295 ;; unsupervised
296 :data-response-vars list-drv ; nil if unsup
298 :param-vars list-pv
299 :data-predictor-vars list-dpv
300 ;; nil in this case. these
301 ;; describe "out-of-box" specs
302 :hyper-vars list-hv))
303 :form '(regression-additive-error
304 :central-form (linear-form drv pv dpv)
305 :error-form 'normal-error)
306 :resulting-decision '(point-estimation interval-estimation)
307 :philosophy 'frequentist
308 :documentation "use least squares to fit a linear regression
309 model to data."))
311 (defparameter *statistical-philosophies*
312 '(frequentist bayesian fiducial decision-analysis)
313 "can be combined to build decision-making approaches and
314 characterizations")
316 (defparameter *decisions*
317 '(estimation selection testing)
318 "possible results from a...")
319 ;; is this really true? One can embedded hypothesis testing within
320 ;; estimation, as the hypothesis estimated to select. And
321 ;; categorical/continuous rear their ugly heads, but not really in
322 ;; an essential way.
324 (defparameter *ontology-of-decision-procedures*
325 (list :decisions
326 (list :estimation
327 (list :point
328 (list :maximum-likelihood
329 :minimum-entropy
330 :least-squares
331 :method-of-moments)
332 :interval
333 (list :maximum-likelihood
335 :testing
336 (list :fisherian
337 :neyman-pearson
338 (list :traditional
339 :bioequivalence-inversion)
340 :selection
341 (list :ranking
342 :top-k-of-n-select))
343 :parametric
344 :partially-parametric))
345 "start of ontology"))
348 ;;;; LM
350 (progn
352 (defparameter *y*
353 (make-vector
355 :type :row
356 :initial-contents '((1d0 2d0 3d0 4d0 5d0 6d0 7d0 8d0))))
359 (defparameter *xv+1*
360 (make-matrix
362 :initial-contents '((1d0 1d0)
363 (1d0 3d0)
364 (1d0 2d0)
365 (1d0 4d0)
366 (1d0 3d0)
367 (1d0 5d0)
368 (1d0 4d0)
369 (1d0 6d0))))
372 ;; so something like (NOTE: matrices are transposed to begin with, hence the incongruety)
373 (defparameter *xtx-2* (m* (transpose *xv+1*) *xv+1*))
374 ;; #<LA-SIMPLE-MATRIX-DOUBLE 2 x 2
375 ;; 8.0d0 28.0d0
376 ;; 28.0d0 116.0d0>
378 (defparameter *xty-2* (m* (transpose *xv+1*) (transpose *y*)))
379 ;; #<LA-SIMPLE-VECTOR-DOUBLE (2 x 1)
380 ;; 36.0d0
381 ;; 150.0d0>
383 (defparameter *rcond-2* 0.000001)
384 (defparameter *betahat-2* (gelsy *xtx-2* *xty-2* *rcond-2*))
385 ;; *xtx-2* => "details of complete orthogonal factorization"
386 ;; according to man page:
387 ;; #<LA-SIMPLE-MATRIX-DOUBLE 2 x 2
388 ;; -119.33147112141039d0 -29.095426104883202d0
389 ;; 0.7873402682880205d0 -1.20672274167718d0>
391 ;; *xty-2* => output becomes solution:
392 ;; #<LA-SIMPLE-VECTOR-DOUBLE (2 x 1)
393 ;; -0.16666666666668312d0
394 ;; 1.333333333333337d0>
396 *betahat-2* ; which matches R, see below
398 (documentation 'gelsy 'function)
401 ;; (#<LA-SIMPLE-VECTOR-DOUBLE (2 x 1)
402 ;; -0.16666666666668312 1.333333333333337>
403 ;; 2)
405 ;; ## Test case in R:
406 ;; x <- c( 1.0, 3.0, 2.0, 4.0, 3.0, 5.0, 4.0, 6.0)
407 ;; y <- c( 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0)
408 ;; lm(y~x)
409 ;; ## => Call: lm(formula = y ~ x)
411 ;; Coefficients: (Intercept) x
412 ;; -0.1667 1.3333
414 ;; summary(lm(y~x))
415 ;; ## =>
417 ;; Call:
418 ;; lm(formula = y ~ x)
420 ;; Residuals:
421 ;; Min 1Q Median 3Q Max
422 ;; -1.833e+00 -6.667e-01 -3.886e-16 6.667e-01 1.833e+00
424 ;; Coefficients:
425 ;; Estimate Std. Error t value Pr(>|t|)
426 ;; (Intercept) -0.1667 1.1587 -0.144 0.89034
427 ;; x 1.3333 0.3043 4.382 0.00466 **
428 ;; ---
429 ;; Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
431 ;; Residual standard error: 1.291 on 6 degrees of freedom
432 ;; Multiple R-squared: 0.7619, Adjusted R-squared: 0.7222
433 ;; F-statistic: 19.2 on 1 and 6 DF, p-value: 0.004659
437 ;; which suggests one might do (modulo ensuring correct
438 ;; orientations). When this is finalized, it should migrate to
439 ;; CLS.
443 (defparameter *n* 20) ; # rows = # obsns
444 (defparameter *p* 10) ; # cols = # vars
445 (defparameter *x-temp* (rand *n* *p*))
446 (defparameter *b-temp* (rand *p* 1))
447 (defparameter *y-temp* (m* *x-temp* *b-temp*))
448 ;; so Y=Xb + \eps
449 (defparameter *rcond* (* (coerce (expt 2 -52) 'double-float)
450 (max (nrows *x-temp*) (ncols *y-temp*))))
451 (defparameter *orig-x* (copy *x-temp*))
452 (defparameter *orig-b* (copy *b-temp*))
453 (defparameter *orig-y* (copy *y-temp*))
455 (defparameter *lm-result* (lm *x-temp* *y-temp*))
456 (princ (first *lm-result*))
457 (princ (second *lm-result*))
458 (princ (third *lm-result*))
459 (v= (third *lm-result*)
460 (v- (first (first *lm-result*))
461 (first (second *lm-result*))))
466 ;; Some issues exist in the LAPACK vs. LINPACK variants, hence R
467 ;; uses LINPACK primarily, rather than LAPACK. See comments in R
468 ;; source for issues.
471 ;; Goal is to start from X, Y and then realize that if
472 ;; Y = X \beta, then, i.e. 8x1 = 8xp px1 + 8x1
473 ;; XtX \hat\beta = Xt Y
474 ;; so that we can solve the equation W \beta = Z where W and Z
475 ;; are known, to estimate \beta.
477 ;; the above is known to be numerically instable -- some processing
478 ;; of X is preferred and should be done prior. And most of the
479 ;; transformation-based work does precisely that.
481 ;; recall: Var[Y] = E[(Y - E[Y])(Y-E[Y])t]
482 ;; = E[Y Yt] - 2 \mu \mut + \mu \mut
483 ;; = E[Y Yt] - \mu \mut
485 ;; Var Y = E[Y^2] - \mu^2
488 ;; For initial estimates of covariance of \hat\beta:
490 ;; \hat\beta = (Xt X)^-1 Xt Y
491 ;; with E[ \hat\beta ]
492 ;; = E[ (Xt X)^-1 Xt Y ]
493 ;; = E[(Xt X)^-1 Xt (X\beta)]
494 ;; = \beta
496 ;; So Var[\hat\beta] = ...
497 ;; (Xt X)
498 ;; and this gives SE(\beta_i) = (* (sqrt (mref Var i i)) adjustment)
501 ;; from docs:
503 (setf *temp-result*
504 (let ((*default-implementation* :foreign-array))
505 (let* ((m 10)
506 (n 10)
507 (a (rand m n))
508 (x (rand n 1))
509 (b (m* a x))
510 (rcond (* (coerce (expt 2 -52) 'double-float)
511 (max (nrows a) (ncols a))))
512 (orig-a (copy a))
513 (orig-b (copy b))
514 (orig-x (copy x)))
515 (list x (gelsy a b rcond))
516 ;; no applicable conversion?
517 ;; (m- (#<FA-SIMPLE-VECTOR-DOUBLE (10 x 1))
518 ;; (#<FA-SIMPLE-VECTOR-DOUBLE (10 x 1)) )
519 (v- x (first (gelsy a b rcond))))))
522 (princ *temp-result*)
524 (setf *temp-result*
525 (let ((*default-implementation* :lisp-array))
526 (let* ((m 10)
527 (n 10)
528 (a (rand m n))
529 (x (rand n 1))
530 (b (m* a x))
531 (rcond (* (coerce (expt 2 -52) 'double-float)
532 (max (nrows a) (ncols a))))
533 (orig-a (copy a))
534 (orig-b (copy b))
535 (orig-x (copy x)))
536 (list x (gelsy a b rcond))
537 (m- x (first (gelsy a b rcond)))
539 (princ *temp-result*)
542 (defparameter *xv*
543 (make-vector
545 :type :row ;; default, not usually needed!
546 :initial-contents '((1d0 3d0 2d0 4d0 3d0 5d0 4d0 6d0))))
548 (defparameter *y*
549 (make-vector
551 :type :row
552 :initial-contents '((1d0 2d0 3d0 4d0 5d0 6d0 7d0 8d0))))
554 ;; so something like (NOTE: matrices are transposed to begin with, hence the incongruety)
555 (defparameter *xtx-1* (m* *xv* (transpose *xv*)))
556 (defparameter *xty-1* (m* *xv* (transpose *y*)))
557 (defparameter *rcond-in* (* (coerce (expt 2 -52) 'double-float)
558 (max (nrows *xtx-1*)
559 (ncols *xty-1*))))
561 (defparameter *betahat* (gelsy *xtx-1* *xty-1* *rcond-in*))
563 ;; (#<LA-SIMPLE-VECTOR-DOUBLE (1 x 1)
564 ;; 1.293103448275862>
565 ;; 1)
567 ;; ## Test case in R:
568 ;; x <- c( 1.0, 3.0, 2.0, 4.0, 3.0, 5.0, 4.0, 6.0)
569 ;; y <- c( 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0)
570 ;; lm(y~x-1)
571 ;; ## =>
572 ;; Call:
573 ;; lm(formula = y ~ x - 1)
575 ;; Coefficients:
576 ;; x
577 ;; 1.293
579 (first *betahat*))
584 (type-of #2A((1 2 3 4 5)
585 (10 20 30 40 50)))
587 (type-of (rand 10 20))
589 (typep #2A((1 2 3 4 5)
590 (10 20 30 40 50))
591 'matrix-like)
593 (typep (rand 10 20) 'matrix-like)
595 (typep #2A((1 2 3 4 5)
596 (10 20 30 40 50))
597 'array)
599 (typep (rand 10 20) 'array)