supporting functions for creating and accessing elements of the dataframes (store...
[CommonLispStat.git] / TODO.lisp
blob4857e3f6acf201e25adf805df8586c41772714ca
1 ;;; -*- mode: lisp -*-
3 ;;; Time-stamp: <2009-03-25 15:50:59 tony>
4 ;;; Creation: <2008-09-08 08:06:30 tony>
5 ;;; File: TODO.lisp
6 ;;; Author: AJ Rossini <blindglobe@gmail.com>
7 ;;; Copyright: (c) 2007-2008, AJ Rossini <blindglobe@gmail.com>. BSD.
8 ;;; Purpose: Stuff that needs to be made working sits inside the progns...
10 ;;; What is this talk of 'release'? Klingons do not make software
11 ;;; 'releases'. Our software 'escapes', leaving a bloody trail of
12 ;;; designers and quality assurance people in its wake.
14 ;;; This file contains the current challenges to solve, including a
15 ;;; description of the setup and the work to solve....
17 ;;; SET UP
19 (in-package :cl-user)
20 ;;(asdf:oos 'asdf:compile-op 'lispstat)
21 ;;(asdf:oos 'asdf:load-op 'lispstat)
23 (in-package :lisp-stat-unittests)
25 ;; tests = 87, failures = 7, errors = 35
27 (describe (run-tests :suite 'lisp-stat-ut))
28 (run-tests :suite 'lisp-stat-ut)
31 ;; FIXME: Example: currently not relevant, yet
32 (describe
33 (lift::run-test
34 :test-case 'lisp-stat-unittests::create-proto
35 :suite 'lisp-stat-unittests::lisp-stat-ut-proto))
38 (describe 'lisp-stat-ut)
39 (in-package :ls-user)
42 (progn ;; dataframe
44 (describe (lift::run-tests :suite 'lisp-stat-ut-dataclos))
45 (lift::run-tests :suite 'lisp-stat-ut-dataclos)
47 (describe
48 (lift::run-test
49 :test-case 'lisp-stat-unittests::create-proto
50 :suite 'lisp-stat-unittests::lisp-stat-ut-proto))
52 (defparameter *my-df-1*
53 (make-instance 'dataframe-array
54 :storage #2A((1 2 3 4 5)
55 (10 20 30 40 50))
56 :doc "This is an interesting dataframe-array"
57 :case-labels (list "x" "y")
58 :var-labels (list "a" "b" "c" "d" "e")))
61 (defparameter *my-df-2*
62 (make-instance 'dataframe-array
63 :storage
64 (make-array-from-listoflists
65 (cybertiggyr-dsv::load-escaped
66 "/media/disk/Desktop/sandbox/CLS.git/Data/example-mixed.csv"))
67 :doc "This is an interesting dataframe-array"))
68 #| :case-labels (list "x" "y")
69 :var-labels (list "a" "b" "c" "d" "e")
70 |#
75 (progn ;; Data setup
77 (describe 'make-matrix)
;; Two-column matrix with (length iron) rows: each row pairs the
;; corresponding IRON and ALUMINUM values, coerced to double-float.
(defparameter *indep-vars-2-matrix*
  (flet ((as-double-row (fe al)
           (list (coerce fe 'double-float)
                 (coerce al 'double-float))))
    (make-matrix (length iron) 2
                 :initial-contents (mapcar #'as-double-row iron aluminum))))
;; Row vector built from ABSORBTION, every element coerced to
;; double-float; the contents are passed as a one-row list of lists.
(defparameter *dep-var*
  (make-vector (length absorbtion)
               :type :row
               :initial-contents
               (list (loop for value in absorbtion
                           collect (coerce value 'double-float)))))
96 (defparameter *dep-var-int*
97 (make-vector (length absorbtion)
98 :type :row
99 :element-type 'integer
100 :initial-contents (list absorbtion)))
103 (defparameter *xv+1a*
104 (make-matrix
106 :initial-contents #2A((1d0 1d0)
107 (1d0 3d0)
108 (1d0 2d0)
109 (1d0 4d0)
110 (1d0 3d0)
111 (1d0 5d0)
112 (1d0 4d0)
113 (1d0 6d0))))
115 (defparameter *xv+1b*
116 (bind2
117 (ones 8 1)
118 (make-matrix
120 :initial-contents '((1d0)
121 (3d0)
122 (2d0)
123 (4d0)
124 (3d0)
125 (5d0)
126 (4d0)
127 (6d0)))
128 :by :column))
130 (m= *xv+1a* *xv+1b*) ; => T
132 (princ "Data Set up"))
137 (progn
138 ;; REVIEW: general Lisp use guidance
140 (fdefinition 'make-matrix)
141 (documentation 'make-matrix 'function)
143 #| Examples from CLHS, a bit of guidance.
145 ;; This function assumes its callers have checked the types of the
146 ;; arguments, and authorizes the compiler to build in that assumption.
147 (defun discriminant (a b c)
148 (declare (number a b c))
149 "Compute the discriminant for a quadratic equation."
150 (- (* b b) (* 4 a c))) => DISCRIMINANT
151 (discriminant 1 2/3 -2) => 76/9
153 ;; This function assumes its callers have not checked the types of the
154 ;; arguments, and performs explicit type checks before making any assumptions.
155 (defun careful-discriminant (a b c)
156 "Compute the discriminant for a quadratic equation."
157 (check-type a number)
158 (check-type b number)
159 (check-type c number)
160 (locally (declare (number a b c))
161 (- (* b b) (* 4 a c)))) => CAREFUL-DISCRIMINANT
162 (careful-discriminant 1 2/3 -2) => 76/9
167 #+nil
168 (progn ;; FIXME: Regression modeling
170 ;; data setup in previous FIXME
171 (defparameter *m* nil
172 "holding variable.")
173 ;; need to make vectors and matrices from the lists...
175 ;; BROKEN
176 (def *m* (regression-model (list->vector-like iron)
177 (list->vector-like absorbtion)))
179 (def m (regression-model (list->vector-like iron)
180 (list->vector-like absorbtion) :print nil))
181 ;;Good
182 (send m :print)
183 (send m :own-slots)
184 (send m :own-methods)
185 ;; (lsos::ls-objects-methods m) ; bogus?
186 (send m :show)
188 (def m (regression-model (list->vector-like iron)
189 (list->vector-like absorbtion)))
191 (def m (regression-model (listoflists->matrix-like (list iron aluminum))
192 (list->vector-like absorbtion) :print nil))
195 (send m :compute)
196 (send m :sweep-matrix)
197 (format t "~%~A~%" (send m :sweep-matrix))
199 ;; need to get multiple-linear regression working (simple linear regr
200 ;; works)... to do this, we need to redo the whole numeric structure,
201 ;; I'm keeping these in as example of brokenness...
203 (send m :basis) ;; this should be positive?
204 (send m :coef-estimates) )
206 #+nil
207 (progn ;; FIXME: Need to clean up data examples, licenses, attributions, etc.
208 ;; The following breaks because we should use a package to hold
209 ;; configuration details, and this would be the only package outside
210 ;; of packages.lisp, as it holds the overall defsystem structure.
211 (load-data "iris.lsp") ;; (the above partially fixed).
212 (variables)
213 diabetes )
218 (progn ;; FIXME: read data from CSV file. To do.
221 ;; challenge is to ensure that we get mixed arrays when we want them,
222 ;; and single-type (simple) arrays in other cases.
225 (defparameter *csv-num*
226 (cybertiggyr-dsv::load-escaped
227 #p"/media/disk/Desktop/sandbox/CLS.git/Data/example-numeric.csv"
228 :field-separator #\,
229 :trace T))
231 (nth 0 (nth 0 *csv-num*))
233 (defparameter *csv-num*
234 (cybertiggyr-dsv::load-escaped
235 #p"/media/disk/Desktop/sandbox/CLS.git/Data/example-numeric2.dsv"
236 :field-separator #\:))
238 (nth 0 (nth 0 *csv-num*))
241 ;; The handling of these types should be comparable to what we do for
242 ;; matrices, but without the numerical processing. i.e. mref, bind2,
243 ;; make-dataframe, and the class structure should be similar.
245 ;; With numerical data, there should be a straightforward mapping from
246 ;; the data.frame to a matrix. With categorical data (including
247 ;; dense categories such as doc-strings, as well as sparse categories
248 ;; such as binary data), we need to include metadata about ordering,
249 ;; coding, and such. So the structures should probably consider
251 ;; Using the CSV file:
(defun parse-number (s)
  "Read a number from the string S using the Lisp reader.

Returns the number when the first object read from S is numeric
(whitespace around it is skipped by the reader).  Returns NIL when the
first object is not a number, and also when S cannot be read at all:
empty or blank strings, malformed tokens, or a non-string argument.

*READ-EVAL* is bound to NIL so `#.' forms in file contents are never
evaluated.  Note that reading a non-numeric token such as \"a\" still
interns a symbol in the current package."
  (let ((*read-eval* nil))
    ;; READ-FROM-STRING signals END-OF-FILE on empty/blank input and
    ;; READER-ERROR on malformed input (including `#.' while *READ-EVAL*
    ;; is NIL).  When this function is used as a per-field filter, one
    ;; bad field must yield NIL instead of aborting the whole load.
    (let ((n (handler-case (read-from-string s)
               (error () nil))))
      (when (numberp n)
        n))))
258 (parse-number "34")
259 (parse-number "34 ")
260 (parse-number " 34")
261 (parse-number " 34 ")
263 (+ (parse-number "3.4") 3)
264 (parse-number "3.4 ")
265 (parse-number " 3.4")
266 (+ (parse-number " 3.4 ") 3)
268 (parse-number "a")
270 ;; (coerce "2.3" 'number) => ERROR
271 ;; (coerce "2" 'float) => ERROR
273 (defparameter *csv-num*
274 (cybertiggyr-dsv::load-escaped
275 #p"/media/disk/Desktop/sandbox/CLS.git/Data/example-numeric.csv"
276 :field-separator #\,
277 :filter #'parse-number
278 :trace T))
280 (nth 0 (nth 0 *csv-num*))
282 (defparameter *csv-num*
283 (cybertiggyr-dsv::load-escaped
284 #p"/media/disk/Desktop/sandbox/CLS.git/Data/example-numeric2.dsv"
285 :field-separator #\:
286 :filter #'parse-number))
288 (nth 0 (nth 0 *csv-num*))
290 ;; now we've got the DSV code in the codebase, auto-loaded I hope:
291 cybertiggyr-dsv:*field-separator*
292 (defparameter *example-numeric.csv*
293 (cybertiggyr-dsv:load-escaped "Data/example-numeric.csv"
294 :field-separator #\,))
295 *example-numeric.csv*
297 ;; the following fails because we've got a bit of string conversion
298 ;; to do. 2 thoughts: #1 modify dsv package, but mucking with
299 ;; encapsulation. #2 add a coercion tool (better, but potentially
300 ;; inefficient).
301 #+nil(coerce (nth 3 (nth 3 *example-numeric.csv*)) 'double-float)
303 ;; cases, simple to not so
304 (defparameter *test-string1* "1.2")
305 (defparameter *test-string2* " 1.2")
306 (defparameter *test-string3* " 1.2 ")
310 #+nil
311 (progn ;; experiments with GSL and the Lisp interface.
312 (asdf:oos 'asdf:load-op 'gsll)
313 (asdf:oos 'asdf:load-op 'gsll-tests)
315 ;; the following should be equivalent
;; *T1* holds the expected values.  *T2* holds the computed ones, but
;; wrapped one level deeper: the LET returns a single value (a list) and
;; MULTIPLE-VALUE-LIST collects that value into a fresh one-element list.
;; DEFPARAMETER is used (as elsewhere in this file) so the globals are
;; declared rather than assigned while undefined.
(defparameter *t1* (list 6.18d0 6.647777777777779d0 6.18d0))
(defparameter *t2*
  (multiple-value-list
   (let ((vec
           (gsll:make-marray 'double-float
                             :initial-contents '(-3.21d0 1.0d0 12.8d0)))
         (weights
           (gsll:make-marray 'double-float
                             :initial-contents '(3.0d0 1.0d0 2.0d0))))
     (let ((mean (gsll:mean vec)))
       (list (gsll:absolute-deviation vec)
             (gsll:weighted-absolute-deviation vec weights)
             (gsll:absolute-deviation vec mean))))))
;; EQL on two distinct lists is always NIL, so it cannot test equivalence.
;; Compare structurally, against the list inside *T2*.  This is an exact
;; float comparison and may still report NIL if results differ by rounding.
(equalp *t1* (first *t2*))
330 ;; from (gsll:examples 'gsll::numerical-integration) ...
331 (gsll:integration-qng gsll::one-sine 0.0d0 PI)
333 (gsll:defun-single axpb (x) (+ (* 2 x) 3)) ;; a<-2, b<-3
334 (gsll:integration-qng axpb 1d0 2d0)
336 (let ((a 2)
337 (b 3))
338 (defun-single axpb2 (x) (+ (* a x) b)))
339 (gsll:integration-qng axpb2 1d0 2d0)
341 ;; BAD
342 ;; (gsll:integration-qng
343 ;; (let ((a 2)
344 ;; (b 3))
345 ;; (defun-single axpb2 (x) (+ (* a x) b)))
346 ;; 1d0 2d0)
348 ;; right, but weird expansion...
349 (gsll:integration-qng
350 (let ((a 2)
351 (b 3))
352 (defun axpb2 (x) (+ (* a x) b))
353 (gsll:def-single-function axpb2)
354 axpb2)
355 1d0 2d0)
357 ;; Linear least squares
359 (gsll:gsl-lookup "gsl_linalg_LU_decomp") ; => gsll:lu-decomposition
360 (gsll:gsl-lookup "gsl_linalg_LU_solve") ; => gsll:lu-solve
365 #+nil
366 (progn ;; philosophy time
368 (setf my-model (model :name "ex1"
369 :data-slots (list w x y z)
370 :param-slots (list alpha beta gamma)
371 :math-form (regression-model :formula '(= w (+ (* beta x)
372 (* alpha y)
373 (* gamma z)
374 normal-error))
375 :centrality 'median ; 'mean
378 #| or:
379 #R"W ~ x+ y + z "
382 (setf my-dataset (statistical-table :table data-frame-contents
383 :metadata (list (:case-names (list ))
384 (:var-names (list ))
385 (:documentation "string of doc"))))
387 (setf my-analysis (analysis
388 :model my-model
389 :data my-dataset
390 :parameter-map (pairing (model-param-slots my-model)
391 (data-var-names my-dataset))))
393 ;; ontological implications -- the analysis is an abstract class of
394 ;; data, model, and mapping between the model and data. The fit is
395 ;; the instantiation of such. This provides a statistical object
396 ;; computation theory which can be realized as "executable
397 ;; statistics" or "computable statistics".
398 (setf my-analysis (analyze my-fit
399 :estimation-method 'linear-least-squares-regression))
401 ;; one of the tricks here is that one needs to provide the structure
402 ;; from which to consider estimation, and more importantly, the
403 ;; validity of the estimation.
406 (setf linear-least-squares-regression
407 (estimation-method-definition
408 :variable-defintions ((list
409 ;; from MachLearn: supervised,
410 ;; unsupervised
411 :data-response-vars list-drv ; nil if unsup
413 :param-vars list-pv
414 :data-predictor-vars list-dpv
415 ;; nil in this case. these
416 ;; describe "out-of-box" specs
417 :hyper-vars list-hv))
418 :form '(regression-additive-error
419 :central-form (linear-form drv pv dpv)
420 :error-form 'normal-error)
421 :resulting-decision '(point-estimation interval-estimation)
422 :philosophy 'frequentist
423 :documentation "use least squares to fit a linear regression
424 model to data."))
426 (defparameter *statistical-philosophies*
427 '(frequentist bayesian fiducial decision-analysis)
428 "can be combined to build decision-making approaches and
429 characterizations")
431 (defparameter *decisions*
432 '(estimation selection testing)
433 "possible results from a...")
434 ;; is this really true? One can embed hypothesis testing within
435 ;; estimation, as the hypothesis estimated to select. And
436 ;; categorical/continuous rear their ugly heads, but not really in
437 ;; an essential way.
439 (defparameter *ontology-of-decision-procedures*
440 (list :decisions
441 (list :estimation
442 (list :point
443 (list :maximum-likelihood
444 :minimum-entropy
445 :least-squares
446 :method-of-moments)
447 :interval
448 (list :maximum-likelihood
450 :testing
451 (list :fisherian
452 :neyman-pearson
453 (list :traditional
454 :bioequivalence-inversion)
455 :selection
456 (list :ranking
457 :top-k-of-n-select))
458 :parametric
459 :partially-parametric))
460 "start of ontology"))
463 ;;;; LM
465 (progn
467 (defparameter *y*
468 (make-vector
470 :type :row
471 :initial-contents '((1d0 2d0 3d0 4d0 5d0 6d0 7d0 8d0))))
474 (defparameter *xv+1*
475 (make-matrix
477 :initial-contents '((1d0 1d0)
478 (1d0 3d0)
479 (1d0 2d0)
480 (1d0 4d0)
481 (1d0 3d0)
482 (1d0 5d0)
483 (1d0 4d0)
484 (1d0 6d0))))
487 ;; so something like (NOTE: matrices are transposed to begin with, hence the incongruity)
488 (defparameter *xtx-2* (m* (transpose *xv+1*) *xv+1*))
489 ;; #<LA-SIMPLE-MATRIX-DOUBLE 2 x 2
490 ;; 8.0d0 28.0d0
491 ;; 28.0d0 116.0d0>
493 (defparameter *xty-2* (m* (transpose *xv+1*) (transpose *y*)))
494 ;; #<LA-SIMPLE-VECTOR-DOUBLE (2 x 1)
495 ;; 36.0d0
496 ;; 150.0d0>
498 (defparameter *rcond-2* 0.000001)
499 (defparameter *betahat-2* (gelsy *xtx-2* *xty-2* *rcond-2*))
500 ;; *xtx-2* => "details of complete orthogonal factorization"
501 ;; according to man page:
502 ;; #<LA-SIMPLE-MATRIX-DOUBLE 2 x 2
503 ;; -119.33147112141039d0 -29.095426104883202d0
504 ;; 0.7873402682880205d0 -1.20672274167718d0>
506 ;; *xty-2* => output becomes solution:
507 ;; #<LA-SIMPLE-VECTOR-DOUBLE (2 x 1)
508 ;; -0.16666666666668312d0
509 ;; 1.333333333333337d0>
511 *betahat-2* ; which matches R, see below
513 (documentation 'gelsy 'function)
516 ;; (#<LA-SIMPLE-VECTOR-DOUBLE (2 x 1)
517 ;; -0.16666666666668312 1.333333333333337>
518 ;; 2)
520 ;; ## Test case in R:
521 ;; x <- c( 1.0, 3.0, 2.0, 4.0, 3.0, 5.0, 4.0, 6.0)
522 ;; y <- c( 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0)
523 ;; lm(y~x)
524 ;; ## => Call: lm(formula = y ~ x)
526 ;; Coefficients: (Intercept) x
527 ;; -0.1667 1.3333
529 ;; summary(lm(y~x))
530 ;; ## =>
532 ;; Call:
533 ;; lm(formula = y ~ x)
535 ;; Residuals:
536 ;; Min 1Q Median 3Q Max
537 ;; -1.833e+00 -6.667e-01 -3.886e-16 6.667e-01 1.833e+00
539 ;; Coefficients:
540 ;; Estimate Std. Error t value Pr(>|t|)
541 ;; (Intercept) -0.1667 1.1587 -0.144 0.89034
542 ;; x 1.3333 0.3043 4.382 0.00466 **
543 ;; ---
544 ;; Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
546 ;; Residual standard error: 1.291 on 6 degrees of freedom
547 ;; Multiple R-squared: 0.7619, Adjusted R-squared: 0.7222
548 ;; F-statistic: 19.2 on 1 and 6 DF, p-value: 0.004659
552 ;; which suggests one might do (modulo ensuring correct
553 ;; orientations). When this is finalized, it should migrate to
554 ;; CLS.
558 (defparameter *n* 20) ; # rows = # obsns
559 (defparameter *p* 10) ; # cols = # vars
560 (defparameter *x-temp* (rand *n* *p*))
561 (defparameter *b-temp* (rand *p* 1))
562 (defparameter *y-temp* (m* *x-temp* *b-temp*))
563 ;; so Y=Xb + \eps
;; Reciprocal-condition threshold: 2^-52 as a double-float, scaled by the
;; larger dimension of the matrix being factored (*X-TEMP*).  Both
;; dimensions are taken from *X-TEMP*, matching the docs example later in
;; this file, which uses (max (nrows a) (ncols a)).
(defparameter *rcond* (* (coerce (expt 2 -52) 'double-float)
                         (max (nrows *x-temp*) (ncols *x-temp*))))
566 (defparameter *orig-x* (copy *x-temp*))
567 (defparameter *orig-b* (copy *b-temp*))
568 (defparameter *orig-y* (copy *y-temp*))
570 (defparameter *lm-result* (lm *x-temp* *y-temp*))
571 (princ (first *lm-result*))
572 (princ (second *lm-result*))
573 (princ (third *lm-result*))
574 (v= (third *lm-result*)
575 (v- (first (first *lm-result*))
576 (first (second *lm-result*))))
581 ;; Some issues exist in the LAPACK vs. LINPACK variants, hence R
582 ;; uses LINPACK primarily, rather than LAPACK. See comments in R
583 ;; source for issues.
586 ;; Goal is to start from X, Y and then realize that if
587 ;; Y = X \beta + \eps, i.e. 8x1 = 8xp px1 + 8x1, then
588 ;; XtX \hat\beta = Xt Y
589 ;; so that we can solve the equation W \beta = Z where W and Z
590 ;; are known, to estimate \beta.
592 ;; the above is known to be numerically unstable -- some processing
593 ;; of X is preferred and should be done prior. And most of the
594 ;; transformation-based work does precisely that.
596 ;; recall: Var[Y] = E[(Y - E[Y])(Y-E[Y])t]
597 ;; = E[Y Yt] - 2 \mu \mut + \mu \mut
598 ;; = E[Y Yt] - \mu \mut
600 ;; Var Y = E[Y^2] - \mu^2
603 ;; For initial estimates of covariance of \hat\beta:
605 ;; \hat\beta = (Xt X)^-1 Xt Y
606 ;; with E[ \hat\beta ]
607 ;; = E[ (Xt X)^-1 Xt Y ]
608 ;; = E[(Xt X)^-1 Xt (X\beta)]
609 ;; = \beta
611 ;; So Var[\hat\beta] = (Xt X)^-1 Xt Var[Y] X (Xt X)^-1
612 ;; = \sigma^2 (Xt X)^-1 when Var[Y] = \sigma^2 I
613 ;; and this gives SE(\beta_i) = (* (sqrt (mref Var i i)) adjustment)
616 ;; from docs:
;; Build a random 10x10 system A x = b under the :foreign-array
;; implementation, solve it with GELSY, and keep the difference between
;; the known X and the estimate.
(defparameter *temp-result*
  (let ((*default-implementation* :foreign-array))
    (let* ((m 10)
           (n 10)
           (a (rand m n))
           (x (rand n 1))
           (b (m* a x))
           (rcond (* (coerce (expt 2 -52) 'double-float)
                     (max (nrows a) (ncols a))))
           ;; Call GELSY exactly once.  Notes elsewhere in this file
           ;; record that after a GELSY call the first argument holds
           ;; factorization details and the second holds the solution,
           ;; so a second call on the same A and B would not be solving
           ;; the original system.  The result is (solution rank),
           ;; going by the sample output recorded in this file.
           (fit (gelsy a b rcond)))
      ;; no applicable conversion?
      ;; (m- (#<FA-SIMPLE-VECTOR-DOUBLE (10 x 1))
      ;;     (#<FA-SIMPLE-VECTOR-DOUBLE (10 x 1)) )
      (v- x (first fit)))))
637 (princ *temp-result*)
639 (setf *temp-result*
640 (let ((*default-implementation* :lisp-array))
641 (let* ((m 10)
642 (n 10)
643 (a (rand m n))
644 (x (rand n 1))
645 (b (m* a x))
646 (rcond (* (coerce (expt 2 -52) 'double-float)
647 (max (nrows a) (ncols a))))
648 (orig-a (copy a))
649 (orig-b (copy b))
650 (orig-x (copy x)))
651 (list x (gelsy a b rcond))
652 (m- x (first (gelsy a b rcond)))
654 (princ *temp-result*)
657 (defparameter *xv*
658 (make-vector
660 :type :row ;; default, not usually needed!
661 :initial-contents '((1d0 3d0 2d0 4d0 3d0 5d0 4d0 6d0))))
663 (defparameter *y*
664 (make-vector
666 :type :row
667 :initial-contents '((1d0 2d0 3d0 4d0 5d0 6d0 7d0 8d0))))
669 ;; so something like (NOTE: matrices are transposed to begin with, hence the incongruity)
670 (defparameter *xtx-1* (m* *xv* (transpose *xv*)))
671 (defparameter *xty-1* (m* *xv* (transpose *y*)))
;; Reciprocal-condition threshold for the GELSY call on *XTX-1*: 2^-52 as
;; a double-float, scaled by the larger dimension of *XTX-1*.  Both
;; dimensions are taken from the matrix being factored, matching the docs
;; example earlier in this file, which uses (max (nrows a) (ncols a)).
(defparameter *rcond-in* (* (coerce (expt 2 -52) 'double-float)
                            (max (nrows *xtx-1*)
                                 (ncols *xtx-1*))))
676 (defparameter *betahat* (gelsy *xtx-1* *xty-1* *rcond-in*))
678 ;; (#<LA-SIMPLE-VECTOR-DOUBLE (1 x 1)
679 ;; 1.293103448275862>
680 ;; 1)
682 ;; ## Test case in R:
683 ;; x <- c( 1.0, 3.0, 2.0, 4.0, 3.0, 5.0, 4.0, 6.0)
684 ;; y <- c( 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0)
685 ;; lm(y~x-1)
686 ;; ## =>
687 ;; Call:
688 ;; lm(formula = y ~ x - 1)
690 ;; Coefficients:
691 ;; x
692 ;; 1.293
694 (first *betahat*))
698 #+nil
699 (progn
701 (asdf:oos 'asdf:load-op 'cl-plplot)
703 (plot-ex))