bootstrap API cleaned, still no real code.
[CommonLispStat.git] / TODO.lisp
blob1b6a0d4a417389c80306a73a058253b13f2469e7
1 ;;; -*- mode: lisp -*-
3 ;;; Time-stamp: <2009-02-13 16:07:43 tony>
4 ;;; Creation: <2008-09-08 08:06:30 tony>
5 ;;; File: TODO.lisp
6 ;;; Author: AJ Rossini <blindglobe@gmail.com>
7 ;;; Copyright: (c) 2007-2008, AJ Rossini <blindglobe@gmail.com>. BSD.
8 ;;; Purpose: Stuff that needs to be made working sits inside the progns...
10 ;;; What is this talk of 'release'? Klingons do not make software
11 ;;; 'releases'. Our software 'escapes', leaving a bloody trail of
12 ;;; designers and quality assurance people in its wake.
14 ;;; This file contains the current challenges to solve, including a
15 ;;; description of the setup and the work to solve....
17 ;;; SET UP
19 (in-package :cl-user)
20 ;;(asdf:oos 'asdf:compile-op 'lispstat)
21 ;;(asdf:oos 'asdf:load-op 'lispstat)
24 (in-package :lisp-stat-unittests)
26 ;; tests = 54, failures = 7, errors = 3
28 (describe (run-tests :suite 'lisp-stat-ut))
29 (run-tests :suite 'lisp-stat-ut)
32 ;; FIXME: Example: currently not relevant, yet
33 (describe
34 (lift::run-test
35 :test-case 'lisp-stat-unittests::create-proto
36 :suite 'lisp-stat-unittests::lisp-stat-ut-proto))
39 (in-package :ls-user)
42 (progn ;; Data setup
44 ;; Making data-frames (i.e. cases (rows) by variables (columns))
45 ;; takes a bit of getting used to. For this, it is important to
46 ;; realize that we can do the following:
47 ;; #1 - consider the possibility of having a row, and transposing
48 ;; it, so the list-of-lists is: ((1 2 3 4 5)) (1 row, 5 columns)
49 ;; #2 - naturally list-of-lists: ((1)(2)(3)(4)(5)) (5 rows, 1 column)
50 ;; see src/data/listoflist.lisp for code to process this particular
51 ;; data structure.
52 (defparameter *indep-vars-1-matrix*
53 (transpose (make-matrix 1 (length iron)
54 :initial-contents
55 (list (mapcar #'(lambda (x) (coerce x 'double-float))
56 iron))))
57 "creating iron into double float, straightforward")
59 (documentation '*indep-vars-1-matrix* 'variable)
60 ;; *indep-vars-1-matrix*
62 ;; or directly:
63 (defparameter *indep-vars-1a-matrix*
64 (make-matrix (length iron) 1
65 :initial-contents
66 (mapcar #'(lambda (x) (list (coerce x 'double-float)))
67 iron)))
68 ;; *indep-vars-1a-matrix*
70 ;; and mathematically, they seem equal:
71 (m= *indep-vars-1-matrix* *indep-vars-1a-matrix*) ; => T
72 ;; but of course not completely...
73 (eql *indep-vars-1-matrix* *indep-vars-1a-matrix*) ; => NIL
74 (eq *indep-vars-1-matrix* *indep-vars-1a-matrix*) ; => NIL
76 ;; and verify...
77 (print *indep-vars-1-matrix*)
78 (print *indep-vars-1a-matrix*)
80 (documentation 'lisp-matrix:bind2 'function) ; by which we mean:
81 (documentation 'bind2 'function)
82 (bind2 *indep-vars-1-matrix* *indep-vars-1a-matrix* :by :column) ; 2 col
83 (bind2 *indep-vars-1-matrix* *indep-vars-1a-matrix* :by :row) ; 1 long col
85 ;; the weird way
86 (defparameter *indep-vars-2-matrix*
87 (transpose (make-matrix 2 (length iron)
88 :initial-contents
89 (list
90 (mapcar #'(lambda (x) (coerce x 'double-float))
91 iron)
92 (mapcar #'(lambda (x) (coerce x 'double-float))
93 aluminum)))))
94 ;; *indep-vars-2-matrix*
96 ;; the "right"? way
97 (defparameter *indep-vars-2-matrix*
98 (make-matrix (length iron) 2
99 :initial-contents
100 (mapcar #'(lambda (x y)
101 (list (coerce x 'double-float)
102 (coerce y 'double-float)))
103 iron aluminum)))
104 ;; *indep-vars-2-matrix*
107 ;; The below FAILS due to coercion issues; it just isn't lispy, it's R'y.
109 (defparameter *dep-var* (make-vector (length absorbtion)
110 :initial-contents (list absorbtion)))
112 ;; BUT below, this should be the right type.
113 (defparameter *dep-var*
114 (make-vector (length absorbtion)
115 :type :row
116 :initial-contents
117 (list
118 (mapcar #'(lambda (x) (coerce x 'double-float))
119 absorbtion))))
120 ;; *dep-var*
123 (defparameter *dep-var-int*
124 (make-vector (length absorbtion)
125 :type :row
126 :element-type 'integer
127 :initial-contents (list absorbtion)))
129 (typep *dep-var* 'matrix-like) ; => T
130 (typep *dep-var* 'vector-like) ; => T
132 (typep *indep-vars-1-matrix* 'matrix-like) ; => T
133 (typep *indep-vars-1-matrix* 'vector-like) ; => T
134 (typep *indep-vars-2-matrix* 'matrix-like) ; => T
135 (typep *indep-vars-2-matrix* 'vector-like) ; => F
137 iron
138 ;; following fails, need to ensure that we work on list elts, not just
139 ;; elts within a list:
140 ;; (coerce iron 'real)
142 ;; the following is a general list-conversion coercion approach -- is
143 ;; there a more efficient way?
144 (coerce 1 'real)
145 (mapcar #'(lambda (x) (coerce x 'double-float)) iron)
147 (princ "Data Set up"))
151 #+nil
152 (progn
153 ;; REVIEW: general Lisp use guidance
155 (fdefinition 'make-matrix)
156 (documentation 'make-matrix 'function)
158 #| Examples from CLHS, a bit of guidance.
160 ;; This function assumes its callers have checked the types of the
161 ;; arguments, and authorizes the compiler to build in that assumption.
162 (defun discriminant (a b c)
163 (declare (number a b c))
164 "Compute the discriminant for a quadratic equation."
165 (- (* b b) (* 4 a c))) => DISCRIMINANT
166 (discriminant 1 2/3 -2) => 76/9
168 ;; This function assumes its callers have not checked the types of the
169 ;; arguments, and performs explicit type checks before making any assumptions.
170 (defun careful-discriminant (a b c)
171 "Compute the discriminant for a quadratic equation."
172 (check-type a number)
173 (check-type b number)
174 (check-type c number)
175 (locally (declare (number a b c))
176 (- (* b b) (* 4 a c)))) => CAREFUL-DISCRIMINANT
177 (careful-discriminant 1 2/3 -2) => 76/9
182 #+nil
183 (progn ;; FIXME: Regression modeling
185 ;; data setup in previous FIXME
186 (defparameter m nil
187 "holding variable.")
188 ;; need to make vectors and matrices from the lists...
190 (def m (regression-model (list->vector-like iron)
191 (list->vector-like absorbtion)))
193 (def m (regression-model (list->vector-like iron)
194 (list->vector-like absorbtion) :print nil))
195 ;;Good
196 (send m :print)
197 (send m :own-slots)
198 (send m :own-methods)
199 ;; (lsos::ls-objects-methods m) ; bogus?
200 (send m :show)
202 (def m (regression-model (list->vector-like iron)
203 (list->vector-like absorbtion)))
205 (def m (regression-model (listoflists->matrix-like (list iron aluminum))
206 (list->vector-like absorbtion) :print nil))
209 (send m :compute)
210 (send m :sweep-matrix)
211 (format t "~%~A~%" (send m :sweep-matrix))
213 ;; need to get multiple-linear regression working (simple linear regr
214 ;; works)... to do this, we need to redo the whole numeric structure,
215 ;; I'm keeping these in as example of brokenness...
217 (send m :basis) ;; this should be positive?
218 (send m :coef-estimates) )
220 #+nil
221 (progn ;; FIXME: Need to clean up data examples, licenses, attributions, etc.
222 ;; The following breaks because we should use a package to hold
223 ;; configuration details, and this would be the only package outside
224 ;; of packages.lisp, as it holds the overall defsystem structure.
225 (load-data "iris.lsp") ;; (the above partially fixed).
226 (variables)
227 diabetes )
229 #+nil
230 (progn
232 ;; FIXME: Data.Frames probably deserve to be related to lists --
233 ;; either lists of cases, or lists of variables. We probably do not
234 ;; want to mix them, but want to be able to convert between such
235 ;; structures.
237 (defparameter *my-case-data*
238 '((:cases
239 (:case1 Y Med 3.4 5)
240 (:case2 N Low 3.2 3)
241 (:case3 Y High 3.1 4))
242 (:var-names (list "Response" "Level" "Pressure" "Size"))))
244 *my-case-data*
246 (elt *my-case-data* 1)
247 (elt *my-case-data* 0)
248 ;;(elt *my-case-data* 2) ;; error
249 (elt (elt *my-case-data* 0) 1)
250 (elt (elt *my-case-data* 0) 0)
251 (elt (elt (elt *my-case-data* 0) 1) 0)
252 (elt (elt (elt *my-case-data* 0) 1) 1)
253 (elt (elt *my-case-data* 0) 2))
257 (progn ;; FIXME: read data from CSV file. To do.
259 ;; challenge is to ensure that we get mixed arrays when we want them,
260 ;; and single-type (simple) arrays in other cases.
262 (defparameter *csv-num* (read-csv "Data/example-num.csv" :type 'numeric))
263 (defparameter *csv-mix* (read-csv "Data/example-mixed.csv" :type 'data))
265 ;; The handling of these types should be compariable to what we do for
266 ;; matrices, but without the numerical processing. i.e. mref, bind2,
267 ;; make-dataframe, and the class structure should be similar.
269 ;; With numerical data, there should be a straightforward mapping from
270 ;; the data.frame to a matrix. With categorical data (including
271 ;; dense categories such as doc-strings, as well as sparse categories
272 ;; such as binary data), we need to include metadata about ordering,
273 ;; coding, and such. So the structures should probably consider
275 ;; Using the CSV file:
277 (asdf:oos 'asdf:compile-op 'csv :force t)
278 (asdf:oos 'asdf:load-op 'parse-number)
279 (asdf:oos 'asdf:load-op 'csv)
280 (fare-csv:read-csv-file "Data/example-numeric.csv")
282 ;; but I think the cl-csv package is broken, need to use the dsv-style
283 ;; package.
285 ;; now we've got the DSV code in the codebase, auto-loaded I hope:
286 cybertiggyr-dsv:*field-separator*
287 (defparameter *example-numeric.csv*
288 (cybertiggyr-dsv:load-escaped "Data/example-numeric.csv"
289 :field-separator #\,))
290 *example-numeric.csv*
292 ;; the following fails because we've got a bit of string conversion
293 ;; to do. 2 thoughts: #1 modify dsv package, but mucking with
294 ;; encapsulation. #2 add a coercion tool (better, but potentially
295 ;; inefficient).
296 #+nil(coerce (nth 3 (nth 3 *example-numeric.csv*)) 'double-float)
298 ;; cases, simple to not so
299 (defparameter *test-string1* "1.2")
300 (defparameter *test-string2* " 1.2")
301 (defparameter *test-string3* " 1.2 ")
305 #+nil
306 (progn ;; experiments with GSL and the Lisp interface.
307 (asdf:oos 'asdf:load-op 'gsll)
308 (asdf:oos 'asdf:load-op 'gsll-tests)
310 ;; the following should be equivalent
311 (setf *t1* (LIST 6.18d0 6.647777777777779d0 6.18d0))
312 (setf *t2* (MULTIPLE-VALUE-LIST
313 (LET ((VEC
314 (gsll:make-marray 'DOUBLE-FLOAT
315 :INITIAL-CONTENTS '(-3.21d0 1.0d0 12.8d0)))
316 (WEIGHTS
317 (gsll:MAKE-MARRAY 'DOUBLE-FLOAT
318 :INITIAL-CONTENTS '(3.0d0 1.0d0 2.0d0))))
319 (LET ((MEAN (gsll:MEAN VEC)))
320 (LIST (gsll:ABSOLUTE-DEVIATION VEC)
321 (gsll:WEIGHTED-ABSOLUTE-DEVIATION VEC WEIGHTS)
322 (gsll:ABSOLUTE-DEVIATION VEC MEAN))))))
323 (eql *t1* *t2*)
325 ;; from (gsll:examples 'gsll::numerical-integration) ...
326 (gsll:integration-qng gsll::one-sine 0.0d0 PI)
328 (gsll:defun-single axpb (x) (+ (* 2 x) 3)) ;; a<-2, b<-3
329 (gsll:integration-qng axpb 1d0 2d0)
331 (let ((a 2)
332 (b 3))
333 (defun-single axpb2 (x) (+ (* a x) b)))
334 (gsll:integration-qng axpb2 1d0 2d0)
336 ;; BAD
337 ;; (gsll:integration-qng
338 ;; (let ((a 2)
339 ;; (b 3))
340 ;; (defun-single axpb2 (x) (+ (* a x) b)))
341 ;; 1d0 2d0)
343 ;; right, but weird expansion...
344 (gsll:integration-qng
345 (let ((a 2)
346 (b 3))
347 (defun axpb2 (x) (+ (* a x) b))
348 (gsll:def-single-function axpb2)
349 axpb2)
350 1d0 2d0)
352 ;; Linear least squares
354 (gsll:gsl-lookup "gsl_linalg_LU_decomp") ; => gsll:lu-decomposition
355 (gsll:gsl-lookup "gsl_linalg_LU_solve") ; => gsll:lu-solve
360 #+nil
361 (progn ;; philosophy time
363 (setf my-model (model :name "ex1"
364 :data-slots (list x y z)
365 :param-slots (list alpha beta gamma)
366 :math-form (regression-model :formula '(= y (+ (* beta x)
367 (* alpha y)
368 (* gamma z)
369 normal-error)))))
370 (setf my-dataset (statistical-table :table data-frame-contents
371 :metadata (list (:case-names (list ))
372 (:var-names (list ))
373 (:documentation "string of doc"))))
375 (setf my-analysis (analysis
376 :model my-model
377 :data my-dataset
378 :parameter-map (pairing (model-param-slots my-model)
379 (data-var-names my-dataset))))
381 ;; ontological implications -- the analysis is an abstract class of
382 ;; data, model, and mapping between the model and data. The fit is
383 ;; the instantiation of such. This provides a statistical object
384 ;; computation theory which can be realized as "executable
385 ;; statistics" or "computable statistics".
386 (setf my-analysis (analyze my-fit
387 :estimation-method 'linear-least-squares-regression))
389 ;; one of the tricks here is that one needs to provide the structure
390 ;; from which to consider estimation, and more importantly, the
391 ;; validity of the estimation.
394 (setf linear-least-squares-regression
395 (estimation-method-definition
396 :variable-defintions ((list
397 ;; from MachLearn: supervised,
398 ;; unsupervised
399 :data-response-vars list-drv ; nil if unsup
401 :param-vars list-pv
402 :data-predictor-vars list-dpv
403 ;; nil in this case. these
404 ;; describe "out-of-box" specs
405 :hyper-vars list-hv))
406 :form '(regression-additive-error
407 :central-form (linear-form drv pv dpv)
408 :error-form 'normal-error)
409 :resulting-decision '(point-estimation interval-estimation)
410 :philosophy 'frequentist
411 :documentation "use least squares to fit a linear regression
412 model to data."))
414 (defparameter *statistical-philosophies*
415 '(frequentist bayesian fiducial decision-analysis)
416 "can be combined to build decision-making approaches and
417 characterizations")
419 (defparameter *decisions*
420 '(estimation selection testing)
421 "possible results from a...")
422 ;; is this really true? One can embedded hypothesis testing within
423 ;; estimation, as the hypothesis estimated to select. And
424 ;; categorical/continuous rear their ugly heads, but not really in
425 ;; an essential way.
427 (defparameter *ontology-of-decision-procedures*
428 (list :decisions
429 (list :estimation
430 (list :point
431 (list :maximum-likelihood
432 :minimum-entropy
433 :least-squares
434 :method-of-moments)
435 :interval
436 (list :maximum-likelihood
438 :testing
439 (list :fisherian
440 :neyman-pearson
441 (list :traditional
442 :bioequivalence-inversion)
443 :selection
444 (list :ranking
445 :top-k-of-n-select))
446 :parametric
447 :partially-parametric))
448 "start of ontology"))
451 ;;;; LM
453 (progn
455 (defparameter *y*
456 (make-vector
458 :type :row
459 :initial-contents '((1d0 2d0 3d0 4d0 5d0 6d0 7d0 8d0))))
462 (defparameter *xv+1*
463 (make-matrix
465 :initial-contents '((1d0 1d0)
466 (1d0 3d0)
467 (1d0 2d0)
468 (1d0 4d0)
469 (1d0 3d0)
470 (1d0 5d0)
471 (1d0 4d0)
472 (1d0 6d0))))
474 (defparameter *xv+1a*
475 (make-matrix
477 :initial-contents #2A((1d0 1d0)
478 (1d0 3d0)
479 (1d0 2d0)
480 (1d0 4d0)
481 (1d0 3d0)
482 (1d0 5d0)
483 (1d0 4d0)
484 (1d0 6d0))))
486 (defparameter *xv+1b*
487 (bind2
488 (ones 8 1)
489 (make-matrix
491 :initial-contents '((1d0)
492 (3d0)
493 (2d0)
494 (4d0)
495 (3d0)
496 (5d0)
497 (4d0)
498 (6d0)))
499 :by :column))
501 (m= *xv+1a* *xv+1b*) ; => T
503 ;; so something like (NOTE: matrices are transposed to begin with, hence the incongruety)
504 (defparameter *xtx-2* (m* (transpose *xv+1*) *xv+1*))
505 ;; #<LA-SIMPLE-MATRIX-DOUBLE 2 x 2
506 ;; 8.0d0 28.0d0
507 ;; 28.0d0 116.0d0>
509 (defparameter *xty-2* (m* (transpose *xv+1*) (transpose *y*)))
510 ;; #<LA-SIMPLE-VECTOR-DOUBLE (2 x 1)
511 ;; 36.0d0
512 ;; 150.0d0>
514 (defparameter *rcond-2* 0.000001)
515 (defparameter *betahat-2* (gelsy *xtx-2* *xty-2* *rcond-2*))
516 ;; *xtx-2* => "details of complete orthogonal factorization"
517 ;; according to man page:
518 ;; #<LA-SIMPLE-MATRIX-DOUBLE 2 x 2
519 ;; -119.33147112141039d0 -29.095426104883202d0
520 ;; 0.7873402682880205d0 -1.20672274167718d0>
522 ;; *xty-2* => output becomes solution:
523 ;; #<LA-SIMPLE-VECTOR-DOUBLE (2 x 1)
524 ;; -0.16666666666668312d0
525 ;; 1.333333333333337d0>
527 *betahat-2* ; which matches R, see below
529 (documentation 'gelsy 'function)
532 ;; (#<LA-SIMPLE-VECTOR-DOUBLE (2 x 1)
533 ;; -0.16666666666668312 1.333333333333337>
534 ;; 2)
536 ;; ## Test case in R:
537 ;; x <- c( 1.0, 3.0, 2.0, 4.0, 3.0, 5.0, 4.0, 6.0)
538 ;; y <- c( 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0)
539 ;; lm(y~x)
540 ;; ## => Call: lm(formula = y ~ x)
542 ;; Coefficients: (Intercept) x
543 ;; -0.1667 1.3333
545 ;; summary(lm(y~x))
546 ;; ## =>
548 ;; Call:
549 ;; lm(formula = y ~ x)
551 ;; Residuals:
552 ;; Min 1Q Median 3Q Max
553 ;; -1.833e+00 -6.667e-01 -3.886e-16 6.667e-01 1.833e+00
555 ;; Coefficients:
556 ;; Estimate Std. Error t value Pr(>|t|)
557 ;; (Intercept) -0.1667 1.1587 -0.144 0.89034
558 ;; x 1.3333 0.3043 4.382 0.00466 **
559 ;; ---
560 ;; Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
562 ;; Residual standard error: 1.291 on 6 degrees of freedom
563 ;; Multiple R-squared: 0.7619, Adjusted R-squared: 0.7222
564 ;; F-statistic: 19.2 on 1 and 6 DF, p-value: 0.004659
568 ;; which suggests one might do (modulo ensuring correct
569 ;; orientations). When this is finalized, it should migrate to
570 ;; CLS.
572 ;; might add args: (method 'gelsy), or do we want to put a more
573 ;; general front end, linear-least-square, across the range of
574 ;; LAPACK solvers?
575 (defun lm (x y &optional rcond)
576 "fit the linear model:
577 y = x \beta + e
579 and estimate \beta. X,Y should be in cases-by-vars form, i.e. X
580 should be n x p, Y should be n x 1. Returns estimates, n and p.
581 Probably should return a form providing the call, as well.
583 R's lm object returns: coefficients, residuals, effects, rank, fitted,
584 qr-results for numerical considerations, DF_resid. Need to
585 encapsulate into a class or struct.
587 (check-type x matrix-like)
588 (check-type y vector-like) ; vector-like might be too strict?
589 ; maybe matrix-like?
590 (assert (= (nrows y) (nrows x)) ; same number of observations/cases
591 (x y) "Can not multiply x:~S by y:~S" x y)
592 (let ((betahat (gelsy (m* (transpose x) x)
593 (m* (transpose x) y)
594 (if rcond rcond (* (coerce (expt 2 -52) 'double-float)
595 (max (nrows x)
596 (ncols y))))))
597 (betahat1 (gelsy x
599 (* (coerce (expt 2 -52) 'double-float)
600 (max (nrows x)
601 (ncols y))))))
602 ;; need computation for SEs,
603 (format t "")
604 (list betahat ; LA-SIMPLE-VECTOR-DOUBLE
605 betahat1 ; LA-SLICE-VECVIEW-DOUBLE
606 (v- (first betahat) (first betahat1))
607 ;; (sebetahat betahat x y) ; TODO: write me!
608 (nrows x) ; surrogate for n
609 (ncols x)))) ; surrogate for p
612 (defparameter *n* 20) ; # rows = # obsns
613 (defparameter *p* 10) ; # cols = # vars
614 (defparameter *x-temp* (rand *n* *p*))
615 (defparameter *b-temp* (rand *p* 1))
616 (defparameter *y-temp* (m* *x-temp* *b-temp*))
617 ;; so Y=Xb + \eps
618 (defparameter *rcond* (* (coerce (expt 2 -52) 'double-float)
619 (max (nrows *x-temp*) (ncols *y-temp*))))
620 (defparameter *orig-x* (copy *x-temp*))
621 (defparameter *orig-b* (copy *b-temp*))
622 (defparameter *orig-y* (copy *y-temp*))
624 (defparameter *lm-result* (lm *x-temp* *y-temp*))
625 (princ (first *lm-result*))
626 (princ (second *lm-result*))
627 (princ (third *lm-result*))
628 (v= (third *lm-result*)
629 (v- (first (first *lm-result*))
630 (first (second *lm-result*)))))