3 ;;; Time-stamp: <2009-04-13 12:02:31 tony>
4 ;;; Creation: <2008-09-08 08:06:30 tony>
6 ;;; Author: AJ Rossini <blindglobe@gmail.com>
7 ;;; Copyright: (c) 2007-2008, AJ Rossini <blindglobe@gmail.com>. BSD.
8 ;;; Purpose: Stuff that needs to be made working sits inside the
9 ;;; progns... This file contains the current challenges to
10 ;;; solve, including a description of the setup and the work
13 ;;; What is this talk of 'release'? Klingons do not make software
14 ;;; 'releases'. Our software 'escapes', leaving a bloody trail of
15 ;;; designers and quality assurance people in its wake.
20 ;;(asdf:oos 'asdf:load-op 'lisp-matrix)
21 ;;(asdf:oos 'asdf:compile-op 'lispstat)
22 ;;(asdf:oos 'asdf:load-op 'lispstat)
24 (in-package :lisp-stat-unittests)
26 ;; tests = 78, failures = 7, errors = 15
27 (run-tests :suite 'lisp-stat-ut) ; run the whole LISP-STAT-UT suite
28 (describe (run-tests :suite 'lisp-stat-ut)) ; same run, with DESCRIBE applied to the result
30 ;; FIXME: Example: currently not relevant, yet
31 ;; (describe (lift::run-test :test-case 'lisp-stat-unittests::create-proto
32 ;; :suite 'lisp-stat-unittests::lisp-stat-ut-proto))
34 (describe 'lisp-stat-ut) ; describe the suite symbol itself, without running it
38 (progn ;; FIXME: Regression modeling (some data future-ish)
41 (regression-model (list->vector-like iron
) ;; BROKEN
42 (list->vector-like absorbtion
))
49 (covariation-matrix *m-fit
*)
55 ;; (lsos::ls-objects-methods m) ; bogus?
59 (regression-model (list->vector-like iron
)
60 (list->vector-like absorbtion
)))
64 (regression-model (listoflists->matrix-like
(list iron aluminum
))
65 (list->vector-like absorbtion
) :print nil
))
68 (send m
:sweep-matrix
)
69 (format t
"~%~A~%" (send m
:sweep-matrix
))
71 ;; need to get multiple-linear regression working (simple linear regression
72 ;; works)... to do this, we need to redo the whole numeric structure,
73 ;; I'm keeping these in as example of brokenness...
75 (send m
:basis
) ;; this should be positive?
76 (send m
:coef-estimates
) )
79 (progn ;; FIXME: Need to clean up data examples, licenses, attributions, etc.
80 ;; The following breaks because we should use a package to hold
81 ;; configuration details, and this would be the only package outside
82 ;; of packages.lisp, as it holds the overall defsystem structure.
83 (load-data "iris.lsp") ;; (the above partially fixed).
90 (describe (lift::run-tests
:suite
'lisp-stat-ut-dataframe
))
91 (lift::run-tests
:suite
'lisp-stat-ut-dataframe
)
95 :test-case
'lisp-stat-unittests
::create-proto
96 :suite
'lisp-stat-unittests
::lisp-stat-ut-proto
))
98 (defparameter *my-df-1
*
99 (make-instance 'dataframe-array
100 :storage
#2A
((1 2 3 4 5)
102 :doc
"This is an interesting dataframe-array"
103 :case-labels
(list "x" "y")
104 :var-labels
(list "a" "b" "c" "d" "e")))
106 (setf (dfref *my-df-1
* 0 0) -
1d0
)
110 (make-dataframe #2A
((1 2 3 4 5)
113 (make-dataframe (rand 4 3))
117 (make-instance 'dataframe-array
124 (make-instance 'dataframe-array
131 (make-instance 'dataframe-array
132 :storage
#2A
((1d0 2d0
)
138 (defparameter *my-df-1
*
139 (make-dataframe #2A
((1 2 3 4 5)
141 :caselabels
(list "x" "y")
142 :varlabels
(list "a" "b" "c" "d" "e")
143 :doc
"This is an interesting dataframe-array"))
145 (caselabels *my-df-1
*)
146 (varlabels *my-df-1
*)
149 (defparameter *my-df-2
*
150 (make-instance 'dataframe-array
152 (make-array-from-listoflists
153 (cybertiggyr-dsv::load-escaped
154 "/media/disk/Desktop/sandbox/CLS.git/Data/example-mixed.csv"))
155 :doc
"This is an interesting dataframe-array"))
156 #|
:case-labels
(list "x" "y")
157 :var-labels
(list "a" "b" "c" "d" "e")
165 (describe 'make-matrix
)
167 (defparameter *indep-vars-2-matrix
*
168 (make-matrix (length iron
) 2
170 (mapcar #'(lambda (x y
)
171 (list (coerce x
'double-float
)
172 (coerce y
'double-float
)))
176 (defparameter *dep-var
*
177 (make-vector (length absorbtion
)
181 (mapcar #'(lambda (x) (coerce x
'double-float
))
184 (make-dataframe *dep-var
*)
185 (make-dataframe (transpose *dep-var
*))
187 (defparameter *dep-var-int
*
188 (make-vector (length absorbtion
)
190 :element-type
'integer
191 :initial-contents
(list absorbtion
)))
194 (defparameter *xv
+1a
*
197 :initial-contents
#2A
((1d0 1d0
)
206 (defparameter *xv
+1b
*
211 :initial-contents
'((1d0)
221 (m= *xv
+1a
* *xv
+1b
*) ; => T
223 (princ "Data Set up"))
229 ;; REVIEW: general Lisp use guidance
231 (fdefinition 'make-matrix) ; the function object currently bound to MAKE-MATRIX
232 (documentation 'make-matrix 'function) ; its docstring, if one was supplied
234 #| Examples from CLHS, a bit of guidance.
236 ;; This function assumes its callers have checked the types of the
237 ;; arguments, and authorizes the compiler to build in that assumption.
238 (defun discriminant (a b c
)
239 (declare (number a b c
))
240 "Compute the discriminant for a quadratic equation."
241 (- (* b b
) (* 4 a c
))) => DISCRIMINANT
242 (discriminant 1 2/3 -
2) => 76/9
244 ;; This function assumes its callers have not checked the types of the
245 ;; arguments, and performs explicit type checks before making any assumptions.
246 (defun careful-discriminant (a b c
)
247 "Compute the discriminant for a quadratic equation."
248 (check-type a number
)
249 (check-type b number
)
250 (check-type c number
)
251 (locally (declare (number a b c
))
252 (- (* b b
) (* 4 a c
)))) => CAREFUL-DISCRIMINANT
253 (careful-discriminant 1 2/3 -
2) => 76/9
261 (progn ;; FIXME: read data from CSV file. To do.
264 ;; challenge is to ensure that we get mixed arrays when we want them,
265 ;; and single-type (simple) arrays in other cases.
268 (defparameter *csv-num
*
269 (cybertiggyr-dsv::load-escaped
270 #p
"/media/disk/Desktop/sandbox/CLS.git/Data/example-numeric.csv"
274 (nth 0 (nth 0 *csv-num
*))
276 (defparameter *csv-num
*
277 (cybertiggyr-dsv::load-escaped
278 #p
"/media/disk/Desktop/sandbox/CLS.git/Data/example-numeric2.dsv"
279 :field-separator
#\
:))
281 (nth 0 (nth 0 *csv-num
*))
284 ;; The handling of these types should be comparable to what we do for
285 ;; matrices, but without the numerical processing. i.e. mref, bind2,
286 ;; make-dataframe, and the class structure should be similar.
288 ;; With numerical data, there should be a straightforward mapping from
289 ;; the data.frame to a matrix. With categorical data (including
290 ;; dense categories such as doc-strings, as well as sparse categories
291 ;; such as binary data), we need to include metadata about ordering,
292 ;; coding, and such. So the structures should probably consider
294 ;; Using the CSV file:
296 (defun parse-number (s)
297 (let* ((*read-eval
* nil
)
298 (n (read-from-string s
)))
304 (parse-number " 34 ")
306 (+ (parse-number "3.4") 3)
307 (parse-number "3.4 ")
308 (parse-number " 3.4")
309 (+ (parse-number " 3.4 ") 3)
313 ;; (coerce "2.3" 'number) => ERROR
314 ;; (coerce "2" 'float) => ERROR
316 (defparameter *csv-num
*
317 (cybertiggyr-dsv::load-escaped
318 #p
"/media/disk/Desktop/sandbox/CLS.git/Data/example-numeric.csv"
320 :filter
#'parse-number
323 (nth 0 (nth 0 *csv-num
*))
325 (defparameter *csv-num
*
326 (cybertiggyr-dsv::load-escaped
327 #p
"/media/disk/Desktop/sandbox/CLS.git/Data/example-numeric2.dsv"
329 :filter
#'parse-number
))
331 (nth 0 (nth 0 *csv-num
*))
333 ;; now we've got the DSV code in the codebase, auto-loaded I hope:
334 cybertiggyr-dsv
:*field-separator
*
335 (defparameter *example-numeric.csv
*
336 (cybertiggyr-dsv:load-escaped
"Data/example-numeric.csv"
337 :field-separator
#\
,))
338 *example-numeric.csv
*
340 ;; the following fails because we've got a bit of string conversion
341 ;; to do. 2 thoughts: #1 modify dsv package, but mucking with
342 ;; encapsulation. #2 add a coercion tool (better, but potentially
344 #+nil
(coerce (nth 3 (nth 3 *example-numeric.csv
*)) 'double-float
)
346 ;; cases, simple to not so
347 (defparameter *test-string1* "1.2") ; number with no surrounding whitespace
348 (defparameter *test-string2* " 1.2") ; same number with a leading space
349 (defparameter *test-string3* " 1.2 ") ; same number with leading and trailing spaces
354 (progn ;; experiments with GSL and the Lisp interface.
355 (asdf:oos 'asdf:load-op 'gsll) ; load the GSLL system through ASDF
356 (asdf:oos 'asdf:load-op 'gsll-tests) ; then load the GSLL-TESTS system the same way
358 ;; the following should be equivalent
359 (setf *t1
* (LIST 6.18d0
6.647777777777779d0
6.18d0
))
360 (setf *t2
* (MULTIPLE-VALUE-LIST
362 (gsll:make-marray
'DOUBLE-FLOAT
363 :INITIAL-CONTENTS
'(-3.21d0
1.0d0
12.8d0
)))
365 (gsll:MAKE-MARRAY
'DOUBLE-FLOAT
366 :INITIAL-CONTENTS
'(3.0d0
1.0d0
2.0d0
))))
367 (LET ((MEAN (gsll:MEAN VEC
)))
368 (LIST (gsll:ABSOLUTE-DEVIATION VEC
)
369 (gsll:WEIGHTED-ABSOLUTE-DEVIATION VEC WEIGHTS
)
370 (gsll:ABSOLUTE-DEVIATION VEC MEAN
))))))
373 ;; from (gsll:examples 'gsll::numerical-integration) ...
374 (gsll:integration-qng gsll
::one-sine
0.0d0 PI
)
376 (gsll:defun-single axpb
(x) (+ (* 2 x
) 3)) ;; a<-2, b<-3
377 (gsll:integration-qng axpb
1d0
2d0
)
381 (defun-single axpb2
(x) (+ (* a x
) b
)))
382 (gsll:integration-qng axpb2
1d0
2d0
)
385 ;; (gsll:integration-qng
388 ;; (defun-single axpb2 (x) (+ (* a x) b)))
391 ;; right, but weird expansion...
392 (gsll:integration-qng
395 (defun axpb2 (x) (+ (* a x
) b
))
396 (gsll:def-single-function axpb2
)
400 ;; Linear least squares
402 (gsll:gsl-lookup
"gsl_linalg_LU_decomp") ; => gsll:lu-decomposition
403 (gsll:gsl-lookup
"gsl_linalg_LU_solve") ; => gsll:lu-solve
409 (progn ;; philosophy time
411 (setf my-model
(model :name
"ex1"
412 :data-slots
(list w x y z
)
413 :param-slots
(list alpha beta gamma
)
414 :math-form
(regression-model :formula
'(= w
(+ (* beta x
)
418 :centrality
'median
; 'mean
425 (setf my-dataset
(statistical-table :table data-frame-contents
426 :metadata
(list (:case-names
(list ))
428 (:documentation
"string of doc"))))
430 (setf my-analysis
(analysis
433 :parameter-map
(pairing (model-param-slots my-model
)
434 (data-var-names my-dataset
))))
436 ;; ontological implications -- the analysis is an abstract class of
437 ;; data, model, and mapping between the model and data. The fit is
438 ;; the instantiation of such. This provides a statistical object
439 ;; computation theory which can be realized as "executable
440 ;; statistics" or "computable statistics".
441 (setf my-analysis
(analyze my-fit
442 :estimation-method
'linear-least-squares-regression
))
444 ;; one of the tricks here is that one needs to provide the structure
445 ;; from which to consider estimation, and more importantly, the
446 ;; validity of the estimation.
449 (setf linear-least-squares-regression
450 (estimation-method-definition
451 :variable-defintions
((list
452 ;; from MachLearn: supervised,
454 :data-response-vars list-drv
; nil if unsup
457 :data-predictor-vars list-dpv
458 ;; nil in this case. these
459 ;; describe "out-of-box" specs
460 :hyper-vars list-hv
))
461 :form
'(regression-additive-error
462 :central-form
(linear-form drv pv dpv
)
463 :error-form
'normal-error
)
464 :resulting-decision
'(point-estimation interval-estimation
)
465 :philosophy
'frequentist
466 :documentation
"use least squares to fit a linear regression
469 (defparameter *statistical-philosophies
*
470 '(frequentist bayesian fiducial decision-analysis
)
471 "can be combined to build decision-making approaches and
474 (defparameter *decisions
*
475 '(estimation selection testing
)
476 "possible results from a...")
477 ;; is this really true? One can embed hypothesis testing within
478 ;; estimation, as the hypothesis estimated to select. And
479 ;; categorical/continuous rear their ugly heads, but not really in
482 (defparameter *ontology-of-decision-procedures
*
486 (list :maximum-likelihood
491 (list :maximum-likelihood
497 :bioequivalence-inversion
)
502 :partially-parametric
))
503 "start of ontology"))
514 :initial-contents
'((1d0 2d0
3d0
4d0
5d0
6d0
7d0
8d0
))))
520 :initial-contents
'((1d0 1d0
)
530 ;; so something like (NOTE: matrices are transposed to begin with, hence the incongruity)
531 (defparameter *xtx-2* (m* (transpose *xv+1*) *xv+1*)) ; Xt X for the design matrix *xv+1*
532 ;; #<LA-SIMPLE-MATRIX-DOUBLE 2 x 2
536 (defparameter *xty-2* (m* (transpose *xv+1*) (transpose *y*))) ; Xt Y; *y* is transposed first
537 ;; #<LA-SIMPLE-VECTOR-DOUBLE (2 x 1)
541 (defparameter *rcond-2* 0.000001) ; third argument handed to GELSY below
542 (defparameter *betahat-2* (gelsy *xtx-2* *xty-2* *rcond-2*)) ; solve (Xt X) b = Xt Y via GELSY
543 ;; *xtx-2* => "details of complete orthogonal factorization"
544 ;; according to man page:
545 ;; #<LA-SIMPLE-MATRIX-DOUBLE 2 x 2
546 ;; -119.33147112141039d0 -29.095426104883202d0
547 ;; 0.7873402682880205d0 -1.20672274167718d0>
549 ;; *xty-2* => output becomes solution:
550 ;; #<LA-SIMPLE-VECTOR-DOUBLE (2 x 1)
551 ;; -0.16666666666668312d0
552 ;; 1.333333333333337d0>
554 *betahat-2
* ; which matches R, see below
556 (documentation 'gelsy
'function
)
559 ;; (#<LA-SIMPLE-VECTOR-DOUBLE (2 x 1)
560 ;; -0.16666666666668312 1.333333333333337>
563 ;; ## Test case in R:
564 ;; x <- c( 1.0, 3.0, 2.0, 4.0, 3.0, 5.0, 4.0, 6.0)
565 ;; y <- c( 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0)
567 ;; ## => Call: lm(formula = y ~ x)
569 ;; Coefficients: (Intercept) x
576 ;; lm(formula = y ~ x)
579 ;; Min 1Q Median 3Q Max
580 ;; -1.833e+00 -6.667e-01 -3.886e-16 6.667e-01 1.833e+00
583 ;; Estimate Std. Error t value Pr(>|t|)
584 ;; (Intercept) -0.1667 1.1587 -0.144 0.89034
585 ;; x 1.3333 0.3043 4.382 0.00466 **
587 ;; Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
589 ;; Residual standard error: 1.291 on 6 degrees of freedom
590 ;; Multiple R-squared: 0.7619, Adjusted R-squared: 0.7222
591 ;; F-statistic: 19.2 on 1 and 6 DF, p-value: 0.004659
595 ;; which suggests one might do (modulo ensuring correct
596 ;; orientations). When this is finalized, it should migrate to
601 (defparameter *n* 20) ; # rows = # obsns
602 (defparameter *p* 10) ; # cols = # vars
603 (defparameter *x-temp* (rand *n* *p*)) ; X, built by RAND with *n* and *p* (presumably random entries -- confirm)
604 (defparameter *b-temp* (rand *p* 1)) ; b, built by RAND with *p* and 1
605 (defparameter *y-temp* (m* *x-temp* *b-temp*)) ; y = X b; no error term is added here
607 (defparameter *rcond* (* (coerce (expt 2 -52) 'double-float) ; 2^-52 as a double-float ...
608 (max (nrows *x-temp*) (ncols *y-temp*)))) ; ... scaled by max(rows of X, cols of y)
609 (defparameter *orig-x* (copy *x-temp*)) ; copies taken before LM is called
610 (defparameter *orig-b* (copy *b-temp*))
611 (defparameter *orig-y* (copy *y-temp*))
613 (defparameter *lm-result* (lm *x-temp* *y-temp*)) ; fit; the result is consumed as a list below
614 (princ (first *lm-result*))
615 (princ (second *lm-result*))
616 (princ (third *lm-result*))
617 (v= (third *lm-result*) ; check: third element equals the difference of ...
618 (v- (first (first *lm-result*)) ; ... the head of the first element and ...
619 (first (second *lm-result*)))) ; ... the head of the second element
624 ;; Some issues exist in the LAPACK vs. LINPACK variants, hence R
625 ;; uses LINPACK primarily, rather than LAPACK. See comments in R
626 ;; source for issues.
629 ;; Goal is to start from X, Y and then realize that if
630 ;; Y = X \beta + \eps, then, i.e. 8x1 = 8xp px1 + 8x1
631 ;; XtX \hat\beta = Xt Y
632 ;; so that we can solve the equation W \beta = Z where W and Z
633 ;; are known, to estimate \beta.
635 ;; the above is known to be numerically unstable -- some processing
636 ;; of X is preferred and should be done prior. And most of the
637 ;; transformation-based work does precisely that.
639 ;; recall: Var[Y] = E[(Y - E[Y])(Y-E[Y])t]
640 ;; = E[Y Yt] - 2 \mu \mut + \mu \mut
641 ;; = E[Y Yt] - \mu \mut
643 ;; Var Y = E[Y^2] - \mu^2
646 ;; For initial estimates of covariance of \hat\beta:
648 ;; \hat\beta = (Xt X)^-1 Xt Y
649 ;; with E[ \hat\beta ]
650 ;; = E[ (Xt X)^-1 Xt Y ]
651 ;; = E[(Xt X)^-1 Xt (X\beta)]
654 ;; So Var[\hat\beta] = ...
656 ;; and this gives SE(\beta_i) = (* (sqrt (mref Var i i)) adjustment)
662 (let ((*default-implementation
* :foreign-array
))
668 (rcond (* (coerce (expt 2 -
52) 'double-float
)
669 (max (nrows a
) (ncols a
))))
673 (list x
(gelsy a b rcond
))
674 ;; no applicable conversion?
675 ;; (m- (#<FA-SIMPLE-VECTOR-DOUBLE (10 x 1))
676 ;; (#<FA-SIMPLE-VECTOR-DOUBLE (10 x 1)) )
677 (v- x
(first (gelsy a b rcond
))))))
680 (princ *temp-result
*)
683 (let ((*default-implementation
* :lisp-array
))
689 (rcond (* (coerce (expt 2 -
52) 'double-float
)
690 (max (nrows a
) (ncols a
))))
694 (list x
(gelsy a b rcond
))
695 (m- x
(first (gelsy a b rcond
)))
697 (princ *temp-result
*)
703 :type
:row
;; default, not usually needed!
704 :initial-contents
'((1d0 3d0
2d0
4d0
3d0
5d0
4d0
6d0
))))
710 :initial-contents
'((1d0 2d0
3d0
4d0
5d0
6d0
7d0
8d0
))))
712 ;; so something like (NOTE: matrices are transposed to begin with, hence the incongruity)
713 (defparameter *xtx-1
* (m* *xv
* (transpose *xv
*)))
714 (defparameter *xty-1
* (m* *xv
* (transpose *y
*)))
715 (defparameter *rcond-in
* (* (coerce (expt 2 -
52) 'double-float
)
719 (defparameter *betahat
* (gelsy *xtx-1
* *xty-1
* *rcond-in
*))
721 ;; (#<LA-SIMPLE-VECTOR-DOUBLE (1 x 1)
722 ;; 1.293103448275862>
725 ;; ## Test case in R:
726 ;; x <- c( 1.0, 3.0, 2.0, 4.0, 3.0, 5.0, 4.0, 6.0)
727 ;; y <- c( 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0)
731 ;; lm(formula = y ~ x - 1)
744 (asdf:oos 'asdf:load-op 'cl-plplot) ; load the CL-PLPLOT system through ASDF
750 (type-of #2A
((1 2 3 4 5)
753 (type-of (rand 10 20))
755 (typep #2A
((1 2 3 4 5)
759 (typep (rand 10 20) 'matrix-like
)
761 (typep #2A
((1 2 3 4 5)
765 (typep (rand 10 20) 'array
)