TODO.lisp

   1 ;;; -*- mode: lisp -*-
   2
   3 ;;; Time-stamp: <2009-02-17 08:37:49 tony>
   4 ;;; Creation:   <2008-09-08 08:06:30 tony>
   5 ;;; File:       TODO.lisp
   6 ;;; Author:     AJ Rossini <blindglobe@gmail.com>
   7 ;;; Copyright:  (c) 2007-2008, AJ Rossini <blindglobe@gmail.com>.  BSD.
   8 ;;; Purpose:    Stuff that needs to be made working sits inside the progns...
   9
  10 ;;; What is this talk of 'release'? Klingons do not make software
  11 ;;; 'releases'.  Our software 'escapes', leaving a bloody trail of
  12 ;;; designers and quality assurance people in its wake.
  13
  14 ;;; This file contains the current challenges to solve, including a
  15 ;;; description of the setup and the work to solve....
  16
  17 ;;; SET UP
  18
  19 (in-package :cl-user)
  20 ;;(asdf:oos 'asdf:compile-op 'lispstat)
  21 ;;(asdf:oos 'asdf:load-op 'lispstat)
  22
  23 (in-package :lisp-stat-unittests)
  24
  25 ;; presumably the tally for the suite run below: tests = 54, failures = 7, errors = 3
  26
  27 (describe (run-tests :suite 'lisp-stat-ut)) ; run the suite and DESCRIBE the returned result
  28 (run-tests :suite 'lisp-stat-ut)            ; run it again, leaving the result as the value
  29
  30 #|
  31   ;; FIXME: Example: not yet relevant
  32   (describe
  33     (lift::run-test
  34       :test-case  'lisp-stat-unittests::create-proto
  35       :suite 'lisp-stat-unittests::lisp-stat-ut-proto))
  36 |#
  37
  38 (in-package :ls-user) ; the forms that follow are read in LS-USER
  39
  40
  41 (progn ;; Data setup: build matrices/vectors from the IRON, ALUMINUM and ABSORBTION lists
  42
  43   ;; Making data-frames (i.e. cases (rows) by variables (columns))
  44   ;; takes a bit of getting used to.  For this, it is important to
  45   ;; realize that we can do either of the following:
  46   ;; #1 - consider the possibility of having a row, and transposing
  47   ;; it, so the list-of-lists is:  ((1 2 3 4 5))     (1 row, 5 columns)
  48   ;; #2 - naturally list-of-lists: ((1)(2)(3)(4)(5)) (5 rows, 1 column)
  49   ;; see src/data/listoflist.lisp for code to process this particular
  50   ;; data structure.
  51   (defparameter *indep-vars-1-matrix*
  52     (transpose  (make-matrix 1 (length iron)
  53                  :initial-contents
  54                  (list (mapcar #'(lambda (x) (coerce x 'double-float))
  55                                iron))))
  56     "creating iron into double float, straightforward")
  57
  58   (documentation '*indep-vars-1-matrix* 'variable)
  59   ;; *indep-vars-1-matrix*
  60
  61   ;; or directly, as (length iron) rows of one element each:
  62   (defparameter *indep-vars-1a-matrix*
  63     (make-matrix (length iron)  1
  64                  :initial-contents
  65                  (mapcar #'(lambda (x) (list  (coerce x 'double-float)))
  66                                iron)))
  67   ;; *indep-vars-1a-matrix*
  68
  69   ;; and mathematically, they seem equal:
  70   (m= *indep-vars-1-matrix* *indep-vars-1a-matrix*) ; => T
  71   ;; but of course they are not the same object...
  72   (eql *indep-vars-1-matrix* *indep-vars-1a-matrix*) ; => NIL
  73   (eq *indep-vars-1-matrix* *indep-vars-1a-matrix*) ; => NIL
  74
  75   ;; and verify by printing both...
  76   (print *indep-vars-1-matrix*)
  77   (print *indep-vars-1a-matrix*)
  78
  79   (documentation 'lisp-matrix:bind2 'function) ; by which we mean:
  80   (documentation 'bind2 'function)
  81   (bind2 *indep-vars-1-matrix* *indep-vars-1a-matrix* :by :column) ; 2 col
  82   (bind2 *indep-vars-1-matrix* *indep-vars-1a-matrix* :by :row) ; 1 long col
  83
  84   ;; the weird way: build 2 rows (iron, aluminum) of (length iron) columns, then transpose
  85   (defparameter *indep-vars-2-matrix*
  86     (transpose (make-matrix  2 (length iron)
  87                              :initial-contents
  88                              (list
  89                               (mapcar #'(lambda (x) (coerce x 'double-float))
  90                                       iron)
  91                               (mapcar #'(lambda (x) (coerce x 'double-float))
  92                                       aluminum)))))
  93   ;; *indep-vars-2-matrix*
  94
  95   ;; the "right"? way -- NOTE: this DEFPARAMETER rebinds *indep-vars-2-matrix* defined just above
  96   (defparameter *indep-vars-2-matrix*
  97     (make-matrix (length iron) 2
  98                  :initial-contents
  99                  (mapcar #'(lambda (x y)
 100                              (list (coerce x 'double-float)
 101                                    (coerce y 'double-float)))
 102                          iron aluminum)))
 103   ;; *indep-vars-2-matrix*
 104
 105
 106   ;; The below FAILS due to coercion issues; it just isn't lispy, it's R'y.
 107 #|
 108   (defparameter *dep-var* (make-vector (length absorbtion)
 109                                        :initial-contents (list absorbtion)))
 110 |#
 111   ;; BUT below, the elements are coerced to double-float first, so this should be the right type.
 112   (defparameter *dep-var*
 113     (make-vector (length absorbtion)
 114                  :type :row
 115                  :initial-contents
 116                  (list
 117                   (mapcar #'(lambda (x) (coerce x 'double-float))
 118                           absorbtion))))
 119   ;; *dep-var*
 120
 121
 122   (defparameter *dep-var-int*
 123     (make-vector (length absorbtion)
 124                  :type :row
 125                  :element-type 'integer
 126                  :initial-contents (list absorbtion)))
 127
 128   (typep *dep-var* 'matrix-like)        ; => T
 129   (typep *dep-var* 'vector-like)        ; => T
 130
 131   (typep *indep-vars-1-matrix* 'matrix-like) ; => T
 132   (typep *indep-vars-1-matrix* 'vector-like) ; => T
 133   (typep *indep-vars-2-matrix* 'matrix-like) ; => T
 134   (typep *indep-vars-2-matrix* 'vector-like) ; => NIL
 135
 136   iron
 137   ;; following fails, need to ensure that we work on list elts, not just
 138   ;; elts within a list:
 139   ;;
 140   ;;     (coerce iron 'real)
 141   ;;
 142   ;; the following is a general list-conversion coercion approach -- is
 143   ;; there a more efficient way?
 144   ;;     (coerce 1 'real)
 145   ;;     (mapcar #'(lambda (x) (coerce x 'double-float)) iron)
 146
 147   (princ "Data Set up"))
 148
 149
 150 (progn
 151   ;; REVIEW: general Lisp use guidance
 152
 153   (fdefinition 'make-matrix)              ; the function object currently named MAKE-MATRIX
 154   (documentation 'make-matrix 'function)  ; its function docstring, if any
 155
 156 #| Examples from CLHS, a bit of guidance.
 157
 158   ;; This function assumes its callers have checked the types of the
 159   ;; arguments, and authorizes the compiler to build in that assumption.
 160   (defun discriminant (a b c)
 161    (declare (number a b c))
 162    "Compute the discriminant for a quadratic equation."
 163    (- (* b b) (* 4 a c))) =>  DISCRIMINANT
 164   (discriminant 1 2/3 -2) =>  76/9
 165
 166   ;; This function assumes its callers have not checked the types of the
 167   ;; arguments, and performs explicit type checks before making any assumptions.
 168  (defun careful-discriminant (a b c)
 169    "Compute the discriminant for a quadratic equation."
 170    (check-type a number)
 171    (check-type b number)
 172    (check-type c number)
 173    (locally (declare (number a b c))
 174      (- (* b b) (* 4 a c)))) =>  CAREFUL-DISCRIMINANT
 175  (careful-discriminant 1 2/3 -2) =>  76/9
 176 |#
 177   )
 178
 179
 180 #+nil ; the PROGN below is skipped by the reader unless :NIL is in *FEATURES*
 181 (progn ;; FIXME: Regression modeling
 182
 183   ;; data setup in previous FIXME
 184   (defparameter  *m* nil
 185     "holding variable.")
 186   ;; need to make vectors and matrices from the lists...
 187
 188   ;; BROKEN
 189   (def *m* (regression-model (list->vector-like iron)
 190                              (list->vector-like absorbtion)))
 191
 192   (def m (regression-model (list->vector-like iron)
 193                            (list->vector-like absorbtion) :print nil))
 194                            ;;Good
 195   (send m :print)
 196   (send m :own-slots)
 197   (send m :own-methods)
 198   ;; (lsos::ls-objects-methods m) ; bogus?
 199   (send m :show)
 200
 201   ;; same model as above, but without :print nil
 202   (def m (regression-model (list->vector-like iron)
 203                            (list->vector-like absorbtion)))
 204   ;; two predictor lists (iron, aluminum) passed as a list of lists
 205   (def m (regression-model (listoflists->matrix-like  (list iron aluminum))
 206                            (list->vector-like  absorbtion) :print nil))
 207
 208   (send m :compute)
 209   (send m :sweep-matrix)
 210   (format t "~%~A~%" (send m :sweep-matrix))
 211
 212   ;; need to get multiple-linear regression working (simple linear regr
 213   ;; works)... to do this, we need to redo the whole numeric structure,
 214   ;; I'm keeping these in as examples of brokenness...
 215
 216   (send m :basis) ;; this should be positive?
 217   (send m :coef-estimates)  )
 218
 219 #+nil ; the PROGN below is skipped by the reader unless :NIL is in *FEATURES*
 220 (progn ;; FIXME: Need to clean up data examples, licenses, attributions, etc.
 221   ;; The following breaks because we should use a package to hold
 222   ;; configuration details, and this would be the only package outside
 223   ;; of packages.lisp, as it holds the overall defsystem structure.
 224   (load-data "iris.lsp")  ;; (the above partially fixed).
 225   (variables)
 226   diabetes ) ; NOTE(review): iris.lsp is loaded above but DIABETES is evaluated here -- confirm
 227
 228 #+nil ; the PROGN below is skipped by the reader unless :NIL is in *FEATURES*
 229 (progn
 230
 231   ;; FIXME: Data.Frames probably deserve to be related to lists --
 232   ;; either lists of cases, or lists of variables.  We probably do not
 233   ;; want to mix them, but want to be able to convert between such
 234   ;; structures.
 235
 236   (defparameter *my-case-data*
 237     '((:cases
 238        (:case1 Y Med  3.4 5)
 239        (:case2 N Low  3.2 3)
 240        (:case3 Y High 3.1 4))
 241       (:var-names ("Response" "Level" "Pressure" "Size")))) ; quoted data: no LIST call, else the symbol LIST becomes an element
 242
 243   *my-case-data*
 244
 245   (elt *my-case-data* 1)                    ; => the (:var-names ...) entry
 246   (elt *my-case-data* 0)                    ; => the (:cases ...) entry
 247   ;;(elt *my-case-data* 2) ;; error: only two top-level entries
 248   (elt (elt *my-case-data* 0) 1)            ; => the :case1 row
 249   (elt (elt *my-case-data* 0) 0)            ; => :cases
 250   (elt (elt (elt *my-case-data* 0) 1) 0)    ; => :case1
 251   (elt (elt (elt *my-case-data* 0) 1) 1)    ; => Y
 252   (elt (elt *my-case-data* 0) 2))           ; => the :case2 row
 253
 254
 255
 256 (progn ;; FIXME: read data from CSV file.  To do.
 257
 258   ;; challenge is to ensure that we get mixed arrays when we want them,
 259   ;; and single-type (simple) arrays in other cases.
 260
 261   (defparameter *csv-num* (read-csv "Data/example-num.csv" :type 'numeric)) ; NOTE(review): later forms read "Data/example-numeric.csv" -- confirm file name
 262   (defparameter *csv-mix* (read-csv "Data/example-mixed.csv" :type 'data))
 263
 264   ;; The handling of these types should be comparable to what we do for
 265   ;; matrices, but without the numerical processing.  i.e. mref, bind2,
 266   ;; make-dataframe, and the class structure should be similar.
 267
 268   ;; With numerical data, there should be a straightforward mapping from
 269   ;; the data.frame to a matrix.   With categorical data (including
 270   ;; dense categories such as doc-strings, as well as sparse categories
 271   ;; such as binary data), we need to include metadata about ordering,
 272   ;; coding, and such.  So the structures should probably consider
 273
 274   ;; Using the CSV file:
 275
 276   (asdf:oos 'asdf:compile-op 'csv :force t)
 277   (asdf:oos 'asdf:load-op 'parse-number)
 278   (asdf:oos 'asdf:load-op 'csv)
 279   (fare-csv:read-csv-file "Data/example-numeric.csv")
 280
 281   ;; but I think the cl-csv package is broken, need to use the dsv-style
 282   ;; package.
 283
 284   ;; now we've got the DSV code in the codebase, auto-loaded I hope:
 285   cybertiggyr-dsv:*field-separator*
 286   (defparameter *example-numeric.csv*
 287     (cybertiggyr-dsv:load-escaped "Data/example-numeric.csv"
 288                                   :field-separator #\,))
 289   *example-numeric.csv*
 290
 291   ;; the following fails because we've got a bit of string conversion
 292   ;; to do.   2 thoughts: #1 modify dsv package, but mucking with
 293   ;; encapsulation.  #2 add a coercion tool (better, but potentially
 294   ;; inefficient).
 295   #+nil(coerce  (nth 3 (nth 3 *example-numeric.csv*)) 'double-float)
 296
 297   ;; string-to-number test cases, from simple to not so (leading/trailing spaces)
 298   (defparameter *test-string1* "1.2")
 299   (defparameter *test-string2* " 1.2")
 300   (defparameter *test-string3* " 1.2 ")
 301   )
 302
 303
 304 #+nil ; the PROGN below is skipped by the reader unless :NIL is in *FEATURES*
 305 (progn ;; experiments with GSL and the Lisp interface.
 306   (asdf:oos 'asdf:load-op 'gsll)
 307   (asdf:oos 'asdf:load-op 'gsll-tests)
 308
 309   ;; the following should be equivalent; DEFPARAMETER (as elsewhere in this file) avoids SETF of an undeclared global
 310   (defparameter *t1*  (LIST 6.18d0 6.647777777777779d0 6.18d0))
 311   (defparameter *t2*  (MULTIPLE-VALUE-LIST
 312                        (LET ((VEC
 313                               (gsll:make-marray 'DOUBLE-FLOAT
 314                                                 :INITIAL-CONTENTS '(-3.21d0 1.0d0 12.8d0)))
 315                              (WEIGHTS
 316                               (gsll:MAKE-MARRAY 'DOUBLE-FLOAT
 317                                                 :INITIAL-CONTENTS '(3.0d0 1.0d0 2.0d0))))
 318                          (LET ((MEAN (gsll:MEAN VEC)))
 319                            (LIST (gsll:ABSOLUTE-DEVIATION VEC)
 320                                  (gsll:WEIGHTED-ABSOLUTE-DEVIATION VEC WEIGHTS)
 321                                  (gsll:ABSOLUTE-DEVIATION VEC MEAN))))))
 322   (equal *t1* (first *t2*)) ; EQL is never true for two distinct lists; *t2* holds the LET's single value wrapped in a list
 323
 324   ;; from (gsll:examples 'gsll::numerical-integration) ...
 325   (gsll:integration-qng gsll::one-sine 0.0d0 PI)
 326
 327   (gsll:defun-single axpb (x) (+ (* 2 x) 3)) ;; a<-2, b<-3
 328   (gsll:integration-qng axpb 1d0 2d0)
 329
 330   (let ((a 2)
 331         (b 3))
 332     (gsll:defun-single axpb2 (x) (+ (* a x) b))) ; package-qualified, matching the axpb definition above
 333   (gsll:integration-qng axpb2 1d0 2d0)
 334
 335   ;;   BAD
 336   ;;   (gsll:integration-qng
 337   ;;    (let ((a 2)
 338   ;;     (b 3))
 339   ;;      (defun-single axpb2 (x) (+ (* a x) b)))
 340   ;;    1d0 2d0)
 341
 342   ;; right, but weird expansion...
 343   (gsll:integration-qng
 344    (let ((a 2)
 345          (b 3))
 346      (defun axpb2 (x) (+ (* a x) b))
 347      (gsll:def-single-function axpb2)
 348      axpb2)
 349    1d0 2d0)
 350
 351   ;; Linear least squares
 352
 353   (gsll:gsl-lookup "gsl_linalg_LU_decomp") ; => gsll:lu-decomposition
 354   (gsll:gsl-lookup "gsl_linalg_LU_solve") ; => gsll:lu-solve
 355   )
 356
 357
 358
 359 #+nil ; the PROGN below is skipped by the reader unless :NIL is in *FEATURES*
 360 (progn ;; philosophy time
 361
 362   (setf my-model (model :name "ex1"
 363                         :data-slots (list x y z)
 364                         :param-slots (list alpha beta gamma)
 365                         :math-form (regression-model :formula '(= y (+ (* beta x)
 366                                                                      (* alpha y)
 367                                                                      (* gamma z)
 368                                                                      normal-error)))))
 369   (setf my-dataset (statistical-table :table data-frame-contents
 370                                       :metadata (list (:case-names (list ))
 371                                                       (:var-names (list ))
 372                                                       (:documentation "string of doc"))))
 373
 374   (setf my-analysis (analysis
 375                      :model my-model
 376                      :data my-dataset
 377                      :parameter-map (pairing (model-param-slots my-model)
 378                                              (data-var-names my-dataset))))
 379
 380   ;; ontological implications -- the analysis is an abstract class of
 381   ;; data, model, and mapping between the model and data.  The fit is
 382   ;; the instantiation of such.  This provides a statistical object
 383   ;; computation theory which can be realized as "executable
 384   ;; statistics" or "computable statistics".
 385   (setf my-analysis (analyze my-fit
 386                              :estimation-method 'linear-least-squares-regression))
 387
 388   ;; one of the tricks here is that one needs to provide the structure
 389   ;; from which to consider estimation, and more importantly, the
 390   ;; validity of the estimation.
 391
 392   ;;
 393   (setf linear-least-squares-regression
 394         (estimation-method-definition
 395          :variable-defintions ((list ; NOTE(review): "defintions" looks like a typo for "definitions" -- confirm
 396                                 ;; from MachLearn: supervised,
 397                                 ;; unsupervised
 398                                 :data-response-vars list-drv ; nil if unsup
 399                                 ;;
 400                                 :param-vars list-pv
 401                                      :data-predictor-vars list-dpv
 402                                      ;; nil in this case.  these
 403                                      ;; describe "out-of-box" specs
 404                                      :hyper-vars list-hv))
 405          :form '(regression-additive-error
 406                  :central-form (linear-form drv pv dpv)
 407                  :error-form 'normal-error)
 408          :resulting-decision '(point-estimation interval-estimation)
 409          :philosophy 'frequentist
 410          :documentation "use least squares to fit a linear regression
 411                          model to data."))
 412
 413   (defparameter *statistical-philosophies*
 414     '(frequentist bayesian fiducial decision-analysis)
 415     "can be combined to build decision-making approaches and
 416     characterizations")
 417
 418   (defparameter *decisions*
 419     '(estimation selection testing)
 420     "possible results from a...")
 421   ;; is this really true?  One can embed hypothesis testing within
 422   ;; estimation, as the hypothesis estimated to select.  And
 423   ;; categorical/continuous rear their ugly heads, but not really in
 424   ;; an essential way.
 425
 426   (defparameter *ontology-of-decision-procedures*
 427     (list :decisions
 428           (list :estimation
 429                 (list :point
 430                       (list :maximum-likelihood
 431                             :minimum-entropy
 432                             :least-squares
 433                             :method-of-moments)
 434                       :interval
 435                       (list :maximum-likelihood
 436                             :)) ; NOTE(review): the bare ":" looks like an unfinished placeholder -- confirm
 437                 :testing
 438                 (list :fisherian
 439                       :neyman-pearson
 440                       (list :traditional
 441                             :bioequivalence-inversion)
 442                       :selection
 443                       (list :ranking
 444                             :top-k-of-n-select))
 445                 :parametric
 446                 :partially-parametric))
 447     "start of ontology"))
 448
 449
 450 ;;;; LM
 451
 452 (progn ;; LM: least-squares fits via GELSY and LM on small test matrices
 453
 454   (defparameter *y*
 455     (make-vector
 456      8
 457      :type :row
 458      :initial-contents '((1d0 2d0 3d0 4d0 5d0 6d0 7d0 8d0))))
 459
 460
 461   (defparameter *xv+1*
 462     (make-matrix
 463      8 2
 464      :initial-contents '((1d0 1d0)
 465                          (1d0 3d0)
 466                          (1d0 2d0)
 467                          (1d0 4d0)
 468                          (1d0 3d0)
 469                          (1d0 5d0)
 470                          (1d0 4d0)
 471                          (1d0 6d0))))
 472
 473   (defparameter *xv+1a*
 474     (make-matrix
 475      8 2
 476      :initial-contents #2A((1d0 1d0)
 477                            (1d0 3d0)
 478                            (1d0 2d0)
 479                            (1d0 4d0)
 480                            (1d0 3d0)
 481                            (1d0 5d0)
 482                            (1d0 4d0)
 483                            (1d0 6d0))))
 484
 485   (defparameter *xv+1b*
 486     (bind2
 487      (ones 8 1)
 488      (make-matrix
 489       8 1
 490       :initial-contents '((1d0)
 491                           (3d0)
 492                           (2d0)
 493                           (4d0)
 494                           (3d0)
 495                           (5d0)
 496                           (4d0)
 497                           (6d0)))
 498      :by :column))
 499
 500   (m= *xv+1a* *xv+1b*) ; => T
 501
 502   ;; so something like (NOTE: matrices are transposed to begin with, hence the incongruity)
 503   (defparameter *xtx-2* (m* (transpose *xv+1*) *xv+1*))
 504   ;; #<LA-SIMPLE-MATRIX-DOUBLE  2 x 2
 505   ;;  8.0d0 28.0d0
 506   ;;  28.0d0 116.0d0>
 507
 508   (defparameter *xty-2* (m* (transpose *xv+1*)  (transpose *y*)))
 509   ;; #<LA-SIMPLE-VECTOR-DOUBLE (2 x 1)
 510   ;;  36.0d0
 511   ;;  150.0d0>
 512
 513   (defparameter *rcond-2* 0.000001)
 514   (defparameter *betahat-2*  (gelsy *xtx-2* *xty-2* *rcond-2*))
 515   ;; *xtx-2* => "details of complete orthogonal factorization"
 516   ;; according to man page:
 517   ;; #<LA-SIMPLE-MATRIX-DOUBLE  2 x 2
 518   ;;  -119.33147112141039d0 -29.095426104883202d0
 519   ;;  0.7873402682880205d0 -1.20672274167718d0>
 520
 521   ;; *xty-2* => output becomes solution:
 522   ;; #<LA-SIMPLE-VECTOR-DOUBLE (2 x 1)
 523   ;;  -0.16666666666668312d0
 524   ;;  1.333333333333337d0>
 525
 526   *betahat-2* ; which matches R, see below
 527
 528   (documentation 'gelsy 'function)
 529
 530
 531 ;;   (#<LA-SIMPLE-VECTOR-DOUBLE (2 x 1)
 532 ;;    -0.16666666666668312 1.333333333333337>
 533 ;;    2)
 534
 535 ;;   ## Test case in R:
 536 ;;   x <- c( 1.0, 3.0, 2.0, 4.0, 3.0, 5.0, 4.0, 6.0)
 537 ;;   y <- c( 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0)
 538 ;;   lm(y~x)
 539 ;;   ## => Call:  lm(formula = y ~ x)
 540
 541 ;;   Coefficients:  (Intercept)            x
 542 ;;                      -0.1667       1.3333
 543
 544 ;;   summary(lm(y~x))
 545 ;;   ## =>
 546
 547 ;;   Call:
 548 ;;   lm(formula = y ~ x)
 549
 550 ;;   Residuals:
 551 ;;          Min         1Q     Median         3Q        Max
 552 ;;   -1.833e+00 -6.667e-01 -3.886e-16  6.667e-01  1.833e+00
 553
 554 ;;   Coefficients:
 555 ;;               Estimate Std. Error t value Pr(>|t|)
 556 ;;   (Intercept)  -0.1667     1.1587  -0.144  0.89034
 557 ;;   x             1.3333     0.3043   4.382  0.00466 **
 558 ;;   ---
 559 ;;   Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
 560
 561 ;;   Residual standard error: 1.291 on 6 degrees of freedom
 562 ;;   Multiple R-squared: 0.7619,        Adjusted R-squared: 0.7222
 563 ;;   F-statistic:  19.2 on 1 and 6 DF,  p-value: 0.004659
 564
 565
 566
 567   ;; which suggests one might do the following (modulo ensuring correct
 568   ;; orientations).  When this is finalized, it should migrate to
 569   ;; CLS.
 570   ;;
 571
 572
 573   (defparameter *n* 20) ; # rows = # obsns
 574   (defparameter *p* 10) ; # cols = # vars
 575   (defparameter *x-temp*  (rand *n* *p*))
 576   (defparameter *b-temp*  (rand *p* 1))
 577   (defparameter *y-temp*  (m* *x-temp* *b-temp*))
 578   ;; so Y = Xb here (no \eps noise term is actually added)
 579   (defparameter *rcond* (* (coerce (expt 2 -52) 'double-float)
 580                    (max (nrows *x-temp*) (ncols *y-temp*)))) ; 2^-52 times max(rows of X, cols of Y)
 581   (defparameter *orig-x* (copy *x-temp*))
 582   (defparameter *orig-b* (copy *b-temp*))
 583   (defparameter *orig-y* (copy *y-temp*))
 584
 585   (defparameter *lm-result* (lm *x-temp* *y-temp*)) ; NOTE(review): *rcond* above is not passed here -- confirm intent
 586   (princ (first *lm-result*))
 587   (princ (second *lm-result*))
 588   (princ (third *lm-result*))
 589   (v= (third *lm-result*)
 590       (v- (first (first *lm-result*))
 591           (first  (second *lm-result*)))))