TODO.lisp

   1 ;;; -*- mode: lisp -*-
   2
   3 ;;; Time-stamp: <2009-04-17 18:25:00 tony>
   4 ;;; Creation:   <2008-09-08 08:06:30 tony>
   5 ;;; File:       TODO.lisp
   6 ;;; Author:     AJ Rossini <blindglobe@gmail.com>
   7 ;;; Copyright:  (c) 2007-2008, AJ Rossini <blindglobe@gmail.com>.  BSD.
   8 ;;; Purpose: Stuff that needs to be made working sits inside the
   9 ;;;          progns... This file contains the current challenges to
  10 ;;;          solve, including a description of the setup and the work
  11 ;;;          to solve....
  12
  13 ;;; What is this talk of 'release'? Klingons do not make software
  14 ;;; 'releases'.  Our software 'escapes', leaving a bloody trail of
  15 ;;; designers and quality assurance people in its wake.
  16
  17 ;;; SET UP
  18
  19 (in-package :cl-user)
  20 ;;(asdf:oos 'asdf:load-op 'lisp-matrix)
  21 ;;(asdf:oos 'asdf:compile-op 'lispstat)
  22 ;;(asdf:oos 'asdf:load-op 'lispstat)
  23
  24 (in-package :lisp-stat-unittests)
  25
  26 ;; tests = 80, failures = 8, errors = 15
  27 (run-tests :suite 'lisp-stat-ut)
  28 (describe (run-tests :suite 'lisp-stat-ut))
  29
  30 ;; FIXME: Example: currently not relevant, yet
  31 ;;   (describe (lift::run-test :test-case  'lisp-stat-unittests::create-proto
  32 ;;                             :suite 'lisp-stat-unittests::lisp-stat-ut-proto))
  33
  34   (describe (lift::run-tests :suite 'lisp-stat-ut-dataframe))
  35   (lift::run-tests :suite 'lisp-stat-ut-dataframe)
  36
  37   (describe
  38     (lift::run-test
  39       :test-case  'lisp-stat-unittests::create-proto
  40       :suite 'lisp-stat-unittests::lisp-stat-ut-proto))
  41
  42 (describe 'lisp-stat-ut)
  43
  44 (in-package :ls-user)
  45
  46 (progn ;; FIXME: Regression modeling (some data future-ish).
  47
  48   ;; TODO:
  49   ;; - confirm estimates for multivariate case,
  50   ;; - pretty-print output
  51   ;; - fix up API -- what do we want this to look like?
  52
  53   (defparameter *m*
  54     (regression-model (list->vector-like iron) ;; BROKEN
  55                       (list->vector-like absorbtion))
  56     "holding variable.")
  57
  58   (defparameter *m-fit*
  59     (fit-model *m*))
  60
  61   (princ *m*)
  62   (princ *m-fit*)
  63
  64   (estimates *m-fit*)
  65   (covariance-matrix *m-fit*)
  66
  67   (defparameter *m3*
  68     (regression-model (transpose  (listoflist->matrix-like (list iron aluminum)
  69                                                :orientation :row-major))
  70                       (list->vector-like  absorbtion) ))
  71   (princ *m3*)
  72   (defparameter *m3-fit*
  73     (fit-model *m3*))
  74 #|
  75   ;; Should the above look something like:
  76   (defparameter *m3-fit*
  77                 (spec-and-fit-model '(absorbtion = iron aluminum)))
  78   ;; in which case we split the list before/after the "=" character.
  79 |#
  80
  81   (estimates *m3-fit*)
  82   (covariance-matrix *m3-fit*))
  83
  84
  85 #+nil
  86 (progn ;; FIXME: Need to clean up data examples, licenses, attributions, etc.
  87   ;; The following breaks because we should use a package to hold
  88   ;; configuration details, and this would be the only package outside
  89   ;; of packages.lisp, as it holds the overall defsystem structure.
  90   (load-data "iris.lsp")  ;; (the above partially fixed).
  91   (variables)
  92   diabetes )
  93
  94
  95
  96 (progn ;; Importing data from DSV text files.
  97
  98   (defparameter *my-df-2*
  99         (make-instance 'dataframe-array
 100                        :storage
 101                        (listoflist->array
 102                         (cybertiggyr-dsv::load-escaped
 103                          "/media/disk/Desktop/sandbox/CLS.git/Data/example-mixed.csv"))
 104                        :doc "This is an interesting dataframe-array"))
 105 #|                     :case-labels (list "x" "y")
 106                        :var-labels (list "a" "b" "c" "d" "e")
 107 |#
 108
 109   ;; a better approach is:
 110   (asdf:oos 'asdf:load-op 'rsm-string)
 111   (rsm.string:file->string-table
 112    "/media/disk/Desktop/sandbox/CLS.git/Data/example-mixed.csv")
 113
 114   (rsm.string:file->number-table
 115    "/media/disk/Desktop/sandbox/CLS.git/Data/example-numeric.csv")
 116
 117   (defparameter *my-df-2*
 118         (make-instance 'dataframe-array
 119                        :storage
 120                        (listoflist->array
 121                         (transpose-listoflist
 122                          (rsm.string:file->string-table
 123                           "/media/disk/Desktop/sandbox/CLS.git/Data/example-mixed.csv")))
 124                        :doc "This is an interesting dataframe-array"))
 125   *my-df-2*
 126
 127   (defparameter *my-df-3*
 128         (make-instance 'dataframe-array
 129                        :storage
 130                        (listoflist->array
 131                         (transpose-listoflist
 132                          (rsm.string:file->number-table
 133                           "/media/disk/Desktop/sandbox/CLS.git/Data/example-numeric.csv")))
 134                        :doc "This is an interesting dataframe-array"))
 135   *my-df-3*
 136
 137   )
 138
 139
 140
 141
 142 (progn
 143   ;; Plotting
 144   (asdf:oos 'asdf:load-op 'cl-plplot)
 145
 146   (plot-ex)
 147   (defparameter *gdev* "xcairo")
 148   ;; (defparameter *gdev* "xwin")
 149   (cl-plplot::plsdev *gdev*)
 150
 151   (contour-plot-ex)
 152   (fn-contour-plot-ex)
 153   (shade-plot-ex)
 154   (3D-plot-ex)
 155
 156   )
 157
 158
 159 (in-package :ls-user)
 160
 161
 162 (progn
 163   ;; REVIEW: general Lisp use guidance
 164
 165   (fdefinition 'make-matrix)
 166   (documentation 'make-matrix 'function)
 167
 168 #| Examples from CLHS, a bit of guidance.
 169
 170   ;; This function assumes its callers have checked the types of the
 171   ;; arguments, and authorizes the compiler to build in that assumption.
 172   (defun discriminant (a b c)
 173    (declare (number a b c))
 174    "Compute the discriminant for a quadratic equation."
 175    (- (* b b) (* 4 a c))) =>  DISCRIMINANT
 176   (discriminant 1 2/3 -2) =>  76/9
 177
 178   ;; This function assumes its callers have not checked the types of the
 179   ;; arguments, and performs explicit type checks before making any assumptions.
 180  (defun careful-discriminant (a b c)
 181    "Compute the discriminant for a quadratic equation."
 182    (check-type a number)
 183    (check-type b number)
 184    (check-type c number)
 185    (locally (declare (number a b c))
 186      (- (* b b) (* 4 a c)))) =>  CAREFUL-DISCRIMINANT
 187  (careful-discriminant 1 2/3 -2) =>  76/9
 188 |#
 189   )
 190
 191
 192 #+nil
 193 (progn ;; experiments with GSL and the Lisp interface.
 194   (asdf:oos 'asdf:load-op 'gsll)
 195   (asdf:oos 'asdf:load-op 'gsll-tests)
 196
 197   ;; the following should be equivalent
 198   (setf *t1*  (LIST 6.18d0 6.647777777777779d0 6.18d0))
 199   (setf *t2*  (MULTIPLE-VALUE-LIST
 200                (LET ((VEC
 201                       (gsll:make-marray 'DOUBLE-FLOAT
 202                                         :INITIAL-CONTENTS '(-3.21d0 1.0d0 12.8d0)))
 203                      (WEIGHTS
 204                       (gsll:MAKE-MARRAY 'DOUBLE-FLOAT
 205                                         :INITIAL-CONTENTS '(3.0d0 1.0d0 2.0d0))))
 206                  (LET ((MEAN (gsll:MEAN VEC)))
 207                    (LIST (gsll:ABSOLUTE-DEVIATION VEC)
 208                          (gsll:WEIGHTED-ABSOLUTE-DEVIATION VEC WEIGHTS)
 209                          (gsll:ABSOLUTE-DEVIATION VEC MEAN))))))
 210   (eql *t1* *t2*)
 211
 212   ;; from (gsll:examples 'gsll::numerical-integration) ...
 213   (gsll:integration-qng gsll::one-sine 0.0d0 PI)
 214
 215   (gsll:defun-single axpb (x) (+ (* 2 x) 3)) ;; a<-2, b<-3
 216   (gsll:integration-qng axpb 1d0 2d0)
 217
 218   (let ((a 2)
 219         (b 3))
 220     (defun-single axpb2 (x) (+ (* a x) b)))
 221   (gsll:integration-qng axpb2 1d0 2d0)
 222
 223   ;;   BAD
 224   ;;   (gsll:integration-qng
 225   ;;    (let ((a 2)
 226   ;;     (b 3))
 227   ;;      (defun-single axpb2 (x) (+ (* a x) b)))
 228   ;;    1d0 2d0)
 229
 230   ;; right, but weird expansion...
 231   (gsll:integration-qng
 232    (let ((a 2)
 233          (b 3))
 234      (defun axpb2 (x) (+ (* a x) b))
 235      (gsll:def-single-function axpb2)
 236      axpb2)
 237    1d0 2d0)
 238
 239   ;; Linear least squares
 240
 241   (gsll:gsl-lookup "gsl_linalg_LU_decomp") ; => gsll:lu-decomposition
 242   (gsll:gsl-lookup "gsl_linalg_LU_solve") ; => gsll:lu-solve
 243   )
 244
 245
 246
 247 #+nil
 248 (progn ;; philosophy time
 249
 250   (setf my-model (model :name "ex1"
 251                         :data-slots (list w x y z)
 252                         :param-slots (list alpha beta gamma)
 253                         :math-form (regression-model :formula '(= w (+ (* beta x)
 254                                                                      (* alpha y)
 255                                                                      (* gamma z)
 256                                                                      normal-error))
 257                                                      :centrality 'median ; 'mean
 258                                                      )))
 259
 260 #| or:
 261   #R"W ~ x+  y + z "
 262 |#
 263
 264   (setf my-dataset (statistical-table :table data-frame-contents
 265                                       :metadata (list (:case-names (list ))
 266                                                       (:var-names (list ))
 267                                                       (:documentation "string of doc"))))
 268
 269   (setf my-analysis (analysis
 270                      :model my-model
 271                      :data my-dataset
 272                      :parameter-map (pairing (model-param-slots my-model)
 273                                              (data-var-names my-dataset))))
 274
 275   ;; ontological implications -- the analysis is an abstract class of
 276   ;; data, model, and mapping between the model and data.  The fit is
 277   ;; the instantiation of such.  This provides a statistical object
 278   ;; computation theory which can be realized as "executable
 279   ;; statistics" or "computable statistics".
 280   (setf my-analysis (analyze my-fit
 281                              :estimation-method 'linear-least-squares-regression))
 282
 283   ;; one of the tricks here is that one needs to provide the structure
 284   ;; from which to consider estimation, and more importantly, the
 285   ;; validity of the estimation.
 286
 287   ;;
 288   (setf linear-least-squares-regression
 289         (estimation-method-definition
 290          :variable-defintions ((list
 291                                 ;; from MachLearn: supervised,
 292                                 ;; unsupervised
 293                                 :data-response-vars list-drv ; nil if unsup
 294                                 ;;
 295                                 :param-vars list-pv
 296                                      :data-predictor-vars list-dpv
 297                                      ;; nil in this case.  these
 298                                      ;; describe "out-of-box" specs
 299                                      :hyper-vars list-hv))
 300          :form '(regression-additive-error
 301                  :central-form (linear-form drv pv dpv)
 302                  :error-form 'normal-error)
 303          :resulting-decision '(point-estimation interval-estimation)
 304          :philosophy 'frequentist
 305          :documentation "use least squares to fit a linear regression
 306                          model to data."))
 307
 308   (defparameter *statistical-philosophies*
 309     '(frequentist bayesian fiducial decision-analysis)
 310     "can be combined to build decision-making approaches and
 311     characterizations")
 312
 313   (defparameter *decisions*
 314     '(estimation selection testing)
 315     "possible results from a...")
 316   ;; is this really true?  One can embedded hypothesis testing within
 317   ;; estimation, as the hypothesis estimated to select.  And
 318   ;; categorical/continuous rear their ugly heads, but not really in
 319   ;; an essential way.
 320
 321   (defparameter *ontology-of-decision-procedures*
 322     (list :decisions
 323           (list :estimation
 324                 (list :point
 325                       (list :maximum-likelihood
 326                             :minimum-entropy
 327                             :least-squares
 328                             :method-of-moments)
 329                       :interval
 330                       (list :maximum-likelihood
 331                             :))
 332                 :testing
 333                 (list :fisherian
 334                       :neyman-pearson
 335                       (list :traditional
 336                             :bioequivalence-inversion)
 337                       :selection
 338                       (list :ranking
 339                             :top-k-of-n-select))
 340                 :parametric
 341                 :partially-parametric))
 342     "start of ontology"))
 343
 344
 345 ;;;; LM
 346
 347 (progn
 348
 349   (defparameter *y*
 350     (make-vector
 351      8
 352      :type :row
 353      :initial-contents '((1d0 2d0 3d0 4d0 5d0 6d0 7d0 8d0))))
 354
 355
 356   (defparameter *xv+1*
 357     (make-matrix
 358      8 2
 359      :initial-contents '((1d0 1d0)
 360                          (1d0 3d0)
 361                          (1d0 2d0)
 362                          (1d0 4d0)
 363                          (1d0 3d0)
 364                          (1d0 5d0)
 365                          (1d0 4d0)
 366                          (1d0 6d0))))
 367
 368
 369   ;; so something like (NOTE: matrices are transposed to begin with, hence the incongruety)
 370   (defparameter *xtx-2* (m* (transpose *xv+1*) *xv+1*))
 371   ;; #<LA-SIMPLE-MATRIX-DOUBLE  2 x 2
 372   ;;  8.0d0 28.0d0
 373   ;;  28.0d0 116.0d0>
 374
 375   (defparameter *xty-2* (m* (transpose *xv+1*)  (transpose *y*)))
 376   ;; #<LA-SIMPLE-VECTOR-DOUBLE (2 x 1)
 377   ;;  36.0d0
 378   ;;  150.0d0>
 379
 380   (defparameter *rcond-2* 0.000001)
 381   (defparameter *betahat-2*  (gelsy *xtx-2* *xty-2* *rcond-2*))
 382   ;; *xtx-2* => "details of complete orthogonal factorization"
 383   ;; according to man page:
 384   ;; #<LA-SIMPLE-MATRIX-DOUBLE  2 x 2
 385   ;;  -119.33147112141039d0 -29.095426104883202d0
 386   ;;  0.7873402682880205d0 -1.20672274167718d0>
 387
 388   ;; *xty-2* => output becomes solution:
 389   ;; #<LA-SIMPLE-VECTOR-DOUBLE (2 x 1)
 390   ;;  -0.16666666666668312d0
 391   ;;  1.333333333333337d0>
 392
 393   *betahat-2* ; which matches R, see below
 394
 395   (documentation 'gelsy 'function)
 396
 397
 398 ;;   (#<LA-SIMPLE-VECTOR-DOUBLE (2 x 1)
 399 ;;    -0.16666666666668312 1.333333333333337>
 400 ;;    2)
 401
 402 ;;   ## Test case in R:
 403 ;;   x <- c( 1.0, 3.0, 2.0, 4.0, 3.0, 5.0, 4.0, 6.0)
 404 ;;   y <- c( 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0)
 405 ;;   lm(y~x)
 406 ;;   ## => Call:  lm(formula = y ~ x)
 407
 408 ;;   Coefficients:  (Intercept)            x
 409 ;;                      -0.1667       1.3333
 410
 411 ;;   summary(lm(y~x))
 412 ;;   ## =>
 413
 414 ;;   Call:
 415 ;;   lm(formula = y ~ x)
 416
 417 ;;   Residuals:
 418 ;;          Min         1Q     Median         3Q        Max
 419 ;;   -1.833e+00 -6.667e-01 -3.886e-16  6.667e-01  1.833e+00
 420
 421 ;;   Coefficients:
 422 ;;               Estimate Std. Error t value Pr(>|t|)
 423 ;;   (Intercept)  -0.1667     1.1587  -0.144  0.89034
 424 ;;   x             1.3333     0.3043   4.382  0.00466 **
 425 ;;   ---
 426 ;;   Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
 427
 428 ;;   Residual standard error: 1.291 on 6 degrees of freedom
 429 ;;   Multiple R-squared: 0.7619,        Adjusted R-squared: 0.7222
 430 ;;   F-statistic:  19.2 on 1 and 6 DF,  p-value: 0.004659
 431
 432
 433
 434   ;; which suggests one might do (modulo ensuring correct
 435   ;; orientations).  When this is finalized, it should migrate to
 436   ;; CLS.
 437   ;;
 438
 439
 440   (defparameter *n* 20) ; # rows = # obsns
 441   (defparameter *p* 10) ; # cols = # vars
 442   (defparameter *x-temp*  (rand *n* *p*))
 443   (defparameter *b-temp*  (rand *p* 1))
 444   (defparameter *y-temp*  (m* *x-temp* *b-temp*))
 445   ;; so Y=Xb + \eps
 446   (defparameter *rcond* (* (coerce (expt 2 -52) 'double-float)
 447                    (max (nrows *x-temp*) (ncols *y-temp*))))
 448   (defparameter *orig-x* (copy *x-temp*))
 449   (defparameter *orig-b* (copy *b-temp*))
 450   (defparameter *orig-y* (copy *y-temp*))
 451
 452   (defparameter *lm-result* (lm *x-temp* *y-temp*))
 453   (princ (first *lm-result*))
 454   (princ (second *lm-result*))
 455   (princ (third *lm-result*))
 456   (v= (third *lm-result*)
 457       (v- (first (first *lm-result*))
 458           (first  (second *lm-result*))))
 459
 460
 461
 462
 463   ;; Some issues exist in the LAPACK vs. LINPACK variants, hence R
 464   ;; uses LINPACK primarily, rather than LAPACK.  See comments in R
 465   ;; source for issues.
 466
 467
 468   ;; Goal is to start from X, Y and then realize that if
 469   ;; Y = X \beta, then,   i.e. 8x1 = 8xp px1  + 8x1
 470   ;;      XtX \hat\beta = Xt Y
 471   ;; so that we can solve the equation  W \beta = Z   where W and Z
 472   ;; are known, to estimate \beta.
 473
 474   ;; the above is known to be numerically instable -- some processing
 475   ;; of X is preferred and should be done prior.  And most of the
 476   ;; transformation-based work does precisely that.
 477
 478   ;; recall:  Var[Y] = E[(Y - E[Y])(Y-E[Y])t]
 479   ;;   = E[Y Yt] - 2 \mu \mut + \mu \mut
 480   ;;   = E[Y Yt] - \mu \mut
 481
 482   ;; Var Y = E[Y^2] - \mu^2
 483
 484
 485   ;; For initial estimates of covariance of \hat\beta:
 486
 487   ;; \hat\beta = (Xt X)^-1 Xt Y
 488   ;; with E[ \hat\beta ]
 489   ;;        = E[ (Xt X)^-1 Xt Y ]
 490   ;;        = E[(Xt X)^-1 Xt (X\beta)]
 491   ;;        = \beta
 492   ;;
 493   ;; So Var[\hat\beta] = ...
 494   ;;     (Xt X)
 495   ;; and this gives SE(\beta_i) = (* (sqrt (mref Var i i)) adjustment)
 496
 497
 498   ;; from docs:
 499
 500   (setf *temp-result*
 501         (let ((*default-implementation* :foreign-array))
 502           (let* ((m 10)
 503                  (n 10)
 504                  (a (rand m n))
 505                  (x (rand n 1))
 506                  (b (m* a x))
 507                  (rcond (* (coerce (expt 2 -52) 'double-float)
 508                            (max (nrows a) (ncols a))))
 509                  (orig-a (copy a))
 510                  (orig-b (copy b))
 511                  (orig-x (copy x)))
 512             (list x (gelsy a b rcond))
 513             ;; no applicable conversion?
 514             ;; (m-   (#<FA-SIMPLE-VECTOR-DOUBLE (10 x 1))
 515             ;;       (#<FA-SIMPLE-VECTOR-DOUBLE (10 x 1)) )
 516             (v- x (first (gelsy a b rcond))))))
 517
 518
 519   (princ *temp-result*)
 520
 521   (setf *temp-result*
 522         (let ((*default-implementation* :lisp-array))
 523           (let* ((m 10)
 524                  (n 10)
 525                  (a (rand m n))
 526                  (x (rand n 1))
 527                  (b (m* a x))
 528                  (rcond (* (coerce (expt 2 -52) 'double-float)
 529                            (max (nrows a) (ncols a))))
 530                  (orig-a (copy a))
 531                  (orig-b (copy b))
 532                  (orig-x (copy x)))
 533             (list x (gelsy a b rcond))
 534             (m- x (first  (gelsy a b rcond)))
 535             )))
 536   (princ *temp-result*)
 537
 538
 539   (defparameter *xv*
 540     (make-vector
 541      8
 542      :type :row ;; default, not usually needed!
 543      :initial-contents '((1d0 3d0 2d0 4d0 3d0 5d0 4d0 6d0))))
 544
 545   (defparameter *y*
 546     (make-vector
 547      8
 548      :type :row
 549      :initial-contents '((1d0 2d0 3d0 4d0 5d0 6d0 7d0 8d0))))
 550
 551   ;; so something like (NOTE: matrices are transposed to begin with, hence the incongruety)
 552   (defparameter *xtx-1* (m* *xv* (transpose *xv*)))
 553   (defparameter *xty-1* (m* *xv* (transpose  *y*)))
 554   (defparameter *rcond-in* (* (coerce (expt 2 -52) 'double-float)
 555                               (max (nrows *xtx-1*)
 556                                    (ncols *xty-1*))))
 557
 558   (defparameter *betahat*  (gelsy *xtx-1* *xty-1* *rcond-in*))
 559
 560   ;;  (#<LA-SIMPLE-VECTOR-DOUBLE (1 x 1)
 561   ;;  1.293103448275862>
 562   ;;  1)
 563
 564   ;;   ## Test case in R:
 565   ;;   x <- c( 1.0, 3.0, 2.0, 4.0, 3.0, 5.0, 4.0, 6.0)
 566   ;;   y <- c( 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0)
 567   ;;   lm(y~x-1)
 568   ;;   ## =>
 569   ;;   Call:
 570   ;;   lm(formula = y ~ x - 1)
 571
 572   ;;   Coefficients:
 573   ;;       x
 574   ;;   1.293
 575
 576   (first  *betahat*))
 577
 578
 579
 580 #|
 581   (type-of #2A((1 2 3 4 5)
 582                (10 20 30 40 50)))
 583
 584   (type-of (rand 10 20))
 585
 586   (typep #2A((1 2 3 4 5)
 587              (10 20 30 40 50))
 588          'matrix-like)
 589
 590   (typep (rand 10 20) 'matrix-like)
 591
 592   (typep #2A((1 2 3 4 5)
 593              (10 20 30 40 50))
 594          'array)
 595
 596   (typep (rand 10 20) 'array)
 597 |#