TODO.lisp

   1 ;;; -*- mode: lisp -*-
   2
   3 ;;; Time-stamp: <2009-04-20 19:01:23 tony>
   4 ;;; Creation:   <2008-09-08 08:06:30 tony>
   5 ;;; File:       TODO.lisp
   6 ;;; Author:     AJ Rossini <blindglobe@gmail.com>
   7 ;;; Copyright:  (c) 2007-2008, AJ Rossini <blindglobe@gmail.com>.  BSD.
   8 ;;; Purpose: Stuff that needs to be made working sits inside the
   9 ;;;          progns... This file contains the current challenges to
  10 ;;;          solve, including a description of the setup and the work
  11 ;;;          to solve....
  12
  13 ;;; What is this talk of 'release'? Klingons do not make software
  14 ;;; 'releases'.  Our software 'escapes', leaving a bloody trail of
  15 ;;; designers and quality assurance people in its wake.
  16
  17 ;;; SET UP
  18
  19 (in-package :cl-user) ; the commented-out ASDF forms below load lisp-matrix and (re)build/load lispstat
  20 ;;(asdf:oos 'asdf:load-op 'lisp-matrix)
  21 ;;(asdf:oos 'asdf:compile-op 'lispstat :force t)
  22 ;;(asdf:oos 'asdf:load-op 'lispstat)
  23
  24 (in-package :lisp-stat-unittests)
  25
  26 ;; Last recorded result: tests = 80, failures = 8, errors = 15
  27 (run-tests :suite 'lisp-stat-ut)            ; run the LISP-STAT-UT suite
  28 (describe (run-tests :suite 'lisp-stat-ut)) ; run it again and DESCRIBE the returned result
  29
  30 ;; FIXME: Example (running a single test case): currently not relevant.
  31 ;;   (describe (lift::run-test :test-case  'lisp-stat-unittests::create-proto
  32 ;;                             :suite 'lisp-stat-unittests::lisp-stat-ut-proto))
  33
  34 (describe (lift::run-tests :suite 'lisp-stat-ut-dataframe)) ; dataframe suite only, result described
  35 (lift::run-tests :suite 'lisp-stat-ut-dataframe)            ; same suite, raw result
  36
  37 (describe                       ; live copy of the single-test-case example commented out above
  38  (lift::run-test
  39   :test-case  'lisp-stat-unittests::create-proto
  40   :suite 'lisp-stat-unittests::lisp-stat-ut-proto))
  41
  42 (describe 'lisp-stat-ut) ; DESCRIBE applied to the symbol itself; no tests are run by this form
  43
  44 (in-package :ls-user) ; the forms that follow are read in the LS-USER package
  45
  46 #+nil ; feature expression: the PROGN below is skipped by the reader unless :NIL is on *FEATURES*
  47 (progn ;; FIXME: Need to clean up data examples, licenses, attributions, etc.
  48   ;; The following breaks because we should use a package to hold
  49   ;; configuration details, and this would be the only package outside
  50   ;; of packages.lisp, as it holds the overall defsystem structure.
  51   (load-data "iris.lsp")  ;; (the problem described above is partially fixed).
  52   (variables)
  53   diabetes ) ; NOTE(review): DIABETES is not defined in this file; presumably provided by a loaded data set -- confirm
  54
  55
  56 (progn ;; Importing data from DSV (delimiter-separated values) text files.
  57   ;; First attempt: read the CSV with cybertiggyr-dsv and wrap it in a DATAFRAME-ARRAY.
  58   (defparameter *my-df-2*
  59         (make-instance 'dataframe-array
  60                        :storage
  61                        (listoflist->array
  62                         (cybertiggyr-dsv::load-escaped
  63                          "/media/disk/Desktop/sandbox/CLS.git/Data/example-mixed.csv"))
  64                        :doc "This is an interesting dataframe-array"))
  65 #|                     :case-labels (list "x" "y")
  66                        :var-labels (list "a" "b" "c" "d" "e")
  67 |#
  68   ;; (The block comment above holds :case-labels / :var-labels initargs that are left out of the call.)
  69   ;; A better approach is to read the file with rsm-string:
  70   (asdf:oos 'asdf:load-op 'rsm-string) ; NOTE: if this PROGN is read as one form, the RSM.STRING package must already exist; evaluate this form on its own first
  71   (rsm.string:file->string-table
  72    "/media/disk/Desktop/sandbox/CLS.git/Data/example-mixed.csv")
  73
  74   (rsm.string:file->number-table
  75    "/media/disk/Desktop/sandbox/CLS.git/Data/example-numeric.csv")
  76   ;; Redefinition of *MY-DF-2*, this time via rsm.string plus TRANSPOSE-LISTOFLIST.
  77   (defparameter *my-df-2*
  78         (make-instance 'dataframe-array
  79                        :storage
  80                        (listoflist->array
  81                         (transpose-listoflist
  82                          (rsm.string:file->string-table
  83                           "/media/disk/Desktop/sandbox/CLS.git/Data/example-mixed.csv")))
  84                        :doc "This is an interesting dataframe-array"))
  85   *my-df-2* ; evaluates the variable so its value can be inspected
  86   ;; Same construction for the all-numeric example file, read with FILE->NUMBER-TABLE.
  87   (defparameter *my-df-3*
  88         (make-instance 'dataframe-array
  89                        :storage
  90                        (listoflist->array
  91                         (transpose-listoflist
  92                          (rsm.string:file->number-table
  93                           "/media/disk/Desktop/sandbox/CLS.git/Data/example-numeric.csv")))
  94                        :doc "This is an interesting dataframe-array"))
  95   *my-df-3* ; evaluates the variable so its value can be inspected
  96
  97   ;; Need to do this using the make-dataframe example, or write a
  98   ;; dsvfile->dataframe command.
  99   )
 100
 101
 102
 103
 104 (progn
 105   ;; Plotting examples
 106   ;; (asdf:oos 'asdf:load-op 'cl-plplot) ; uncomment to load cl-plplot first
 107
 108   (plot-ex)
 109   (defparameter *gdev* "xcairo") ; device name passed to PLSDEV below
 110   ;; (defparameter *gdev* "xwin") ; alternative device name
 111   (cl-plplot::plsdev *gdev*)
 112
 113   ;; FIXME: there is currently a loose pointer floating around that
 114   ;; causes errors the 3rd time that we create a plot (and crashes SBCL
 115   ;; the 4th time).  This is independent of the order of the calls below.
 116   (contour-plot-ex)
 117   (fn-contour-plot-ex)
 118   (shade-plot-ex)
 119   (3D-plot-ex)
 120
 121   )
 122
 123
 124 (progn
 125   ;; REVIEW: general Lisp use guidance
 126   ;; Two standard ways to look up a function at run time:
 127   (fdefinition 'make-matrix)             ; the function definition currently named by MAKE-MATRIX
 128   (documentation 'make-matrix 'function) ; its documentation string, or NIL when there is none
 129
 130 #| Examples from CLHS, a bit of guidance.
 131
 132   ;; This function assumes its callers have checked the types of the
 133   ;; arguments, and authorizes the compiler to build in that assumption.
 134   (defun discriminant (a b c)
 135    (declare (number a b c))
 136    "Compute the discriminant for a quadratic equation."
 137    (- (* b b) (* 4 a c))) =>  DISCRIMINANT
 138   (discriminant 1 2/3 -2) =>  76/9
 139
 140   ;; This function assumes its callers have not checked the types of the
 141   ;; arguments, and performs explicit type checks before making any assumptions.
 142  (defun careful-discriminant (a b c)
 143    "Compute the discriminant for a quadratic equation."
 144    (check-type a number)
 145    (check-type b number)
 146    (check-type c number)
 147    (locally (declare (number a b c))
 148      (- (* b b) (* 4 a c)))) =>  CAREFUL-DISCRIMINANT
 149  (careful-discriminant 1 2/3 -2) =>  76/9
 150 |#
 151   )
 152
 153
 154 #+nil
 155 (progn ;; Experiments with GSL through the GSLL Lisp interface (disabled by #+nil).
 156   (asdf:oos 'asdf:load-op 'gsll)       ; NOTE: if this PROGN is enabled and read as one form, the GSLL
 157   (asdf:oos 'asdf:load-op 'gsll-tests) ; package must already exist for the gsll: symbols below to be read.
 158
 159   ;; the following should be equivalent
 160   (setf *t1*  (LIST 6.18d0 6.647777777777779d0 6.18d0))
 161   (setf *t2*  (MULTIPLE-VALUE-LIST
 162                (LET ((VEC
 163                       (gsll:make-marray 'DOUBLE-FLOAT
 164                                         :INITIAL-CONTENTS '(-3.21d0 1.0d0 12.8d0)))
 165                      (WEIGHTS
 166                       (gsll:MAKE-MARRAY 'DOUBLE-FLOAT
 167                                         :INITIAL-CONTENTS '(3.0d0 1.0d0 2.0d0))))
 168                  (LET ((MEAN (gsll:MEAN VEC)))
 169                    (LIST (gsll:ABSOLUTE-DEVIATION VEC)
 170                          (gsll:WEIGHTED-ABSOLUTE-DEVIATION VEC WEIGHTS)
 171                          (gsll:ABSOLUTE-DEVIATION VEC MEAN))))))
 172   (equal *t1* (first *t2*)) ; EQL on two distinct lists is always NIL; MULTIPLE-VALUE-LIST wraps the returned list once more, hence FIRST
 173
 174   ;; from (gsll:examples 'gsll::numerical-integration) ...
 175   (gsll:integration-qng gsll::one-sine 0.0d0 PI)
 176
 177   (gsll:defun-single axpb (x) (+ (* 2 x) 3)) ;; a<-2, b<-3
 178   (gsll:integration-qng axpb 1d0 2d0)
 179
 180   (let ((a 2)
 181         (b 3))
 182     (gsll:defun-single axpb2 (x) (+ (* a x) b))) ; package-qualified like every other GSLL symbol here
 183   (gsll:integration-qng axpb2 1d0 2d0)
 184
 185   ;;   BAD
 186   ;;   (gsll:integration-qng
 187   ;;    (let ((a 2)
 188   ;;     (b 3))
 189   ;;      (defun-single axpb2 (x) (+ (* a x) b)))
 190   ;;    1d0 2d0)
 191
 192   ;; right, but weird expansion...
 193   (gsll:integration-qng
 194    (let ((a 2)
 195          (b 3))
 196      (defun axpb2 (x) (+ (* a x) b))
 197      (gsll:def-single-function axpb2)
 198      axpb2)
 199    1d0 2d0)
 200
 201   ;; Linear least squares
 202
 203   (gsll:gsl-lookup "gsl_linalg_LU_decomp") ; => gsll:lu-decomposition
 204   (gsll:gsl-lookup "gsl_linalg_LU_solve") ; => gsll:lu-solve
 205   )
 206
 207
 208
 209 #+nil
 210 (progn ;; Philosophy time: sketch of a model / data / analysis API.
 211   ;; NOTE(review): the operators used below (MODEL, STATISTICAL-TABLE, ANALYSIS, ...) are not defined in this file; the PROGN is disabled by #+nil.
 212   (setf my-model (model :name "ex1"
 213                         :data-slots (list w x y z)
 214                         :param-slots (list alpha beta gamma)
 215                         :math-form (regression-model :formula '(= w (+ (* beta x)
 216                                                                      (* alpha y)
 217                                                                      (* gamma z)
 218                                                                      normal-error))
 219                                                      :centrality 'median ; 'mean
 220                                                      )))
 221
 222 #| or:
 223   #R"W ~ x+  y + z "
 224 |#
 225
 226   (setf my-dataset (statistical-table :table data-frame-contents
 227                                       :metadata (list (:case-names (list ))
 228                                                       (:var-names (list ))
 229                                                       (:documentation "string of doc"))))
 230
 231   (setf my-analysis (analysis
 232                      :model my-model
 233                      :data my-dataset
 234                      :parameter-map (pairing (model-param-slots my-model)
 235                                              (data-var-names my-dataset))))
 236
 237   ;; ontological implications -- the analysis is an abstract class of
 238   ;; data, model, and mapping between the model and data.  The fit is
 239   ;; the instantiation of such.  This provides a statistical object
 240   ;; computation theory which can be realized as "executable
 241   ;; statistics" or "computable statistics".
 242   (setf my-analysis (analyze my-fit ; NOTE(review): MY-FIT is never defined here and MY-ANALYSIS is reassigned; presumably meant (setf my-fit (analyze my-analysis ...)) -- confirm
 243                              :estimation-method 'linear-least-squares-regression))
 244
 245   ;; one of the tricks here is that one needs to provide the structure
 246   ;; from which to consider estimation, and more importantly, the
 247   ;; validity of the estimation.
 248
 249   ;; Sketch of what the definition of an estimation method might look like:
 250   (setf linear-least-squares-regression
 251         (estimation-method-definition
 252          :variable-defintions ((list ; NOTE(review): keyword is spelled "defintions"; presumably "definitions" is meant -- confirm before anything depends on it
 253                                 ;; from MachLearn: supervised,
 254                                 ;; unsupervised
 255                                 :data-response-vars list-drv ; nil if unsup
 256                                 ;;
 257                                 :param-vars list-pv
 258                                      :data-predictor-vars list-dpv
 259                                      ;; nil in this case.  these
 260                                      ;; describe "out-of-box" specs
 261                                      :hyper-vars list-hv))
 262          :form '(regression-additive-error
 263                  :central-form (linear-form drv pv dpv)
 264                  :error-form 'normal-error)
 265          :resulting-decision '(point-estimation interval-estimation)
 266          :philosophy 'frequentist
 267          :documentation "use least squares to fit a linear regression
 268                          model to data."))
 269
 270   (defparameter *statistical-philosophies*
 271     '(frequentist bayesian fiducial decision-analysis)
 272     "can be combined to build decision-making approaches and
 273     characterizations")
 274
 275   (defparameter *decisions*
 276     '(estimation selection testing)
 277     "possible results from a...")
 278   ;; is this really true?  One can embed hypothesis testing within
 279   ;; estimation, as the hypothesis estimated to select.  And
 280   ;; categorical/continuous rear their ugly heads, but not really in
 281   ;; an essential way.
 282
 283   (defparameter *ontology-of-decision-procedures*
 284     (list :decisions
 285           (list :estimation
 286                 (list :point
 287                       (list :maximum-likelihood
 288                             :minimum-entropy
 289                             :least-squares
 290                             :method-of-moments)
 291                       :interval
 292                       (list :maximum-likelihood
 293                             :)) ; NOTE(review): the bare `:' is an unfinished placeholder
 294                 :testing
 295                 (list :fisherian
 296                       :neyman-pearson
 297                       (list :traditional
 298                             :bioequivalence-inversion)
 299                       :selection
 300                       (list :ranking
 301                             :top-k-of-n-select))
 302                 :parametric
 303                 :partially-parametric))
 304     "start of ontology"))
 305
 306
 307 ;;;; LM
 308
 309 (progn ;; Linear-model (least squares) experiments with GELSY and LM.
 310   ;; Response y: 8 values stored as a row vector.
 311   (defparameter *y*
 312     (make-vector
 313      8
 314      :type :row
 315      :initial-contents '((1d0 2d0 3d0 4d0 5d0 6d0 7d0 8d0))))
 316
 317   ;; Design matrix: a column of ones (intercept) next to the x values.
 318   (defparameter *xv+1*
 319     (make-matrix
 320      8 2
 321      :initial-contents '((1d0 1d0)
 322                          (1d0 3d0)
 323                          (1d0 2d0)
 324                          (1d0 4d0)
 325                          (1d0 3d0)
 326                          (1d0 5d0)
 327                          (1d0 4d0)
 328                          (1d0 6d0))))
 329
 330
 331   ;; so something like (NOTE: matrices are transposed to begin with, hence the incongruity)
 332   (defparameter *xtx-2* (m* (transpose *xv+1*) *xv+1*))
 333   ;; #<LA-SIMPLE-MATRIX-DOUBLE  2 x 2
 334   ;;  8.0d0 28.0d0
 335   ;;  28.0d0 116.0d0>
 336
 337   (defparameter *xty-2* (m* (transpose *xv+1*)  (transpose *y*)))
 338   ;; #<LA-SIMPLE-VECTOR-DOUBLE (2 x 1)
 339   ;;  36.0d0
 340   ;;  150.0d0>
 341
 342   (defparameter *rcond-2* 0.000001)
 343   (defparameter *betahat-2*  (gelsy *xtx-2* *xty-2* *rcond-2*))
 344   ;; After the call, *xtx-2* => "details of complete orthogonal factorization"
 345   ;; according to man page:
 346   ;; #<LA-SIMPLE-MATRIX-DOUBLE  2 x 2
 347   ;;  -119.33147112141039d0 -29.095426104883202d0
 348   ;;  0.7873402682880205d0 -1.20672274167718d0>
 349
 350   ;; *xty-2* => output becomes solution:
 351   ;; #<LA-SIMPLE-VECTOR-DOUBLE (2 x 1)
 352   ;;  -0.16666666666668312d0
 353   ;;  1.333333333333337d0>
 354
 355   *betahat-2* ; which matches R, see below
 356
 357   (documentation 'gelsy 'function)
 358
 359
 360 ;;   (#<LA-SIMPLE-VECTOR-DOUBLE (2 x 1)
 361 ;;    -0.16666666666668312 1.333333333333337>
 362 ;;    2)
 363
 364 ;;   ## Test case in R:
 365 ;;   x <- c( 1.0, 3.0, 2.0, 4.0, 3.0, 5.0, 4.0, 6.0)
 366 ;;   y <- c( 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0)
 367 ;;   lm(y~x)
 368 ;;   ## => Call:  lm(formula = y ~ x)
 369
 370 ;;   Coefficients:  (Intercept)            x
 371 ;;                      -0.1667       1.3333
 372
 373 ;;   summary(lm(y~x))
 374 ;;   ## =>
 375
 376 ;;   Call:
 377 ;;   lm(formula = y ~ x)
 378
 379 ;;   Residuals:
 380 ;;          Min         1Q     Median         3Q        Max
 381 ;;   -1.833e+00 -6.667e-01 -3.886e-16  6.667e-01  1.833e+00
 382
 383 ;;   Coefficients:
 384 ;;               Estimate Std. Error t value Pr(>|t|)
 385 ;;   (Intercept)  -0.1667     1.1587  -0.144  0.89034
 386 ;;   x             1.3333     0.3043   4.382  0.00466 **
 387 ;;   ---
 388 ;;   Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
 389
 390 ;;   Residual standard error: 1.291 on 6 degrees of freedom
 391 ;;   Multiple R-squared: 0.7619,        Adjusted R-squared: 0.7222
 392 ;;   F-statistic:  19.2 on 1 and 6 DF,  p-value: 0.004659
 393
 394
 395
 396   ;; which suggests one might do (modulo ensuring correct
 397   ;; orientations).  When this is finalized, it should migrate to
 398   ;; CLS.
 399   ;;
 400
 401
 402   (defparameter *n* 20) ; # rows = # obsns
 403   (defparameter *p* 10) ; # cols = # vars
 404   (defparameter *x-temp*  (rand *n* *p*))
 405   (defparameter *b-temp*  (rand *p* 1))
 406   (defparameter *y-temp*  (m* *x-temp* *b-temp*))
 407   ;; so Y = Xb exactly here (no \eps noise term is added)
 408   (defparameter *rcond* (* (coerce (expt 2 -52) 'double-float)
 409                    (max (nrows *x-temp*) (ncols *x-temp*)))) ; both dimensions of the coefficient matrix, as in the "from docs" snippets below
 410   (defparameter *orig-x* (copy *x-temp*))
 411   (defparameter *orig-b* (copy *b-temp*))
 412   (defparameter *orig-y* (copy *y-temp*))
 413
 414   (defparameter *lm-result* (lm *x-temp* *y-temp*))
 415   (princ (first *lm-result*))
 416   (princ (second *lm-result*))
 417   (princ (third *lm-result*))
 418   (v= (third *lm-result*)
 419       (v- (first (first *lm-result*))
 420           (first  (second *lm-result*))))
 421   ;; NOTE(review): the V= form compares (third result) with (first of first) minus (first of second); what LM returns is not shown here -- confirm.
 422
 423
 424
 425   ;; Some issues exist in the LAPACK vs. LINPACK variants, hence R
 426   ;; uses LINPACK primarily, rather than LAPACK.  See comments in R
 427   ;; source for issues.
 428
 429
 430   ;; Goal is to start from X, Y and then realize that if
 431   ;; Y = X \beta + \eps,   i.e. 8x1 = 8xp px1  + 8x1, then
 432   ;;      XtX \hat\beta = Xt Y
 433   ;; so that we can solve the equation  W \beta = Z   where W and Z
 434   ;; are known, to estimate \beta.
 435
 436   ;; the above is known to be numerically unstable -- some processing
 437   ;; of X is preferred and should be done prior.  And most of the
 438   ;; transformation-based work does precisely that.
 439
 440   ;; recall:  Var[Y] = E[(Y - E[Y])(Y-E[Y])t]
 441   ;;   = E[Y Yt] - 2 \mu \mut + \mu \mut
 442   ;;   = E[Y Yt] - \mu \mut
 443
 444   ;; Var Y = E[Y^2] - \mu^2
 445
 446
 447   ;; For initial estimates of covariance of \hat\beta:
 448
 449   ;; \hat\beta = (Xt X)^-1 Xt Y
 450   ;; with E[ \hat\beta ]
 451   ;;        = E[ (Xt X)^-1 Xt Y ]
 452   ;;        = E[(Xt X)^-1 Xt (X\beta)]
 453   ;;        = \beta
 454   ;;
 455   ;; So Var[\hat\beta] = (Xt X)^-1 Xt Var[Y] X (Xt X)^-1
 456   ;;                   = \sigma^2 (Xt X)^-1   when Var[Y] = \sigma^2 I,
 457   ;; and this gives SE(\beta_i) = (* (sqrt (mref Var i i)) adjustment)
 458
 459
 460   ;; from docs:
 461
 462   (defparameter *temp-result*
 463         (let ((*default-implementation* :foreign-array))
 464           (let* ((m 10)
 465                  (n 10)
 466                  (a (rand m n))
 467                  (x (rand n 1))
 468                  (b (m* a x))
 469                  (rcond (* (coerce (expt 2 -52) 'double-float)
 470                            (max (nrows a) (ncols a))))
 471                  ;; GELSY overwrites its matrix arguments (see the notes on
 472                  ;; *xtx-2* / *xty-2* above), so solve exactly once, on copies.
 473                  (solution (first (gelsy (copy a) (copy b) rcond))))
 474             ;; Difference between the true X and the recovered solution.
 475             ;; V- rather than M- here: no applicable conversion?
 476             ;; (m-   (#<FA-SIMPLE-VECTOR-DOUBLE (10 x 1))
 477             ;;       (#<FA-SIMPLE-VECTOR-DOUBLE (10 x 1)) )
 478             (v- x solution))))
 479
 480
 481   (princ *temp-result*)
 482
 483   (setf *temp-result*
 484         (let ((*default-implementation* :lisp-array))
 485           (let* ((m 10)
 486                  (n 10)
 487                  (a (rand m n))
 488                  (x (rand n 1))
 489                  (b (m* a x))
 490                  (rcond (* (coerce (expt 2 -52) 'double-float)
 491                            (max (nrows a) (ncols a))))
 492                  ;; Same experiment with :lisp-array storage; again solve
 493                  ;; exactly once, on copies, because GELSY overwrites A and B.
 494                  (solution (first (gelsy (copy a) (copy b) rcond))))
 495             ;; Difference between the true X and the recovered solution.
 496             (m- x solution)
 497             )))
 498   (princ *temp-result*)
 499
 500   ;; No-intercept fit: x values only, stored as a row vector.
 501   (defparameter *xv*
 502     (make-vector
 503      8
 504      :type :row ;; default, not usually needed!
 505      :initial-contents '((1d0 3d0 2d0 4d0 3d0 5d0 4d0 6d0))))
 506   ;; Same values as the *y* defined at the top of this PROGN.
 507   (defparameter *y*
 508     (make-vector
 509      8
 510      :type :row
 511      :initial-contents '((1d0 2d0 3d0 4d0 5d0 6d0 7d0 8d0))))
 512
 513   ;; so something like (NOTE: matrices are transposed to begin with, hence the incongruity)
 514   (defparameter *xtx-1* (m* *xv* (transpose *xv*)))
 515   (defparameter *xty-1* (m* *xv* (transpose  *y*)))
 516   (defparameter *rcond-in* (* (coerce (expt 2 -52) 'double-float)
 517                               (max (nrows *xtx-1*)
 518                                    (ncols *xtx-1*)))) ; both dimensions of the coefficient matrix
 519
 520   (defparameter *betahat*  (gelsy *xtx-1* *xty-1* *rcond-in*))
 521
 522   ;;  (#<LA-SIMPLE-VECTOR-DOUBLE (1 x 1)
 523   ;;  1.293103448275862>
 524   ;;  1)
 525
 526   ;;   ## Test case in R:
 527   ;;   x <- c( 1.0, 3.0, 2.0, 4.0, 3.0, 5.0, 4.0, 6.0)
 528   ;;   y <- c( 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0)
 529   ;;   lm(y~x-1)
 530   ;;   ## =>
 531   ;;   Call:
 532   ;;   lm(formula = y ~ x - 1)
 533
 534   ;;   Coefficients:
 535   ;;       x
 536   ;;   1.293
 537
 538   (first  *betahat*))
 539
 540
 541
 542 #|
 543   (type-of #2A((1 2 3 4 5)
 544                (10 20 30 40 50)))
 545
 546   (type-of (rand 10 20))
 547
 548   (typep #2A((1 2 3 4 5)
 549              (10 20 30 40 50))
 550          'matrix-like)
 551
 552   (typep (rand 10 20) 'matrix-like)
 553
 554   (typep #2A((1 2 3 4 5)
 555              (10 20 30 40 50))
 556          'array)
 557
 558   (typep (rand 10 20) 'array)
 559 |#