Doc/talks/Rossini-DSC-July2009.tex

   1 \documentclass{beamer}
   2
   3 \mode<presentation>
   4 {
   5   \usetheme{classic}
   6   \setbeamercovered{transparent}
   7 }
   8
   9 \usepackage[english]{babel}
  10 \usepackage[latin1]{inputenc}
  11 \usepackage{times}
  12 \usepackage[T1]{fontenc}
  13 \usepackage{url}
  14
  15 \title[CLS]{Common Lisp Statistics}
  16 \subtitle{Using History to design better data analysis environments}
  17 \author[Rossini]{Anthony~(Tony)~Rossini}
  18
  19 \institute[Novartis and University of Washington]{
  20   Group Head, Modeling and Simulation Statistics\\
  21   Novartis Pharma AG, Switzerland
  22   \and
  23   Affiliate Assoc Prof, Biomedical and Health Informatics\\
  24   University of Washington, USA}
  25
  26 \date[DSC2009]{DSC 2009, Copenhagen}
  27 \subject{Statistical Computing Environments}
  28
  29 \begin{document}
  30
  31 \begin{frame}
  32   \titlepage
  33 \end{frame}
  34
  35 \section{What Works?}
  36 \label{sec:work}
  37
  38 \begin{frame}{Is it Vaporware? Not quite}
  39   The following is possible with the help of the open source Common Lisp
  40   community, who provided most of the packages, tools, and glue.
  41   (Tamas Papp, Raymond Toy, Mark Hoemmen, and many, many others).
  42   Most of the underlying code was written by others, and ``composed''
  43   by me.
  44 \end{frame}
  45
  46 \subsection{Graphics}
  47 \label{sec:work:graphics}
  48
  49 \begin{frame}{Silly Visualization Example}
  50 \includegraphics[width=3in,height=3in]{/home/tony/test1.png}
  51 \end{frame}
  52
  53 \begin{frame}[fragile]{How?}
  54 \begin{verbatim}
  55 (defparameter *frame2*
  56    (as-frame (create-xlib-image-context 200 200)
  57             :background-color +white+))
  58 (bind ((#2A((f1 f2) (f3 f4))
  59         (split-frame *frame2*
  60                      (percent 50)
  61                      (percent 50))))
  62   (defparameter *f1* f1) ; lower left
  63   (defparameter *f2* f2) ; lower right  f3  f4
  64   (defparameter *f3* f3) ; top left     f1  f2
  65   (defparameter *f4* f4)); top right
  66 \end{verbatim}
  67 \end{frame}
  68
  69 \begin{frame}[fragile]{Functions to Plot}
  70 \begin{verbatim}
  71 (plot-function *f1* #'sin
  72   (interval-of 0 2)
  73   :x-title "x" :y-title "sin(x)")
  74 (plot-function *f2* #'cos (interval-of 0 2)
  75   :x-title "x" :y-title "cos(x)")
  76 (plot-function *f3* #'tan (interval-of 0 2)
  77   :x-title "x" :y-title "tan(x)")
  78 \end{verbatim}
  79 \end{frame}
  80
  81 \begin{frame}[fragile]{Things to Plot}
  82 \small{
  83 \begin{verbatim}
  84 (let* ((n 500)
  85        (xs (num-sequence
  86              :from 0 :to 10 :length n))
  87        (ys (map 'vector
  88               #'(lambda (x) (+ x 8 (random 4.0)))
  89               xs))
  90        (weights
  91           (replicate #'(lambda () (1+ (random 10)))
  92                      n 'fixnum))
  93        (da (plot-simple *f4*
  94              (interval-of 0 10)
  95              (interval-of 10 20)
  96              :x-title "x" :y-title "y")))
  97   (draw-symbols da xs ys :weights weights))
  98 \end{verbatim}
  99 }
 100 \end{frame}
 101
 102 \begin{frame}[fragile]{Copying existing graphics}
 103   And we generated the figure on the first page by:
 104 \begin{verbatim}
 105 (xlib-image-context-to-png
 106    (context *f1*)
 107    "/home/tony/test1.png")
 108 \end{verbatim}
 109 \end{frame}
 110
 111 \subsection{Statistical Models}
 112 \label{sec:work:statmod}
 113
 114 \begin{frame}[fragile]{Linear Regression}
 115 \small{
 116 \begin{verbatim}
 117 ;; Worse than LispStat, wrapping LAPACK's dgelsy:
 118 (defparameter *result1*
 119    (lm (list->vector-like iron)
 120        (list->vector-like absorbtion)))
 121 *result1* =>
 122 ((#<LA-SIMPLE-VECTOR-DOUBLE (2 x 1)
 123  -11.504913191235342
 124  0.23525771181009483>
 125   2)
 126
 127  #<LA-SIMPLE-MATRIX-DOUBLE  2 x 2
 128  9.730392177126686e-6 -0.001513787114206932
 129  -0.001513787114206932 0.30357851215706255>
 130
 131  13 2)
 132 \end{verbatim}
 133 }
 134 \end{frame}
 135
 136 \subsection{Data Manip/Mgmt}
 137 \label{sec:work:data}
 138
 139 \begin{frame}[fragile]{DataFrames}
 140 \small{
 141 \begin{verbatim}
 142 (defparameter *my-df-1*
 143   (make-instance 'dataframe-array
 144          :storage #2A((1 2 3 4 5) (10 20 30 40 50))
 145          :doc "This is a boring dataframe-array"
 146          :case-labels (list "x" "y")
 147          :var-labels (list "a" "b" "c" "d" "e")))
 148
 149 (xref *my-df-1* 0 0) ; API change in progress
 150
 151 (setf (xref *my-df-1* 0 0) -1d0)
 152 \end{verbatim}
 153 }
 154 \end{frame}
 155
 156 \begin{frame}[fragile]{Numerical Matrices}
 157 \small{
 158 \begin{verbatim}
 159 (defparameter *mat-1*
 160   (make-matrix 3 3
 161      :initial-contents #2A((2d0 3d0 -4d0)
 162                            (3d0 2d0 -4d0)
 163                            (4d0 4d0 -5d0))))
 164
 165 (xref *mat-1* 2 0) ; => 4d0  ; API change
 166 (setf (xref *mat-1* 2 0) -4d0)
 167
 168 (defparameter *xv*
 169  (make-vector 4 :type :row
 170    :initial-contents '((1d0 3d0 2d0 4d0))))
 171 \end{verbatim}
 172 }
 173 \end{frame}
 174
 175 \begin{frame}[fragile]{Macros make the above tolerable}
 176 \begin{verbatim}
 177 (defparameter *xv*
 178  (make-vector 4 :type :row
 179    :initial-contents '((1d0 3d0 2d0 4d0))))
 180
 181 ; can use defmacro for the following syntax =>
 182
 183 (make-row-vector *xv* '((1d0 3d0 2d0 4d0)))
 184
 185 ; or reader macros for the following:
 186 #mrv(*xv* '((1d0 3d0 2d0 4d0)))
 187 \end{verbatim}
 188 \end{frame}
 189
 190 \begin{frame}{Outline}
 191   \tableofcontents
 192 \end{frame}
 193
 194 \section{Common Lisp Statistics}
 195 \label{sec:CLS}
 196
 197 \begin{frame}{Why CLS?}
 198   \begin{itemize}
 199   \item a component-based structure for statistical computing
 200   \item Common Lisp provides a simple, \emph{primitive}, syntax
 201   \item Common Lisp provides an amazing number of advanced features
 202     that keep getting reinvented in other languages.
 203   \item Common Lisp has linkages to many amazing features developed in
 204     other languages.
 205   \item ability to leverage non-statisticians interested in computing
 206     technologies (compilers, protocols, interfaces, libraries,
 207     functionality which can be reused for statistical purposes)
 208   \item This is a ``customization'' through packages to support
 209     statistical computing, not an independent language.  ``Ala Carte'',
 210     not ``Menu''.
 211   \end{itemize}
 212 \end{frame}
 213
 214 \subsection{Implementation Plans}
 215 \label{sec:CLS:impl}
 216
 217
 218 \begin{frame}{Current Functionality}
 219   \begin{itemize}
 220   \item basic dataframes (similar to R); indexing/slicing API under
 221     development.
 222   \item Basic regression (similar to XLispStat)
 223   \item matrix storage both in foreign and lisp-centric areas.
 224   \item LAPACK (small percentage, increasing), working with both
 225     matrix storage types
 226   \item static graphics (X11) including preliminary grid functionality based
 227     on CAIRO.  Generation of PNG files from graphics windows.
 228   \item CSV file support
 229   \item Common Lisp!
 230   \end{itemize}
 231 \end{frame}
 232
 233 \begin{frame}[fragile]{Computational Environment Supported}
 234   \begin{itemize}
 235   \item Should  work on Linux, with recent SBCL versions
 236   \item Definitely works on bleeding edge Debian (unstable).
 237   \item Has worked for weak definitions of ``work'' on 4 different
 238     people's environments (not quite, but sort of requires a
 239     \verb+/home/tony/+ !)
 240   \item Threaded support on threaded lisps (SBCL/CCL, soon CLISP).
 241     But not yet integrated.
 242   \end{itemize}
 243 \end{frame}
 244
 245 \begin{frame}{Goals}
 246   Short Term
 247   \begin{itemize}
 248   \item Better integration of data structures with statistical routines
 249     (auto-handling with dataframes, rather than manual parsing).
 250   \end{itemize}
 251   Medium/Long Term
 252   \begin{itemize}
 253   \item Support for CLISP (byte-compiled interpreted lisp) and Clozure
 254     Common Lisp (formerly OpenMCL)
 255   \item high-level Front-end API to a number of matrix and numerical
 256     packages and numerical structures
 257   \item constraint system for interactive GUIs and graphics
 258   \item full LispStat compatibility (object system partially works;
 259     GUI support coming).
 260   \item Integrated threading via Bordeaux threads (portable CL API package).
 261   \end{itemize}
 262 \end{frame}
 263
 264 \subsection{Common Lisp}
 265 \label{sec:CLS:lisp}
 266
 267 \begin{frame}{Common Lisp}
 268   \begin{itemize}
 269   \item Parens provide clear delineation of a \textbf{Complete
 270       Thought} (functional programming with side effects).
 271   \item Lisp-2 (symbols can denote both a separate function and a value)
 272   \item ANSI standard (built by committee, but the committee was
 273     reasonably smart)
 274   \item Many implementations
 275   \item Most implementations are interactive \textbf{compiled}
 276     languages (few are interpreted, and those are usually
 277     byte-compiled).
 278   \item The Original \emph{Programming with Data} Language
 279     (\emph{Programs are Data} and \emph{Data are Executable} also
 280     apply).
 281   \item advanced, powerful, first-class macros (macros functionally
 282     re-write code, allowing for structural clarity and complete
 283     destruction of syntax, should that be reasonable)
 284   \end{itemize}
 285 \end{frame}
 286
 287 \begin{frame}{Common Lisp Packages}
 288   (They are packages and called packages, not libraries.  Some people
 289   can rejoice!)
 290   \begin{itemize}
 291   \item infrastructure enhancement:  infix-notation, data structures,
 292     control and flow structures
 293   \item numerics, graphics, GUIs,
 294   \item primitive R to CL compiler (which could also be considered an
 295     object-code compiler for R); 3 interfaces which embed R within CL.
 296   \item Web 2.0 support and reporting facilities (similar to TeX) for PDF.
 297   \end{itemize}
 298   See \url{http://www.common-lisp.net/} and
 299   \url{http://www.cliki.org/}.  CLS sources can be found on
 300   \url{http://github.com/blindglobe/}
 301 \end{frame}
 302
 303 \section{What else about CLS is still Vaporware?}
 304
 305 \begin{frame}[fragile]{What does NOT work?}
 306   Primarily, the reason that we are doing this:
 307
 308   \textbf{Computable and Executable Statistics}
 309
 310   (which is the subject of another talk, slides in the backup).   But
 311   consider XML:
 312 \begin{verbatim}
 313 <car brand="honda" engine="4cyl">accord</car>
 314 \end{verbatim}
 315 becomes
 316 \begin{verbatim}
 317 ; data follows keywords...
 318 (car :brand 'honda :engine "4cyl" accord)
 319 \end{verbatim}
 320 \end{frame}
 321
 322 \section{Discussion}
 323
 324 \begin{frame}{Conclusion}
 325   Active but slow development, spanning the range of needs:
 326   \begin{itemize}
 327   \item Numerics: Linear algebra basics done -- full development
 328   \item Static graphics: progress being made, have a partial
 329     grid-solution, need interactive graphics
 330   \item LispStat emulation needs to be finished
 331   \item Model specification and unification
 332   \end{itemize}
 333   Related numerical/statistical projects:
 334   \begin{itemize}
 335   \item Incanter : R/LispStat/Omegahat-like system for Clojure (Lisp
 336     on the JVM)
 337   \item FEMLisp : system/workshop for finite-element analysis modeling
 338     using Lisp
 339   \item matlisp/LispLab : LAPACK-based numerical linear algebra packages
 340   \item GSLL : GNU Scientific Library, Lisp interface.
 341   \end{itemize}
 342 \end{frame}
 343
 344 \begin{frame}{Followup}
 345   I'd be happy to talk with anyone on the following topics:
 346   \begin{itemize}
 347   \item Introduction to Common Lisp
 348   \item support for new statistical programming environment modalities
 349     (subject for another talk).
 350   \item computable and executable statistics (code that explains
 351     itself and can be parsed to generate knowledge about its claims;
 352     ``XML's promise'')
 353   \end{itemize}
 354   and if you are interested in getting involved, or trying it out.
 355 \end{frame}
 356
 357 \end{document}
 358
 359 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 360
 361
 362
 363 \section{BACKUPS}
 364
 365
 366 \section{Common Lisp}
 367
 368 \begin{frame}[fragile]{Finding out things}
 369   \begin{itemize}
 370   \item CL-NUMLIB
 371      num-sequence :from LOW :to HIGH :length SEQ-LENGTH
 372      seq(from,to,by/length)
 373    \item
 374 \begin{verbatim}
 375 (documentation
 376      'cl-numlib:num-sequence
 377      'function)
 378 \end{verbatim}
 379    \item This
 380   \end{itemize}
 381 \end{frame}
 382
 383 \section{Computable Statistics}
 384
 385 \begin{frame}{Can we compute with them?}
 386   3 Examples, of which we only present the first
 387   \begin{itemize}
 388   \item Research.
 389   \item Consulting, Applied Statistics, Scientific Honesty.
 390   \item Reimplementation.
 391   \end{itemize}
 392   Consider whether one can ``compute'' with the information given?
 393   (that is:
 394   \begin{itemize}
 395   \item do we have sufficient information to communicate enough
 396     for the right person to recreate the analysis?
 397   \item have we sufficient clarity to prevent misunderstandings about
 398     intentions and claims?
 399   \end{itemize}
 400   )
 401 \end{frame}
 402
 403 \begin{frame}[fragile]{Example 1: Theory\ldots}
 404   \label{example1}
 405   Let $f(x;\theta)$ describe the likelihood of XX under the following
 406   assumptions.
 407   \begin{enumerate}
 408   \item assumption-1
 409   \item assumption-2
 410   \end{enumerate}
 411   Then if we use the following algorithm:
 412   \begin{enumerate}
 413   \item step-1
 414   \item step-2
 415   \end{enumerate}
 416   then $\hat{\theta}$ should be $N(0,\hat\sigma^2)$ with the following
 417   characteristics\ldots
 418 \end{frame}
 419
 420 \begin{frame}
 421   \frametitle{Can we compute, using this description?}
 422   Given the information at hand:
 423   \begin{itemize}
 424   \item we ought to have a framework for initial coding for the
 425     actual simulations (test-first!)
 426   \item the implementation is somewhat clear
 427   \item We should ask: what theorems have similar assumptions?
 428   \item We should ask: what theorems have similar conclusions but
 429     different assumptions?
 430   \end{itemize}
 431 \end{frame}
 432
 433 \begin{frame}[fragile]{Realizing Theory}
 434 \small{
 435 \begin{verbatim}
 436 (define-theorem my-proposed-theorem
 437    (:theorem-type '(distribution-properties
 438                     frequentist
 439                     likelihood))
 440    (:assumes '(assumption-1 assumption-2))
 441    (:likelihood-form
 442       (defun likelihood (data theta gamma)
 443         (exponential-family theta gamma)))
 444    (:compute-by
 445       '(progn
 446          (compute-starting-values thetahat gammahat)
 447          (until (convergence)
 448            (setf convergence
 449                  (or (step-1 thetahat)
 450                      (step-2 gammahat))))))
 451    (:claim (assert
 452              (and (equal-distribution thetahat 'normal)
 453                   (equal-distribution gammahat 'normal)))))
 454 \end{verbatim}
 455 }
 456 \end{frame}
 457
 458 \begin{frame}[fragile]{It would be nice to have}
 459 \begin{verbatim}
 460    (theorem-veracity 'my-proposed-theorem)
 461 \end{verbatim}
 462 \end{frame}
 463
 464 \begin{frame}[fragile]{and why not...?}
 465 \begin{verbatim}
 466    (when (theorem-veracity
 467               'my-proposed-theorem)
 468       (write-paper 'my-proposed-theorem
 469                    :style :JASA
 470                    :output-format
 471                          '(LaTeX MSWord)))
 472 \end{verbatim}
 473 \end{frame}
 474
 475 \begin{frame}{Comments}
 476   \begin{itemize}
 477   \item The general problem is very difficult
 478   \item I'm working on some basic statistical proofs of concept (not
 479     finished): linear regression (LS-based, Normal-bayesian) and the
 480     T-test.
 481   \item Areas targeted for medium-term future: resampling methods and
 482     similar algorithms.
 483   \end{itemize}
 484 \end{frame}
 485
 486 \begin{frame}
 487   \frametitle{Example 2: Practice\ldots}
 488   \label{example2}
 489   The dataset comes from a series of clinical trials.  We model the
 490   primary endpoint, ``relief'', as a binary random variable.  There is
 491   a random trial effect on relief as well as severity due to
 492   differences in recruitment and inclusion/exclusion criteria.
 493 \end{frame}
 494
 495 \begin{frame}
 496   \frametitle{Can we compute, using this description?}
 497   \begin{itemize}
 498   \item With a real description of this kind, it is clear what some of the
 499     potential models might be for this dataset
 500   \item It should be clear how to start thinking of a data dictionary
 501     for this problem.
 502   \end{itemize}
 503 \end{frame}
 504
 505 \begin{frame}[fragile]{Can we compute?}
 506 \begin{verbatim}
 507   (dataset-metadata paper-1
 508     :context 'clinical-trials
 509     :variables '((relief :model-type dependent
 510                          :distribution binary)
 511                  (trial  :model-type independent
 512                          :distribution categorical)
 513                  (disease-severity))
 514     :metadata '(inclusion-criteria
 515                 exclusion-criteria
 516                 recruitment-rate))
 517   (propose-analysis paper-1)
 518      ; => '(tables
 519      ;      (logistic regression))
 520 \end{verbatim}
 521 \end{frame}
 522
 523 \begin{frame}{Example 3: The Round-trip\ldots}
 524   \label{example3}
 525   The first examples describe ``ideas $\rightarrow$ code''
 526
 527   Consider the last time you read someone else's implementation of a
 528   statistical procedure (i.e. R package code).  When you read the
 529   code, could you see:
 530   \begin{itemize}
 531   \item the assumptions used?
 532   \item the algorithm implemented?
 533   \item practical guidance for when you might select the algorithm
 534     over others?
 535   \item practical guidance for when you might select the
 536     implementation over others?
 537   \end{itemize}
 538   These are usually components of any reasonable journal article.
 539   \textit{(Q: have you actually read an R package that wasn't yours?)}
 540 \end{frame}
 541
 542 \begin{frame}{Exercise left to the reader!}
 543
 544 %   (aside: I have been looking at the \textbf{stats} and \textbf{lme4}
 545 %   packages recently -- \textit{for me}, very clear numerically, much
 546 %   less so statistically)
 547 \end{frame}
 548
 549
 550
 551 \section{Context}
 552
 553 \begin{frame}{Goals for this Talk}{(define, strategic approach,
 554     justify)}
 555
 556   \begin{itemize}
 557   \item To describe the concept of \alert{computable and executable
 558       statistics}, placing it in a historical context.
 559
 560   \item To demonstrate that \alert{a research program}
 561     implemented through  simple steps can increase the efficiency  of
 562     statistical computing approaches by  clearly describing both:
 563     \begin{itemize}
 564     \item numerical characteristics of procedures,
 565     \item statistical concepts driving them.
 566     \end{itemize}
 567
 568   \item To justify that the \alert{approach is worthwhile} and
 569     represents a staged effort towards \alert{increased use of best
 570       practices}.
 571   \end{itemize}
 572   (unfortunately, the last is still incomplete)
 573 \end{frame}
 574
 575
 576 \begin{frame}{Historical Computing Languages}
 577   \begin{itemize}
 578   \item FORTRAN : FORmula TRANslator.  Original numerical computing
 579     language, designed for clean implementation of numerical
 580     algorithms
 581   \item LISP : LISt Processor.  Associated with symbolic
 582     manipulation, AI, and knowledge approaches
 583   \end{itemize}
 584
 585   They represent the 2 generalized needs of statistical computing,
 586   which could be summarized as
 587   \begin{itemize}
 588   \item algorithms/numerics,
 589   \item elicitation, communication, and generation of knowledge (``data
 590     analysis'')
 591   \end{itemize}
 592 \end{frame}
 593
 594 \begin{frame}{Statistical Computing Environments}
 595
 596   Past:
 597   \begin{itemize}
 598   \item SPSS / BMDP / SAS
 599   \item S ( S, S-PLUS, R)
 600   \item LispStat ( XLispStat,  ViSta, ARC , CommonLispStat ) ; QUAIL
 601   \item XGobi (Orca / GGobi / Statistical Reality Engine)
 602   \item MiniTab
 603   \item Stata
 604   \item DataDesk
 605   \item Augsburg Impressionist series (MANET, \ldots)
 606   \item Excel
 607   \end{itemize}
 608   many others...
 609
 610 \end{frame}
 611
 612 \begin{frame}{How many are left?}
 613
 614   \begin{itemize}
 615   \item R
 616   \item SAS
 617   \item SPSS
 618   \item Stata
 619   \item Minitab
 620   \item very few others...
 621   \end{itemize}
 622   ``R is the Microsoft of the statistical computing world'' -- anonymous.
 623 \end{frame}
 624
 625 \begin{frame}{Selection Pressure}
 626   \begin{itemize}
 627   \item the R user population is growing rapidly, fueled by critical
 628     mass, quality, and value
 629   \item R is a great system for applied data analysis
 630   \item R is not such a great system for research into statistical
 631     computing (backwards compatibility, inertia due to user population)
 632   \end{itemize}
 633   There is a need for alternative experiments for developing new
 634   approaches/ideas/concepts.
 635 \end{frame}
 636
 637 \begin{frame}{Philosophically, why Common Lisp?}
 638   Philosophically:
 639   \begin{itemize}
 640   \item Lisp can cleanly present computational intentions, both
 641     symbolically and numerically.
 642   \item Semantics and context are important: well supported by Lisp
 643     paradigms.
 644   \item Lisp's parentheses describe singular, multi-scale,
 645     \alert{complete thoughts}.
 646   \end{itemize}
 647
 648 \end{frame}
 649
 650 \begin{frame}{Technically, why Common Lisp?}
 651   \begin{itemize}
 652   \item interactive COMPILED language (``R with a compiler'')
 653   \item CLOS is R's S4 object system ``done right''.
 654   \item clean semantics: modality, typing, can be expressed the way
 655     one wants it.
 656   \item programs are data, data are programs, leading to
 657   \item Most modern computing tools available (XML, WWW technologies)
 658   \item ``executable XML''
 659   \end{itemize}
 660   Common Lisp is very close in usage to how people currently use R
 661   (mostly interactive, some batch, and a wish for compilation efficiency).
 662 \end{frame}
 663
 664 \subsection{Background}
 665
 666 \begin{frame}
 667   \frametitle{Desire: Semantics and Statistics}
 668   \begin{itemize}
 669   \item The semantic web (content which is self-descriptive) is an
 670     interesting and potentially useful idea.
 671
 672   \item
 673     Biological informatics support (GO, Entrez) has allowed for
 674     precise definitions of concepts in biology.
 675
 676   \item It is a shame that a field like statistics, requiring such
 677     precision, has less of it than an imprecise and temporally unstable
 678     field such as biology\ldots
 679   \end{itemize}
 680
 681   How can we express statistical work (research, applied work) which
 682   is both human and computer readable (perhaps subject to
 683   transformations first)?
 684 \end{frame}
 685
 686
 687 % \subsection{Context}
 688
 689 % \begin{frame}{Context}{(where I'm coming from, my ``priors'')}
 690 %   \begin{itemize}
 691 %   \item Pharmaceutical Industry
 692 %   \item Modeling and Simulation uses mathematical models/constructs to
 693 %     record beliefs (biology, pharmacology, clinical science) for
 694 %     explication, clinical team alignment, decision support, and
 695 %     quality.
 696 %   \item My work at Novartis is at the intersection of biomedical
 697 %     informatics, statistics, and mathematical modeling.
 698 %   \item As manager: I need a mix of applications and novel research development to
 699 %     solve our challenges better, faster, more efficiently.
 700 %   \item Data analysis is a specialized approach to computer
 701 %     programming, \alert{different} than applications programming or
 702 %     systems programming.
 703 %   \end{itemize}
 704 % \end{frame}
 705
 706
 707 \subsection{Literate Programming is insufficient}
 708
 709 \begin{frame}{Literate Statistical Practice.}
 710   \begin{enumerate}
 711   \item Literate Programming applied to data analysis (Rossini, 1997/2001)
 712   \item among the \alert{most annoying} techniques to integrate into
 713     work-flow if one is not perfectly methodological.
 714   \item Some tools:
 715     \begin{itemize}
 716     \item ESS: supports interactive creation of literate programs.
 717     \item Sweave: tool which exemplifies reporting context; odfWeave
 718       primarily simplifies reporting.
 719     \item Roxygen: primarily supports a literate programming
 720       documentation style, not a literate data analysis programming
 721       style.
 722   \end{itemize}
 723   \item ROI demonstrated in specialized cases: BioConductor.
 724   \item \alert{usually done after the fact} (final step of work-flow)
 725     as a documentation/computational reproducibility technique, rarely
 726     integrated into work-flow.
 727   \end{enumerate}
 728   Many contributors:
 729   Knuth, Claerbout, Carey, de Leeuw, Leisch, Gentleman, Temple-Lang,
 730   \ldots{}
 731 \end{frame}
 732
 733 \begin{frame}
 734   \frametitle{Literate Programming}
 735   \framesubtitle{Why isn't it enough for Data Analysis?}
 736
 737   Only 2 contexts: (executable) code and documentation.  Fine for
 738   application programming,  but for data analysis, we could benefit
 739   from:
 740   \begin{itemize}
 741   \item classification of statistical procedures
 742   \item descriptions of assumptions
 743   \item pragmatic recommendations
 744   \item inheritance of structure through the work-flow of a
 745     statistical methodology or data analysis project
 746   \item datasets and metadata
 747   \end{itemize}
 748   Concept: ontologies describing mathematical assumptions, applications
 749   of methods, work-flow, and statistical data structures can enable
 750   machine communication.
 751
 752   (i.e. informatics framework ala biology)
 753 \end{frame}
 754
 755
 756 \begin{frame}{Communication in Statistical Practice}{\ldots is essential for \ldots}
 757   \begin{itemize}
 758   \item finding
 759   \item explanations
 760   \item agreement
 761   \item receiving information
 762   \end{itemize}
 763   \alert{``machine-readable'' communication/computation lets the
 764     computer help} \\
 765   Semantic Web is about ``machine-enabled computability''.
 766 \end{frame}
 767
 768 \begin{frame}  \frametitle{Semantics}
 769   \framesubtitle{One definition: description and context}
 770
 771   Interoperability is the key, with respect to
 772   \begin{itemize}
 773   \item ``Finding things''
 774   \item Applications and activities with related functionality
 775     \begin{itemize}
 776     \item moving information from one state to another (paper, journal
 777       article, computer program)
 778     \item computer programs which implement solutions to similar tasks
 779     \end{itemize}
 780   \end{itemize}
 781 \end{frame}
 782
 783
 784 \begin{frame}{Statistical Practice is somewhat restricted}
 785   {...but in a good sense, enabling potential for semantics...}
 786
 787   There is a restrictable set of intended actions for what can be done
 788   -- the critical goal is to be able to make a difference by
 789   accelerating activities that should be ``computable'':
 790   \begin{itemize}
 791   \item restricted natural language processing
 792   \item mathematical translation
 793   \item common description of activities for simpler programming/data
 794     analysis (S approach to objects and methods)
 795   \end{itemize}
 796   R is a good basic start (model formulation approach, simple
 797   ``programming with data'' paradigm); we should see if we can do
 798   better!
 799 \end{frame}
 800
 801 \begin{frame}{Computable and Executable Statistics requires}
 802
 803   \begin{itemize}
 804   \item approaches to describe data and metadata (``data'')
 805     \begin{itemize}
 806     \item semantic WWW
 807     \item metadata management and integration, driving
 808     \item data integration
 809     \end{itemize}
 810   \item approaches to describe data analysis methods (``models'')
 811     \begin{itemize}
 812     \item quantitatively: many ontologies (AMS, etc), few meeting
 813       statistical needs.
 814     \item many substantive fields have implementations
 815       (bioinformatics, etc) but not well focused.
 816     \end{itemize}
 817   \item approaches to describe the specific form of interaction
 818     (``instances of models'')
 819     \begin{itemize}
 820     \item Original idea behind ``Literate Statistical Analysis''.
 821     \item That idea is suboptimal, more structure needed (not
 822       necessarily built upon existing...).
 823     \end{itemize}
 824   \end{itemize}
 825 \end{frame}
 826
 827 \subsection{Common Lisp Statistics}
 828
 829 \begin{frame}
 830   \frametitle{Interactive Programming}
 831   \framesubtitle{Everything goes back to being Lisp-like}
 832   \begin{itemize}
 833   \item Interactive programming (as originating with Lisp): works
 834     extremely well for data analysis (Lisp being the original
 835     ``programming with data'' language).
 836   \item Theories/methods for how to do this are reflected in styles
 837     for using R.
 838   \end{itemize}
 839 \end{frame}
 840
 841 \begin{frame}[fragile]
 842   \frametitle{Lisp}
 843
 844   Lisp (LISt Processor) is different than most high-level computing
 845   languages, and is very old (1958).  Lisp is built on lists of things
 846   which are evaluatable.
 847 \begin{verbatim}
 848 (functionName data1 data2 data3)
 849 \end{verbatim}
 850   or ``quoted'':
 851 \begin{verbatim}
 852 '(functionName data1 data2 data3)
 853 \end{verbatim}
 854   which is shorthand for
 855 \begin{verbatim}
 856 (list functionName data1 data2 data3)
 857 \end{verbatim}
 858   The difference is important -- lists of data (the second/third) are
 859   not (yet?!) functions applied to (unencapsulated lists of) data (the first).
 860 \end{frame}
 861
 862 \begin{frame}
 863   \frametitle{Features}
 864   \begin{itemize}
 865   \item Data and Functions semantically the same
 866   \item Natural interactive use through functional programming with
 867     side effects
 868   \item Batch is a simplification of interactive -- not a special mode!
 869   \end{itemize}
 870 \end{frame}
 871
 872
 873
 874 \begin{frame}[fragile]{Representation: XML and Lisp}{executing your data}
 875   Many people are familiar with XML:
 876 \begin{verbatim}
 877 <name phone="+41793674557">Tony Rossini</name>
 878 \end{verbatim}
 879   which is shorter in Lisp:
 880 \begin{verbatim}
 881 (name "Tony Rossini" :phone "+41613674557")
 882 \end{verbatim}
 883   \begin{itemize}
 884   \item Lisp ``parens'', universally hated by unbelievers, are
 885     wonderful for denoting when a ``concept is complete''.
 886   \item Why can't your data self-execute?
 887   \end{itemize}
 888 \end{frame}
 889
 890 \begin{frame}[fragile]{Numerics with Lisp}
 891   \begin{itemize}
 892   \item addition of rational numbers and arithmetic
 893   \item example for mean
 894 \begin{verbatim}
 895  (defun mean (x)
 896     (checktype x 'vector-like)
 897     (/ (loop for i from 0 to (- (nelts *x*) 1)
 898           summing (vref *x* i))
 899        (nelts *x*)))
 900 \end{verbatim}
 901   \item example for variance
 902 \begin{verbatim}
 903 (defun variance (x)
 904   (let ((meanx (mean x))
 905         (nm1 (1- (nelts x))))
 906      (/ (loop for i from 0 to nm1
 907            summing (power (- (vref *x* i) meanx) 2))
 908         nm1)))
 909 \end{verbatim}
 910   \item But through macros, \verb+(vref *x* i)+ could be
 911     \verb+#V(X[i])+ or your favorite syntax.
 912   \end{itemize}
 913
 914 \end{frame}
 915
 916
 917 \begin{frame}{Common Lisp Statistics 1}
 918   \begin{itemize}
 919   \item Originally based on LispStat (reusability)
 920   \item Re-factored structure (some numerics worked with a 1990-era code base).
 921   \item Current activities:
 922     \begin{enumerate}
 923     \item numerics redone using CFFI-based BLAS/LAPACK (cl-blapack)
 924     \item matrix interface based on MatLisp
 925     \item starting design of a user interface system (interfaces,
 926       visuals).
 927     \item general framework for model specification (regression,
 928       likelihood, ODEs)
 929     \item general framework for algorithm specification (bootstrap,
 930       MLE, algorithmic data analysis methods).
 931     \end{enumerate}
 932   \end{itemize}
 933 \end{frame}
 934
 935 \begin{frame}{Common Lisp Statistics 2}
 936
 937   \begin{itemize}
 938   \item Implemented using SBCL.  Contributed fixes for
 939     Clozure/OpenMCL.  A further goal is to target CLISP.
 940   \item Supports LispStat prototype object system
 941   \item Package-based design -- only use the components you need, or
 942     the components whose API you like.
 943   \end{itemize}
 944 \end{frame}
 945
 946 \section{Discussion}
 947
 948 \begin{frame}
 949   \frametitle{Outlook}
 950   \begin{itemize}
 951   \item Semantics and Computability have captured a great deal of
 952     attention in the informatics and business computing R\&D worlds
 953   \item Statistically-driven Decision Making and Knowledge Discovery
 954     is, with high likelihood, the next challenging stage after data
 955     integration.
 956   \item Statistical practice (theory and application) can be enhanced
 957     and made more efficient, providing increased benefit to organizations
 958     and groups using appropriate methods.
 959   \item Lisp, as a language, shares characteristics of both Latin
 960     (difficult dead language useful for classical training) and German
 961     (difficult living language useful for general life).  Of course,
 962     for some people, they are not difficult.
 963   \end{itemize}
 964
 965 \end{frame}
 966
 967 \begin{frame}
 968   The research program described in this talk is currently driving the
 969   design of CommonLisp Stat, which leverages concepts and approaches
 970   from the dead and moribund LispStat project.
 971
 972   \begin{itemize}
 973   \item \url{http://repo.or.cz/w/CommonLispStat.git/}
 974   \item \url{http://www.github.com/blindglobe/}
 975   \end{itemize}
 976
 977 \end{frame}
 978 \begin{frame}{Final Comment}
 979
 980   \begin{itemize}
 981   \item In the Pharma industry, it is all about getting the right
 982     drugs to the patient faster.  Data analysis systems seriously
 983     impact this process, being potentially an impediment or an
 984     accelerator.
 985
 986     \begin{itemize}
 987     \item \alert{Information technologies can increase the efficiency
 988         of statistical practice}, though innovation change management
 989       must be taken into account (i.e.\ statistical practice, while
 990       considered by some an ``art form'', can benefit from
 991       industrialization).
 992     \item \alert{Lisp's features match the basic requirements we need}
 993       (dichotomy: programs as data, data as programs).  Sales pitch,
 994       though...
 995     \item Outlook: Lots of work and experimentation to do!
 996     \end{itemize}
 997   \end{itemize}
 998 \end{frame}
 999
1000
1001 % % All of the following is optional and typically not needed.
1002 % \appendix
1003
1004
1005 % \section<presentation>*{\appendixname}
1006
1007
1008 % \begin{frame} \frametitle{Complements and Backup}
1009 %   No more, stop here.  Questions?  (now or later).
1010 % \end{frame}
1011
1012 % \begin{frame}{The Industrial Challenge.}{Getting the Consulting Right.}
1013 %   % - A title should summarize the slide in an understandable fashion
1014 %   %   for anyone who does not follow everything on the slide itself.
1015
1016 %   \begin{itemize}
1017 %   \item Recording assumptions for the next data analyst, reviewer.
1018 %     Use \texttt{itemize} a lot.
1019 %   \item
1020 %     Use very short sentences or short phrases.
1021 %   \end{itemize}
1022 % \end{frame}
1023
1024
1025 % \begin{frame}{The Industrial Challenge.}{Getting the Right Research Fast.}
1026 %   % - A title should summarize the slide in an understandable fashion
1027 %   %   for anyone who does not follow everything on the slide itself.
1028
1029 %   \begin{itemize}
1030 %   \item
1031 %     Use \texttt{itemize} a lot.
1032 %   \item
1033 %     Use very short sentences or short phrases.
1034 %   \end{itemize}
1035 % \end{frame}
1036
1037
1038 % \begin{frame}{Explicating the Work-flow}{QA/QC-based improvements.}
1039
1040
1041 % \end{frame}
1042
1043 % \section{Motivation}
1044
1045 % \subsection{IT Can Speed up Deliverables in Statistical Practice}
1046
1047 % \begin{frame}{Our Generic Work-flow and Life-cycle}
1048 %   {describing most data analytic activities}
1049 %   Workflow:
1050 %   \begin{enumerate}
1051 %   \item Scope out the problem
1052 %   \item Sketch out a potential solution
1053 %   \item Implement until road-blocks appear
1054 %   \item Deliver results
1055 %   \end{enumerate}
1056
1057 %   Lifecycle:
1058 %   \begin{enumerate}
1059 %   \item paper sketch
1060 %   \item 1st e-draft of text/code/data (iterate to \#1, discarding)
1061 %   \item cycle through work
1062 %   \item publish
1063 %   \item ``throw-away''
1064 %   \end{enumerate}
1065 %   but there is valuable information that could enable the next
1066 %   generation!
1067 % \end{frame}
1068
1069 % \begin{frame}[fragile]{Paper $\rightarrow$ Computer  $\rightarrow$ Article $\rightarrow$ Computer}{Cut and Paste makes for large errors.}
1070 %   \begin{itemize}
1071 %   \item Problems in a regulatory setting
1072 %   \item Regulatory issues are just ``best practices''
1073 %   \end{itemize}
1074
1075 %   Why do we ``copy/paste'', or analogously, restart our work?
1076
1077 %   pro:
1078 %   \begin{itemize}
1079 %   \item every time we repeat, we reinforce the idea in our brain
1080 %   \item review of ideas can help improve them
1081 %   \end{itemize}
1082 %   con:
1083 %   \begin{itemize}
1084 %   \item inefficiency
1085 %   \item introduction of mistakes
1086 %   \item loss of historical context
1087 %   \item changes to earlier work (on a different development branch)
1088 %     can not propagate.
1089 %   \end{itemize}
1090 % \end{frame}
1091
1092 % \section{Semantics and Statistical Practice}
1093
1094
1095 % \begin{frame}
1096 %   \frametitle{Statistical Activity Leads to Reports}
1097 %   \framesubtitle{You read what you know, do you understand it?}
1098
1099 %   How can we improve the communication of the ideas we have?
1100
1101 %   Precision of communication?
1102
1103 % \end{frame}
1104
1105
1106
1107 % \begin{frame}  \frametitle{Communication Requires Context}
1108 %   \framesubtitle{Intentions imply more than one might like...}
1109
1110 %   \begin{itemize}
1111 %   \item Consideration of what we might do
1112 %   \item Applications with related functionality
1113 %   \end{itemize}
1114 % \end{frame}
1115
1116
1117
1118 % \begin{frame}
1119 %   \frametitle{Design Patterns}
1120 %   \framesubtitle{Supporting Work-flow Transitions}
1121
1122 %   (joint work with H Wickham): The point of this research program is
1123 %   not to describe what to do at any particular stage of work, but to
1124 %   encourage researchers and practitioners to consider how
1125 %   information is translated and transferred between stages so that work
1126 %   is not lost.
1127
1128 %   Examples of stages in a work-flow:
1129 %   \begin{itemize}
1130 %   \item planning, execution, reporting;
1131 %   \item scoping, illustrative examples or counter examples, algorithmic construction,
1132 %     article writing.
1133 %   \item descriptive statistics, preliminary inferential analysis,
1134 %     model/assumption checking, final inferential analysis,
1135 %     communication of scientific results
1136 %   \end{itemize}
1137 %   Description of work-flows is essential to initiating discussions on
1138 %   quality/efficiency of approaches to work.
1139 % \end{frame}
1140
1141 % \section{Design Challenges}
1142
1143 % \begin{frame}
1144 %   \frametitle{Activities are enhanced by support}
1145
1146 %   \begin{itemize}
1147 %   \item Mathematical manipulation can be enhanced by symbolic
1148 %     computation
1149 %   \item Statistical programming can be enabled by examples and related
1150 %     algorithm implementation
1151 %   \item Datasets, to a limited extent, can self-describe.
1152 %   \end{itemize}
1153 % \end{frame}
1154
1155 % \begin{frame}
1156 %   \frametitle{Executable and Computable Science}
1157
1158 %   Use of algorithms and construction to describe how things work.
1159
1160 %   Support for agent-based approaches
1161 % \end{frame}
1162
1163
1164 % \begin{frame}
1165 %   \frametitle{What is Data?  Metadata?}
1166
1167 %   Data: what we've observed
1168
1169 %   MetaData: context for observations, enables semantics.
1170 % \end{frame}
1171
1172
1173
1174
1175 % % \begin{frame}[fragile]
1176 % %   \frametitle{Defining Variables}
1177 % %   \framesubtitle{Setting variables}
1178 % % \begin{verbatim}
1179 % % (setq <variable> <value>)
1180 % % \end{verbatim}
1181 % %   Example:
1182 % % \begin{verbatim}
1183 % % (setq ess-source-directory
1184 % %       "/home/rossini/R-src")
1185 % % \end{verbatim}
1186 % % \end{frame}
1187
1188 % % \begin{frame}[fragile]
1189 % %   \frametitle{Defining on the fly}
1190 % % \begin{verbatim}
1191 % % (setq ess-source-directory
1192 % %    (lambda () (file-name-as-directory
1193 % %          (expand-file-name
1194 % %            (concat (default-directory)
1195 % %                    ess-suffix "-src")))))
1196 % % \end{verbatim}
1197 % %   (Lambda-expressions are anonymous functions, i.e. ``instant-functions'')
1198 % % \end{frame}
1199
1200
1201 % % \begin{frame}[fragile]
1202 % %   \frametitle{Function Reuse}
1203 % %   By naming the function, we could make the previous example reusable
1204 % %   (if possible):
1205 % % \begin{verbatim}
1206 % % (defun my-src-directory ()
1207 % %       (file-name-as-directory
1208 % %          (expand-file-name
1209 % %            (concat (default-directory)
1210 % %                    ess-suffix "-src"))))
1211 % % \end{verbatim}
1212 % %   Example:
1213 % % \begin{verbatim}
1214 % % (setq ess-source-directory (my-src-directory))
1215 % % \end{verbatim}
1216 % % \end{frame}
1217
1218
1219 % % \begin{frame}
1220 % %   \frametitle{Equality Among Packages}
1221 % %   \begin{itemize}
1222 % %   \item more/less equal can be described specifically through
1223 % %     overriding imports.
1224 % %   \end{itemize}
1225 % % \end{frame}
1226
1227
1228 % \subsection<presentation>*{For Further Reading}
1229
1230 % \begin{frame}[allowframebreaks]
1231 %   \frametitle<presentation>{Related Material}
1232
1233 %   \begin{thebibliography}{10}
1234
1235 %   \beamertemplatebookbibitems
1236 %   % Start with overview books.
1237
1238 %   \bibitem{LispStat1990}
1239 %     L.~Tierney
1240 %     \newblock {\em LispStat}.
1241
1242 %   \beamertemplatearticlebibitems
1243 %   % Followed by interesting articles. Keep the list short.
1244
1245 %   \bibitem{Rossini2001}
1246 %     AJ.~Rossini
1247 %     \newblock Literate Statistical Practice
1248 %     \newblock {\em Proceedings of the Conference on Distributed
1249 %       Statistical Computing}, 2001.
1250
1251 %   \bibitem{RossiniLeisch2003}
1252 %     AJ.~Rossini and F.~Leisch
1253 %     \newblock Literate Statistical Practice
1254 %     \newblock {\em Technical Report Series, University of Washington
1255 %       Department of Biostatistics}, 2003.
1256
1257 %   \beamertemplatearrowbibitems
1258 %   % Followed by interesting articles. Keep the list short.
1259
1260 %   \bibitem{CLS}
1261 %     Common Lisp Stat, 2008.
1262 %     \newblock \url{http://repo.or.cz/CommonLispStat.git/}
1263
1264 %   \end{thebibliography}
1265 % \end{frame}