Doc/talks/Rossini-DSC-July2009.tex

   1 \documentclass{beamer}
   2
   3 \mode<presentation>
   4 {
   5   \usetheme{classic}
   6   \setbeamercovered{transparent}
   7 }
   8
   9 \usepackage[english]{babel}
  10 \usepackage[latin1]{inputenc}
  11 \usepackage{times}
  12 \usepackage[T1]{fontenc}
  13 \usepackage{url}
  14
  15 \title[CLS]{Common Lisp Statistics}
  16 \subtitle{Using History to design better data analysis environments}
  17 \author[Rossini]{Anthony~(Tony)~Rossini}
  18
  19 \institute[Novartis and University of Washington]{
  20   Group Head, Modeling and Simulation Statistics\\
  21   Novartis Pharma AG, Switzerland
  22   \and
  23   Affiliate Assoc Prof, Biomedical and Health Informatics\\
  24   University of Washington, USA}
  25
  26 \date[DSC2009]{DSC 2009, Copenhagen}
  27 \subject{Statistical Computing Environments}
  28
  29 \begin{document}
  30
  31 \begin{frame}
  32   \titlepage
  33 \end{frame}
  34
  35 % Structuring a talk is a difficult task and the following structure
  36 % may not be suitable. Here are some rules that apply for this
  37 % solution:
  38
  39 % - Exactly two or three sections (other than the summary).
  40 % - At *most* three subsections per section.
  41 % - Talk about 30s to 2min per frame. So there should be between about
  42 %   15 and 30 frames, all told.
  43
  44 % - A conference audience is likely to know very little of what you
  45 %   are going to talk about. So *simplify*!
  46 % - In a 20min talk, getting the main ideas across is hard
  47 %   enough. Leave out details, even if it means being less precise than
  48 %   you think necessary.
  49 % - If you omit details that are vital to the proof/implementation,
  50 %   just say so once. Everybody will be happy with that.
  51
  52 \section{What Works?}
  53 \label{sec:work}
  54
  55 \begin{frame}{Is it Vaporware?}
  56
  57   Not quite...
  58 \end{frame}
  59
  60 \subsection{Graphics}
  61 \label{sec:work:graphics}
  62
  63 \begin{frame}{Silly Visualization Example}
  64 \includegraphics[width=3in,height=3in]{/home/tony/test1.png}
  65 \end{frame}
  66
  67 \begin{frame}[fragile]{Graphics Device}
  68 \begin{verbatim}
  69 (defparameter *frame2*
  70    (as-frame (create-xlib-image-context 200 200)
  71             :background-color +white+))
  72 (bind ((#2A((f1 f2) (f3 f4))
  73         (split-frame *frame2*
  74                      (percent 50)
  75                      (percent 50))))
  76   (defparameter *f1* f1) ; lower left
  77   (defparameter *f2* f2) ; lower right  f3  f4
  78   (defparameter *f3* f3) ; top left     f1  f2
  79   (defparameter *f4* f4)); top right
  80 \end{verbatim}
  81 \end{frame}
  82
  83 \begin{frame}[fragile]{Functions to Plot}
  84 \begin{verbatim}
  85 (plot-function *f1* #'sin
  86   (interval-of 0 2)
  87   :x-title "x" :y-title "sin(x)")
  88 (plot-function *f2* #'cos (interval-of 0 2)
  89   :x-title "x" :y-title "cos(x)")
  90 (plot-function *f3* #'tan (interval-of 0 2)
  91   :x-title "x" :y-title "tan(x)")
  92 \end{verbatim}
  93 \end{frame}
  94
  95 \begin{frame}[fragile]{Things to Plot}
  96 \small{
  97 \begin{verbatim}
  98 (let* ((n 500)
  99        (xs (num-sequence
 100              :from 0 :to 10 :length n))
 101        (ys (map 'vector
 102               #'(lambda (x) (+ x 8 (random 4.0)))
 103               xs))
 104        (weights
 105           (replicate #'(lambda () (1+ (random 10)))
 106                      n 'fixnum))
 107        (da (plot-simple *f4*
 108              (interval-of 0 10)
 109              (interval-of 10 20)
 110              :x-title "x" :y-title "y")))
 111   (draw-symbols da xs ys :weights weights))
 112 \end{verbatim}
 113 }
 114 \end{frame}
 115
 116 \begin{frame}[fragile]{Copying existing graphics}
 117   And we generated the figure on the first page by:
 118 \begin{verbatim}
 119 (xlib-image-context-to-png
 120    (context *f1*)
 121    "/home/tony/test1.png")
 122 \end{verbatim}
 123 \end{frame}
 124
 125 \subsection{Statistical Models}
 126 \label{sec:work:statmod}
 127
 128 \begin{frame}[fragile]{Linear Regression}
 129   Primitive LispStat, a wrapper around LAPACK's \texttt{dgelsy}:
 130 \small{
 131 \begin{verbatim}
 132 (defparameter *result1*
 133    (regression-model
 134        (list->vector-like iron)
 135        (list->vector-like absorbtion)))
 136 *result1* =>
 137 \end{verbatim}
 138 }
 139 \end{frame}
 140
 141 \subsection{Numerical Descriptions}
 142 \label{sec:work:numdesc}
 143
 144 \begin{frame}[fragile]{Descriptives}
 145   (mean iron)
 146
 147 \end{frame}
 148
 149 \subsection{Data Manip/Mgmt}
 150 \label{sec:work:data}
 151
 152 \begin{frame}[fragile]{DataFrames}
 153
 154 \end{frame}
 155
 156 \begin{frame}[fragile]{Numerical Matrices}
 157
 158 \end{frame}
 159
 160 \begin{frame}{Managing / Manipulating Data}
 161
 162 \end{frame}
 163
 164
 165 \begin{frame}{Outline}
 166   \tableofcontents
 167 \end{frame}
 168
 169 \section{Common Lisp Statistics}
 170 \label{sec:CLS}
 171
 172 \begin{frame}{Why CLS?}
 173   \begin{itemize}
 174   \item a component-based structure for statistical computing
 175   \item Common Lisp provides a simple, \emph{primitive}, syntax
 176   \item Common Lisp provides an amazing number of advanced features
 177     that keep getting reinvented in other languages.
 178   \item Common Lisp has linkages to many amazing features developed in
 179     other languages.
 180   \item ability to leverage non-statisticians interested in computing
 181     technologies (compilers, protocols, interfaces, libraries,
 182     functionality which can be reused for statistical purposes)
 183   \item This is a ``customization'' through packages to support
 184     statistical computing, not an independent language.  ``Ala Carte'',
 185     not ``Menu''.
 186   \end{itemize}
 187 \end{frame}
 188
 189 \subsection{Implementation Plans}
 190 \label{sec:CLS:impl}
 191
 192
 193 \begin{frame}{Current Functionality}
 194   \begin{itemize}
 195   \item basic dataframes (similar to R); indexing/slicing API under
 196     development.
 197   \item Basic regression (similar to XLispStat)
 198   \item matrix storage both in foreign and lisp-centric areas.
 199   \item LAPACK (small percentage, increasing), working with both
 200     matrix storage types
 201   \item static graphics (X11) including preliminary grid functionality based
 202     on CAIRO.  Generation of PNG files from graphics windows.
 203   \item CSV file support
 204   \item Common Lisp!
 205   \end{itemize}
 206 \end{frame}
 207
 208 \begin{frame}[fragile]{Computational Environment Supported}
 209   \begin{itemize}
 210   \item Should  work on Linux, with recent SBCL versions
 211   \item Definitely works on bleeding edge Debian (unstable).
 212   \item Has worked for weak definitions of ``work'' on 4 different
 213     people's environments (not quite, but sort of requires a
 214     \verb+/home/tony/+ !)
 215   \item Threaded support on threaded lisps (SBCL/CCL, soon CLISP).
 216     But not yet integrated.
 217   \end{itemize}
 218 \end{frame}
 219
 220 \begin{frame}{Goals}
 221   Short Term
 222   \begin{itemize}
 223   \item Better integration of data structures with statistical routines
 224     (auto-handling with dataframes, rather than manual parsing).
 225   \end{itemize}
 226   Medium/Long Term
 227   \begin{itemize}
 228   \item Support for CLISP (byte-compiled interpreted lisp) and Clozure
 229     Common Lisp (formerly OpenMCL)
 230   \item high-level Front-end API to a number of matrix and numerical
 231     packages and numerical structures
 232   \item constraint system for interactive GUIs and graphics
 233   \item full LispStat compatibility (object system partially works;
 234     GUI support coming).
 235   \item Integrated threading via Bordeaux threads (portable CL API package).
 236   \end{itemize}
 237 \end{frame}
 238
 239 \subsection{Common Lisp}
 240 \label{sec:CLS:lisp}
 241
 242 \begin{frame}{Common Lisp}
 243   \begin{itemize}
 244   \item Lisp-2 (symbols can denote both a separate function and a value)
 245   \item ANSI standard (built by committee, but the committee was
 246     reasonably smart)
 247   \item Many implementations
 248   \item Most implementations are interactive \textbf{compiled}
 249     languages (few are interpreted, and those are usually
 250     byte-compiled).
 251   \item Parens provide clear delineation of a \textbf{Complete
 252       Thought} (functional programming with side effects).
 253   \item The Original \emph{Programming with Data} Language
 254     (\emph{Programs are Data} and \emph{Data are Executable} also
 255     apply).
 256   \item advanced, powerful, first-class macros (macros functionally
 257     re-write code)
 258   \item
 259   \end{itemize}
 260 \end{frame}
 261
 262 \begin{frame}{Common Lisp Packages}
 263   (They are packages and called packages, not libraries.  Some people
 264   can rejoice!)
 265   \begin{itemize}
 266   \item infrastructure enhancement:  infix-notation, data structures,
 267     control and flow structures
 268   \item numerics, graphics, GUIs,
 269   \item primitive R to CL compiler (which could also be considered an
 270     object-code compiler for R); 3 interfaces which embed R within CL.
 271   \item
 272   \end{itemize}
 273   See \url{http://www.common-lisp.net/} and
 274   \url{http://www.cliki.org/}.  CLS sources can be found on
 275   \url{http://github.com/blindglobe/}
 276 \end{frame}
 277
 278 \section{What else about CLS is still Vaporware?}
 279
 280 \begin{frame}{What does NOT work?}
 281   Primarily, the reason that we are doing this:
 282
 283   \textbf{Computable and Executable Statistics}
 284
 285   (which is the subject of another talk, slides in the backup).
 286 \end{frame}
 287
 288
 289
 290 \section{Discussion}
 291
 292 \begin{frame}{Conclusion}
 293   Active but slow development, spanning the range of needs:
 294   \begin{itemize}
 295   \item Numerics: Linear algebra basics done -- full development
 296   \item Static graphics: progress being made, have a partial
 297     grid-solution, need interactive graphics
 298   \item LispStat emulation needs to be finished
 299   \item Model specification and unification
 300   \end{itemize}
 301   Related numerical/statistical projects:
 302   \begin{itemize}
 303   \item Incanter : R/LispStat/Omegahat-like system for Clojure (Lisp
 304     on the JVM)
 305   \item FEMLisp : system/workshop for finite-element analysis modeling
 306     using Lisp
 307   \item matlisp/LispLab : LAPACK-based numerical linear algebra packages
 308   \item GSLL : GNU Scientific Library, Lisp interface.
 309   \end{itemize}
 310   Finally: support for a new statistical programming environment
 311   modality (subject for another talk).
 312 \end{frame}
 313
 314
 315 \end{document}
 316
 317 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 318
 319
 320
 321 \section{BACKUPS}
 322
 323
 324 \section{Common Lisp}
 325
 326 \begin{frame}[fragile]{Finding out things}
 327   \begin{itemize}
 328   \item CL-NUMLIB
 329      num-sequence :from LOW :to HIGH :length SEQ-LENGTH
 330      seq(from,to,by/length)
 331    \item
 332 \begin{verbatim}
 333 (documentation
 334      'cl-numlib:num-sequence
 335      'function)
 336 \end{verbatim}
 337    \item This
 338   \end{itemize}
 339 \end{frame}
 340
 341 \section{Computable Statistics}
 342
 343 \begin{frame}{Can we compute with them?}
 344   3 Examples, of which we only present the first
 345   \begin{itemize}
 346   \item Research.
 347   \item Consulting, Applied Statistics, Scientific Honesty.
 348   \item Reimplementation.
 349   \end{itemize}
 350   Consider whether one can ``compute'' with the information given?
 351   (that is:
 352   \begin{itemize}
 353   \item do we have sufficient information to communicate enough
 354     for the right person to recreate the analysis?
 355   \item have we sufficient clarity to prevent misunderstandings about
 356     intentions and claims?
 357   \end{itemize}
 358   )
 359 \end{frame}
 360
 361 \begin{frame}[fragile]{Example 1: Theory\ldots}
 362   \label{example1}
 363   Let $f(x;\theta)$ describe the likelihood of XX under the following
 364   assumptions.
 365   \begin{enumerate}
 366   \item assumption-1
 367   \item assumption-2
 368   \end{enumerate}
 369   Then if we use the following algorithm:
 370   \begin{enumerate}
 371   \item step-1
 372   \item step-2
 373   \end{enumerate}
 374   then $\hat{\theta}$ should be $N(0,\hat\sigma^2)$ with the following
 375   characteristics\ldots
 376 \end{frame}
 377
 378 \begin{frame}
 379   \frametitle{Can we compute, using this description?}
 380   Given the information at hand:
 381   \begin{itemize}
 382   \item we ought to have a framework for initial coding for the
 383     actual simulations (test-first!)
 384   \item the implementation is somewhat clear
 385   \item We should ask: what theorems have similar assumptions?
 386   \item We should ask: what theorems have similar conclusions but
 387     different assumptions?
 388   \end{itemize}
 389 \end{frame}
 390
 391 \begin{frame}[fragile]{Realizing Theory}
 392 \small{
 393 \begin{verbatim}
 394 (define-theorem my-proposed-theorem
 395    (:theorem-type '(distribution-properties
 396                     frequentist
 397                     likelihood))
 398    (:assumes '(assumption-1 assumption-2))
 399    (:likelihood-form
 400       (defun likelihood (data theta gamma)
 401         (exponential-family theta gamma)))
 402    (:compute-by
 403       '(progn
 404          (compute-starting-values thetahat gammahat)
 405          (until (convergence)
 406            (setf convergence
 407                  (or (step-1 thetahat)
 408                      (step-2 gammahat))))))
 409    (:claim (assert
 410              (and (equal-distribution thetahat 'normal)
 411                   (equal-distribution gammahat 'normal)))))
 412 \end{verbatim}
 413 }
 414 \end{frame}
 415
 416 \begin{frame}[fragile]{It would be nice to have}
 417 \begin{verbatim}
 418    (theorem-veracity 'my-proposed-theorem)
 419 \end{verbatim}
 420 \end{frame}
 421
 422 \begin{frame}[fragile]{and why not...?}
 423 \begin{verbatim}
 424    (when (theorem-veracity
 425               'my-proposed-theorem)
 426       (write-paper 'my-proposed-theorem
 427                    :style :JASA
 428                    :output-format
 429                          '(LaTeX MSWord)))
 430 \end{verbatim}
 431 \end{frame}
 432
 433 \begin{frame}{Comments}
 434   \begin{itemize}
 435   \item The general problem is very difficult
 436   \item I'm working on some basic statistical proof of concepts (not
 437     finished): linear regression (LS-based, Normal-bayesian) and the
 438     T-test.
 439   \item Areas targeted for medium-term future: resampling methods and
 440     similar algorithms.
 441   \end{itemize}
 442 \end{frame}
 443
 444 \begin{frame}
 445   \frametitle{Example 2: Practice\ldots}
 446   \label{example2}
 447   The dataset comes from a series of clinical trials.  We model the
 448   primary endpoint, ``relief'', as a binary random variable.  There is
 449   a random trial effect on relief as well as severity due to
 450   differences in recruitment and inclusion/exclusion criteria.
 451 \end{frame}
 452
 453 \begin{frame}
 454   \frametitle{Can we compute, using this description?}
 455   \begin{itemize}
 456   \item With a real such description, it is clear what some of the
 457     potential models might be for this dataset
 458   \item It should be clear how to start thinking of a data dictionary
 459     for this problem.
 460   \end{itemize}
 461 \end{frame}
 462
 463 \begin{frame}[fragile]{Can we compute?}
 464 \begin{verbatim}
 465   (dataset-metadata paper-1
 466     :context 'clinical-trials
 467     :variables '((relief :model-type dependent
 468                          :distribution binary)
 469                  (trial  :model-type independent
 470                          :distribution categorical)
 471                  (disease-severity))
 472     :metadata '(inclusion-criteria
 473                 exclusion-criteria
 474                 recruitment-rate))
 475   (propose-analysis paper-1)
 476      ; => '(tables
 477      ;      (logistic regression))
 478 \end{verbatim}
 479 \end{frame}
 480
 481 \begin{frame}{Example 3: The Round-trip\ldots}
 482   \label{example3}
 483   The first examples describe ``ideas $\rightarrow$ code''
 484
 485   Consider the last time you read someone else's implementation of a
 486   statistical procedure (i.e. R package code).  When you read the
 487   code, could you see:
 488   \begin{itemize}
 489   \item the assumptions used?
 490   \item the algorithm implemented?
 491   \item practical guidance for when you might select the algorithm
 492     over others?
 493   \item practical guidance for when you might select the
 494     implementation over others?
 495   \end{itemize}
 496   These are usually components of any reasonable journal article.
 497   \textit{(Q: have you actually read an R package that wasn't yours?)}
 498 \end{frame}
 499
 500 \begin{frame}{Exercise left to the reader!}
 501
 502 %   (aside: I have been looking at the \textbf{stats} and \textbf{lme4}
 503 %   packages recently -- \textit{for me}, very clear numerically, much
 504 %   less so statistically)
 505 \end{frame}
 506
 507
 508
 509 \section{Context}
 510
 511 \begin{frame}{Goals for this Talk}{(define, strategic approach,
 512     justify)}
 513
 514   \begin{itemize}
 515   \item To describe the concept of \alert{computable and executable
 516       statistics}, placing it in a historical context.
 517
 518   \item To demonstrate that \alert{a research program}
 519     implemented through  simple steps can increase the efficiency  of
 520     statistical computing approaches by  clearly describing both:
 521     \begin{itemize}
 522     \item numerical characteristics of procedures,
 523     \item statistical concepts driving them.
 524     \end{itemize}
 525
 526   \item To justify that the \alert{approach is worthwhile} and
 527     represents a staged effort towards \alert{increased use of best
 528       practices}.
 529   \end{itemize}
 530   (unfortunately, the last is still incomplete)
 531 \end{frame}
 532
 533
 534 \begin{frame}{Historical Computing Languages}
 535   \begin{itemize}
 536   \item FORTRAN : FORmula TRANslator.  Original numerical computing
 537     language, designed for clean implementation of numerical
 538     algorithms
 539   \item LISP : LISt Processor.  Associated with symbolic
 540     manipulation, AI, and knowledge approaches
 541   \end{itemize}
 542
 543   They represent the 2 generalized needs of statistical computing,
 544   which could be summarized as
 545   \begin{itemize}
 546   \item algorithms/numerics,
 547   \item elicitation, communication, and generation of knowledge (``data
 548     analysis'')
 549   \end{itemize}
 550 \end{frame}
 551
 552 \begin{frame}{Statistical Computing Environments}
 553
 554   Past:
 555   \begin{itemize}
 556   \item SPSS / BMDP / SAS
 557   \item S ( S, S-PLUS, R)
 558   \item LispStat ( XLispStat,  ViSta, ARC , CommonLispStat ) ; QUAIL
 559   \item XGobi (Orca / GGobi / Statistical Reality Engine)
 560   \item MiniTab
 561   \item Stata
 562   \item DataDesk
 563   \item Augsburg Impressionist series (MANET, \ldots)
 564   \item Excel
 565   \end{itemize}
 566   many others...
 567
 568 \end{frame}
 569
 570 \begin{frame}{How many are left?}
 571
 572   \begin{itemize}
 573   \item R
 574   \item SAS
 575   \item SPSS
 576   \item Stata
 577   \item Minitab
 578   \item very few others...
 579   \end{itemize}
 580   ``R is the Microsoft of the statistical computing world'' -- anonymous.
 581 \end{frame}
 582
 583 \begin{frame}{Selection Pressure}
 584   \begin{itemize}
 585   \item the R user population is growing rapidly, fueled by critical
 586     mass, quality, and value
 587   \item R is a great system for applied data analysis
 588   \item R is not such a great system for research into statistical
 589     computing (backwards compatibility, inertia due to user population)
 590   \end{itemize}
 591   There is a need for alternative experiments for developing new
 592   approaches/ideas/concepts.
 593 \end{frame}
 594
 595 \begin{frame}{Philosophically, why Common Lisp?}
 596   Philosophically:
 597   \begin{itemize}
 598   \item Lisp can cleanly present computational intentions, both
 599     symbolically and numerically.
 600   \item Semantics and context are important: well supported by Lisp
 601     paradigms.
 602   \item Lisp's parentheses describe singular, multi-scale,
 603     \alert{complete thoughts}.
 604   \end{itemize}
 605
 606 \end{frame}
 607
 608 \begin{frame}{Technically, why Common Lisp?}
 609   \begin{itemize}
 610   \item interactive COMPILED language (``R with a compiler'')
 611   \item CLOS is R's S4 object system ``done right''.
 612   \item clean semantics: modality, typing, can be expressed the way
 613     one wants it.
 614   \item programs are data, data are programs, leading to
 615   \item Most modern computing tools available (XML, WWW technologies)
 616   \item ``executable XML''
 617   \end{itemize}
 618   Common Lisp is very close in usage to how people currently use R
 619   (mostly interactive, some batch, and a wish for compilation efficiency).
 620 \end{frame}
 621
 622 \subsection{Background}
 623
 624 \begin{frame}
 625   \frametitle{Desire: Semantics and Statistics}
 626   \begin{itemize}
 627   \item The semantic web (content which is self-descriptive) is an
 628     interesting and potentially useful idea.
 629
 630   \item
 631     Biological informatics support (GO, Entrez) has allowed for
 632     precise definitions of concepts in biology.
 633
 634   \item It is a shame that a field like statistics, requiring such
 635     precision, has less than an imprecise and temporally instable
 636     field such as biology\ldots
 637   \end{itemize}
 638
 639   How can we express statistical work (research, applied work) which
 640   is both human and computer readable (perhaps subject to
 641   transformations first)?
 642 \end{frame}
 643
 644
 645 % \subsection{Context}
 646
 647 % \begin{frame}{Context}{(where I'm coming from, my ``priors'')}
 648 %   \begin{itemize}
 649 %   \item Pharmaceutical Industry
 650 %   \item Modeling and Simulation uses mathematical models/constructs to
 651 %     record beliefs (biology, pharmacology, clinical science) for
 652 %     explication, clinical team alignment, decision support, and
 653 %     quality.
 654 %   \item My work at Novartis is at the intersection of biomedical
 655 %     informatics, statistics, and mathematical modeling.
 656 %   \item As manager: I need a mix of applications and novel research development to
 657 %     solve our challenges better, faster, more efficiently.
 658 %   \item Data analysis is a specialized approach to computer
 659 %     programming, \alert{different} than applications programming or
 660 %     systems programming.
 661 %   \end{itemize}
 662 % \end{frame}
 663
 664
 665 \subsection{Literate Programming is insufficient}
 666
 667 \begin{frame}{Literate Statistical Practice.}
 668   \begin{enumerate}
 669   \item Literate Programming applied to data analysis (Rossini, 1997/2001)
 670   \item among the \alert{most annoying} techniques to integrate into
 671     work-flow if one is not perfectly methodical.
 672   \item Some tools:
 673     \begin{itemize}
 674     \item ESS: supports interactive creation of literate programs.
 675     \item Sweave: tool which exemplifies reporting context; odfWeave
 676       primarily simplifies reporting.
 677     \item Roxygen: primarily supports a literate programming
 678       documentation style, not a literate data analysis programming
 679       style.
 680   \end{itemize}
 681   \item ROI demonstrated in specialized cases: BioConductor.
 682   \item \alert{usually done after the fact} (final step of work-flow)
 683     as a documentation/computational reproducibility technique, rarely
 684     integrated into work-flow.
 685   \end{enumerate}
 686   Many contributors:
 687   Knuth, Claerbout, Carey, de Leeuw, Leisch, Gentleman, Temple-Lang,
 688   \ldots{}
 689 \end{frame}
 690
 691 \begin{frame}
 692   \frametitle{Literate Programming}
 693   \framesubtitle{Why isn't it enough for Data Analysis?}
 694
 695   Only 2 contexts: (executable) code and documentation.  Fine for
 696   application programming,  but for data analysis, we could benefit
 697   from:
 698   \begin{itemize}
 699   \item classification of statistical procedures
 700   \item descriptions of assumptions
 701   \item pragmatic recommendations
 702   \item inheritance of structure through the work-flow of a
 703     statistical methodology or data analysis project
 704   \item datasets and metadata
 705   \end{itemize}
 706   Concept: ontologies describing mathematical assumptions, applications
 707   of methods, work-flow, and statistical data structures can enable
 708   machine communication.
 709
 710   (i.e. informatics framework ala biology)
 711 \end{frame}
 712
 713
 714 \begin{frame}{Communication in Statistical Practice}{\ldots is essential for \ldots}
 715   \begin{itemize}
 716   \item finding
 717   \item explanations
 718   \item agreement
 719   \item receiving information
 720   \end{itemize}
 721   \alert{``machine-readable'' communication/computation lets the
 722     computer help} \\
 723   Semantic Web is about ``machine-enabled computability''.
 724 \end{frame}
 725
 726 \begin{frame}  \frametitle{Semantics}
 727   \framesubtitle{One definition: description and context}
 728
 729   Interoperability is the key, with respect to
 730   \begin{itemize}
 731   \item ``Finding things''
 732   \item Applications and activities with related functionality
 733     \begin{itemize}
 734     \item moving information from one state to another (paper, journal
 735       article, computer program)
 736     \item computer programs which implement solutions to similar tasks
 737     \end{itemize}
 738   \end{itemize}
 739 \end{frame}
 740
 741
 742 \begin{frame}{Statistical Practice is somewhat restricted}
 743   {...but in a good sense, enabling potential for semantics...}
 744
 745   There is a restrictable set of intended actions for what can be done
 746   -- the critical goal is to be able to make a difference by
 747   accelerating activities that should be ``computable'':
 748   \begin{itemize}
 749   \item restricted natural language processing
 750   \item mathematical translation
 751   \item common description of activities for simpler programming/data
 752     analysis (S approach to objects and methods)
 753   \end{itemize}
 754   R is a good basic start (model formulation approach, simple
 755   ``programming with data'' paradigm); we should see if we can do
 756   better!
 757 \end{frame}
 758
 759 \begin{frame}{Computable and Executable Statistics requires}
 760
 761   \begin{itemize}
 762   \item approaches to describe data and metadata (``data'')
 763     \begin{itemize}
 764     \item semantic WWW
 765     \item metadata management and integration, driving
 766     \item data integration
 767     \end{itemize}
 768   \item approaches to describe data analysis methods (``models'')
 769     \begin{itemize}
 770     \item quantitatively: many ontologies (AMS, etc), few meeting
 771       statistical needs.
 772     \item many substantive fields have implementations
 773       (bioinformatics, etc) but not well focused.
 774     \end{itemize}
 775   \item approaches to describe the specific form of interaction
 776     (``instances of models'')
 777     \begin{itemize}
 778     \item Original idea behind ``Literate Statistical Analysis''.
 779     \item That idea is suboptimal, more structure needed (not
 780       necessarily built upon existing...).
 781     \end{itemize}
 782   \end{itemize}
 783 \end{frame}
 784
 785 \subsection{Common Lisp Statistics}
 786
 787 \begin{frame}
 788   \frametitle{Interactive Programming}
 789   \framesubtitle{Everything goes back to being Lisp-like}
 790   \begin{itemize}
 791   \item Interactive programming (as originating with Lisp): works
 792     extremely well for data analysis (Lisp being the original
 793     ``programming with data'' language).
 794   \item Theories/methods for how to do this are reflected in styles
 795     for using R.
 796   \end{itemize}
 797 \end{frame}
 798
 799 \begin{frame}[fragile]
 800   \frametitle{Lisp}
 801
 802   Lisp (LISt Processor) is different than most high-level computing
 803   languages, and is very old (1958).  Lisp is built on lists of things
 804   which are evaluatable.
 805 \begin{verbatim}
 806 (functionName data1 data2 data3)
 807 \end{verbatim}
 808   or ``quoted'':
 809 \begin{verbatim}
 810 '(functionName data1 data2 data3)
 811 \end{verbatim}
 812   which is shorthand for
 813 \begin{verbatim}
 814 (list functionName data1 data2 data3)
 815 \end{verbatim}
 816   The difference is important -- lists of data (the second/third) are
 817   not (yet?!) functions applied to (unencapsulated lists of) data (the first).
 818 \end{frame}
 819
 820 \begin{frame}
 821   \frametitle{Features}
 822   \begin{itemize}
 823   \item Data and Functions semantically the same
 824   \item Natural interactive use through functional programming with
 825     side effects
 826   \item Batch is a simplification of interactive -- not a special mode!
 827   \end{itemize}
 828 \end{frame}
 829
 830
 831
 832 \begin{frame}[fragile]{Representation: XML and Lisp}{executing your data}
 833   Many people are familiar with XML:
 834 \begin{verbatim}
 835 <name phone="+41793674557">Tony Rossini</name>
 836 \end{verbatim}
 837   which is shorter in Lisp:
 838 \begin{verbatim}
 839 (name "Tony Rossini" :phone "+41613674557")
 840 \end{verbatim}
 841   \begin{itemize}
 842   \item Lisp ``parens'', universally hated by unbelievers, are
 843     wonderful for denoting when a ``concept is complete''.
 844   \item Why can't your data self-execute?
 845   \end{itemize}
 846 \end{frame}
 847
 848 \begin{frame}[fragile]{Numerics with Lisp}
 849   \begin{itemize}
 850   \item addition of rational numbers and arithmetic
 851   \item example for mean
 852 \begin{verbatim}
 853  (defun mean (x)
 854     (checktype x 'vector-like)
 855     (/ (loop for i from 0 to (- (nelts x) 1)
 856           summing (vref x i))
 857        (nelts x)))
 858 \end{verbatim}
 859   \item example for variance
 860 \begin{verbatim}
 861 (defun variance (x)
 862   (let ((meanx (mean x))
 863         (nm1 (1- (nelts x))))
 864      (/ (loop for i from 0 to nm1
 865            summing (power (- (vref x i) meanx) 2))
 866         nm1)))
 867 \end{verbatim}
 868   \item But through macros, \verb+(vref x i)+ could be
 869     \verb+#V(X[i])+ or your favorite syntax.
 870   \end{itemize}
 871
 872 \end{frame}
 873
 874
 875 \begin{frame}{Common Lisp Statistics 1}
 876   \begin{itemize}
 877   \item Originally based on LispStat (reusability)
 878   \item Re-factored structure (some numerics worked with a 1990-era code base).
 879   \item Current activities:
 880     \begin{enumerate}
 881     \item numerics redone using CFFI-based BLAS/LAPACK (cl-blapack)
 882     \item matrix interface based on MatLisp
 883     \item starting design of a user interface system (interfaces,
 884       visuals).
 885     \item general framework for model specification (regression,
 886       likelihood, ODEs)
 887     \item general framework for algorithm specification (bootstrap,
 888       MLE, algorithmic data analysis methods).
 889     \end{enumerate}
 890   \end{itemize}
 891 \end{frame}
 892
 893 \begin{frame}{Common Lisp Statistics 2}
 894
 895   \begin{itemize}
 896   \item Implemented using SBCL.  Contributed fixes for
 897     Clozure/OpenMCL. Goal to target CLISP
 898   \item Supports LispStat prototype object system
 899   \item Package-based design -- only use the components you need, or
 900     the components whose API you like.
 901   \end{itemize}
 902 \end{frame}
 903
 904 \section{Discussion}
 905
 906 \begin{frame}
 907   \frametitle{Outlook}
 908   \begin{itemize}
 909   \item Semantics and Computability have captured a great deal of
 910     attention in the informatics and business computing R\&D worlds
 911   \item Statistically-driven Decision Making and Knowledge Discovery
 912     is, with high likelihood, the next challenging stage after data
 913     integration.
 914   \item Statistical practice (theory and application) can be enhanced,
 915     made more efficient, providing  increased benefit to organizations
 916     and groups using appropriate methods.
 917   \item Lisp, as a language, shares characteristics of both Latin
 918     (difficult dead language useful for classical training) and German
 919     (difficult living language useful for general life).  Of course,
 920     for some people, they are not difficult.
 921   \end{itemize}
 922
 923 \end{frame}
 924
 925 \begin{frame}
 926   The research program described in this talk is currently driving the
 927   design of CommonLisp Stat, which leverages concepts and approaches
 928   from the dead and moribund LispStat project.
 929
 930   \begin{itemize}
 931   \item \url{http://repo.or.cz/w/CommonLispStat.git/}
 932   \item \url{http://www.github.com/blindglobe/}
 933   \end{itemize}
 934
 935 \end{frame}
 936 \begin{frame}{Final Comment}
 937
 938   \begin{itemize}
 939   \item In the Pharma industry, it is all about getting the right
 940     drugs to the patient faster.  Data analysis systems seriously
 941     impact this process, being potentially an impediment or an
 942     accelerator.
 943
 944     \begin{itemize}
 945     \item \alert{Information technologies can increase the efficiency
 946         of statistical practice}, though innovation change management
 947       must be taken into account (i.e., statistical practice, while
 948       considered by some an ``art form'', can benefit from
 949       industrialization).
 950     \item \alert{Lisp's features match the basic requirements we need}
 951       (dichotomy: programs as data, data as programs).  Sales pitch,
 952       though...
 953     \item Outlook: Lots of work and experimentation to do!
 954     \end{itemize}
 955   \end{itemize}
 956 \end{frame}
 957
 958
 959 % % All of the following is optional and typically not needed.
 960 % \appendix
 961
 962
 963 % \section<presentation>*{\appendixname}
 964
 965
 966 % \begin{frame} \frametitle{Complements and Backup}
 967 %   No more, stop here.  Questions?  (now or later).
 968 % \end{frame}
 969
 970 % \begin{frame}{The Industrial Challenge.}{Getting the Consulting Right.}
 971 %   % - A title should summarize the slide in an understandable fashion
 972 %   %   for anyone who does not follow everything on the slide itself.
 973
 974 %   \begin{itemize}
 975 %   \item Recording assumptions for the next data analyst, reviewer.
 976 %     Use \texttt{itemize} a lot.
 977 %   \item
 978 %     Use very short sentences or short phrases.
 979 %   \end{itemize}
 980 % \end{frame}
 981
 982
 983 % \begin{frame}{The Industrial Challenge.}{Getting the Right Research Fast.}
 984 %   % - A title should summarize the slide in an understandable fashion
 985 %   %   for anyone who does not follow everything on the slide itself.
 986
 987 %   \begin{itemize}
 988 %   \item
 989 %     Use \texttt{itemize} a lot.
 990 %   \item
 991 %     Use very short sentences or short phrases.
 992 %   \end{itemize}
 993 % \end{frame}
 994
 995
 996 % \begin{frame}{Explicating the Work-flow}{QA/QC-based improvements.}
 997
 998
 999 % \end{frame}
1000
1001 % \section{Motivation}
1002
1003 % \subsection{IT Can Speed up Deliverables in Statistical Practice}
1004
1005 % \begin{frame}{Our Generic Work-flow and Life-cycle}
1006 %   {describing most data analytic activities}
1007 %   Workflow:
1008 %   \begin{enumerate}
1009 %   \item Scope out the problem
1010 %   \item Sketch out a potential solution
1011 %   \item Implement until road-blocks appear
1012 %   \item Deliver results
1013 %   \end{enumerate}
1014
1015 %   Lifecycle:
1016 %   \begin{enumerate}
1017 %   \item paper sketch
1018 %   \item 1st e-draft of text/code/data (iterate to \#1, discarding)
1019 %   \item cycle through work
1020 %   \item publish
1021 %   \item ``throw-away''
1022 %   \end{enumerate}
1023 %   but there is valuable information that could enable the next
1024 %   generation!
1025 % \end{frame}
1026
1027 % \begin{frame}[fragile]{Paper $\rightarrow$ Computer  $\rightarrow$ Article $\rightarrow$ Computer}{Cut and Paste makes for large errors.}
1028 %   \begin{itemize}
1029 %   \item Problems in a regulatory setting
1030 %   \item Regulatory issues are just ``best practices''
1031 %   \end{itemize}
1032
1033 %   Why do we ``copy/paste'', or analogously, restart our work?
1034
1035 %   pro:
1036 %   \begin{itemize}
1037 %   \item every time we repeat, we reinforce the idea in our brain
1038 %   \item review of ideas can help improve them
1039 %   \end{itemize}
1040 %   con:
1041 %   \begin{itemize}
1042 %   \item inefficiency
1043 %   \item introduction of mistakes
1044 %   \item loss of historical context
1045 %   \item changes to earlier work (on a different development branch)
1046 %     can not propagate.
1047 %   \end{itemize}
1048 % \end{frame}
1049
1050 % \section{Semantics and Statistical Practice}
1051
1052
1053 % \begin{frame}
1054 %   \frametitle{Statistical Activity Leads to Reports}
1055 %   \framesubtitle{You read what you know, do you understand it?}
1056
1057 %   How can we improve the communication of the ideas we have?
1058
1059 %   Precision of communication?
1060
1061 % \end{frame}
1062
1063
1064
1065 % \begin{frame}  \frametitle{Communication Requires Context}
1066 %   \framesubtitle{Intentions imply more than one might like...}
1067
1068 %   \begin{itemize}
1069 %   \item Consideration of what we might do
1070 %   \item Applications with related functionality
1071 %   \end{itemize}
1072 % \end{frame}
1073
1074
1075
1076 % \begin{frame}
1077 %   \frametitle{Design Patterns}
1078 %   \framesubtitle{Supporting Work-flow Transitions}
1079
1080 %   (joint work with H Wickham): The point of this research program is
1081 %   not to describe what to do at any particular stage of work, but to
1082 %   encourage researchers and practitioners to consider how to handle the
1083 %   translation and transfer of information between stages so that work
1084 %   is not lost.
1085
1086 %   Examples of stages in a work-flow:
1087 %   \begin{itemize}
1088 %   \item planning, execution, reporting;
1089 %   \item scoping, illustrative examples or counter examples, algorithmic construction,
1090 %     article writing.
1091 %   \item descriptive statistics, preliminary inferential analysis,
1092 %     model/assumption checking, final inferential analysis,
1093 %     communication of scientific results
1094 %   \end{itemize}
1095 %   Description of work-flows is essential to initiating discussions on
1096 %   quality/efficiency of approaches to work.
1097 % \end{frame}
1098
1099 % \section{Design Challenges}
1100
1101 % \begin{frame}
1102 %   \frametitle{Activities are enhanced by support}
1103
1104 %   \begin{itemize}
1105 %   \item Mathematical manipulation can be enhanced by symbolic
1106 %     computation
1107 %   \item Statistical programming can be enabled by examples and related
1108 %     algorithm implementation
1109 %   \item Datasets, to a limited extent, can self-describe.
1110 %   \end{itemize}
1111 % \end{frame}
1112
1113 % \begin{frame}
1114 %   \frametitle{Executable and Computable Science}
1115
1116 %   Use of algorithms and construction to describe how things work.
1117
1118 %   Support for agent-based approaches
1119 % \end{frame}
1120
1121
1122 % \begin{frame}
1123 %   \frametitle{What is Data?  Metadata?}
1124
1125 %   Data: what we've observed
1126
1127 %   MetaData: context for observations, enables semantics.
1128 % \end{frame}
1129
1130
1131
1132
1133 % % \begin{frame}[fragile]
1134 % %   \frametitle{Defining Variables}
1135 % %   \framesubtitle{Setting variables}
1136 % % \begin{verbatim}
1137 % % (setq <variable> <value>)
1138 % % \end{verbatim}
1139 % %   Example:
1140 % % \begin{verbatim}
1141 % % (setq ess-source-directory
1142 % %       "/home/rossini/R-src")
1143 % % \end{verbatim}
1144 % % \end{frame}
1145
1146 % % \begin{frame}[fragile]
1147 % %   \frametitle{Defining on the fly}
1148 % % \begin{verbatim}
1149 % % (setq ess-source-directory
1150 % %    (lambda () (file-name-as-directory
1151 % %          (expand-file-name
1152 % %            (concat (default-directory)
1153 % %                    ess-suffix "-src")))))
1154 % % \end{verbatim}
1155 % %   (Lambda-expressions are anonymous functions, i.e. ``instant-functions'')
1156 % % \end{frame}
1157
1158
1159 % % \begin{frame}[fragile]
1160 % %   \frametitle{Function Reuse}
1161 % %   By naming the function, we could make the previous example reusable
1162 % %   (if possible):
1163 % % \begin{verbatim}
1164 % % (defun my-src-directory ()
1165 % %       (file-name-as-directory
1166 % %          (expand-file-name
1167 % %            (concat (default-directory)
1168 % %                    ess-suffix "-src"))))
1169 % % \end{verbatim}
1170 % %   Example:
1171 % % \begin{verbatim}
1172 % % (setq ess-source-directory (my-src-directory))
1173 % % \end{verbatim}
1174 % % \end{frame}
1175
1176
1177 % % \begin{frame}
1178 % %   \frametitle{Equality Among Packages}
1179 % %   \begin{itemize}
1180 % %   \item more/less equal can be described specifically through
1181 % %     overriding imports.
1182 % %   \end{itemize}
1183 % % \end{frame}
1184
1185
1186 % \subsection<presentation>*{For Further Reading}
1187
1188 % \begin{frame}[allowframebreaks]
1189 %   \frametitle<presentation>{Related Material}
1190
1191 %   \begin{thebibliography}{10}
1192
1193 %   \beamertemplatebookbibitems
1194 %   % Start with overview books.
1195
1196 %   \bibitem{LispStat1990}
1197 %     L.~Tierney
1198 %     \newblock {\em LispStat}.
1199
1200 %   \beamertemplatearticlebibitems
1201 %   % Followed by interesting articles. Keep the list short.
1202
1203 %   \bibitem{Rossini2001}
1204 %     AJ.~Rossini
1205 %     \newblock Literate Statistical Practice
1206 %     \newblock {\em Proceedings of the Conference on Distributed
1207 %       Statistical Computing}, 2001.
1208
1209 %   \bibitem{RossiniLeisch2003}
1210 %     AJ.~Rossini and F.~Leisch
1211 %     \newblock Literate Statistical Practice
1212 %     \newblock {\em Technical Report Series, University of Washington
1213 %       Department of Biostatistics}, 2003.
1214
1215 %   \beamertemplatearrowbibitems
1216 %   % Followed by interesting articles. Keep the list short.
1217
1218 %   \bibitem{CLS}
1219 %     Common Lisp Stat, 2008.
1220 %     \newblock \url{http://repo.or.cz/CommonLispStat.git/}
1221
1222 %   \end{thebibliography}
1223 % \end{frame}