\documentclass{beamer}

\mode<presentation>

\usetheme{classic}
\setbeamercovered{transparent}

\usepackage[english]{babel}
\usepackage[latin1]{inputenc}
\usepackage{times}
\usepackage[T1]{fontenc}

\title[CLS]{Common Lisp Statistics}
\subtitle{Using History to design better data analysis environments}
\author[Rossini]{Anthony~(Tony)~Rossini}

\institute[Novartis and University of Washington]{
  Group Head, Modeling and Simulation Statistics\\
  Novartis Pharma AG, Switzerland
  \and
  Affiliate Assoc Prof, Biomedical and Health Informatics\\
  University of Washington, USA}

\date[DSC2009]{DSC 2009, Copenhagen}
\subject{Statistical Computing Environments}

\begin{document}

\begin{frame}
  \titlepage
\end{frame}

\begin{frame}{Outline}
  \tableofcontents
\end{frame}
% Structuring a talk is a difficult task and the following structure
% may not be suitable. Here are some rules that apply for this
% solution:

% - Exactly two or three sections (other than the summary).
% - At *most* three subsections per section.
% - Talk about 30s to 2min per frame. So there should be between about
%   15 and 30 frames, all told.

% - A conference audience is likely to know very little of what you
%   are going to talk about. So *simplify*!
% - In a 20min talk, getting the main ideas across is hard
%   enough. Leave out details, even if it means being less precise than
%   you think necessary.
% - If you omit details that are vital to the proof/implementation,
%   just say so once. Everybody will be happy with that.
\section{What Works?}
\label{sec:work}

\begin{frame}{Is it Vaporware?}
  Not quite...
\end{frame}

\subsection{Graphics}
\label{sec:work:graphics}

\begin{frame}{Silly Visualization Example}
  \includegraphics[width=3in,height=3in]{/home/tony/test1.png}
\end{frame}
\begin{frame}[fragile]{Graphics Device}
\begin{verbatim}
(defparameter *frame2*
  (as-frame (create-xlib-image-context 200 200)
            :background-color +white+))

(bind ((#2A((f1 f2) (f3 f4))
        (split-frame *frame2*
                     (percent 50) (percent 50))))
  (defparameter *f1* f1)  ; bottom left
  (defparameter *f2* f2)  ; bottom right   f3 f4
  (defparameter *f3* f3)  ; top left       f1 f2
  (defparameter *f4* f4)) ; top right
\end{verbatim}
\end{frame}
\begin{frame}[fragile]{Functions to Plot}
\begin{verbatim}
(plot-function *f1* #'sin
               (interval-of 0 2)
               :x-title "x" :y-title "sin(x)")
(plot-function *f2* #'cos (interval-of 0 2)
               :x-title "x" :y-title "cos(x)")
(plot-function *f3* #'tan (interval-of 0 2)
               :x-title "x" :y-title "tan(x)")
\end{verbatim}
\end{frame}
\begin{frame}[fragile]{Things to Plot}
\begin{verbatim}
(let* ((n 500)
       (xs (num-sequence
            :from 0 :to 10 :length n))
       (ys (map 'vector
                #'(lambda (x) (+ x 8 (random 4.0)))
                xs))
       (weights
        (replicate #'(lambda () (1+ (random 10)))
                   n 'fixnum))
       (da (plot-simple *f4*
            (interval-of 0 10) (interval-of 10 20)
            :x-title "x" :y-title "y")))
  (draw-symbols da xs ys :weights weights))
\end{verbatim}
\end{frame}
\subsection{Statistical Models}
\label{sec:work:statmod}

\begin{frame}[fragile]{Linear Regression}
  Primitive LispStat-style interface, now a wrapper around LAPACK's
  \texttt{dgelsy}:
\begin{verbatim}
(defparameter *result1*
  (regression-model (list->vector-like iron)
                    (list->vector-like absorbtion)))
*result1* =>
\end{verbatim}
\end{frame}
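
%% Added illustration (not the CLS API): a plain Common Lisp
%% least-squares fit for one covariate, sketching what the
%% regression-model wrapper asks LAPACK to compute.  Assumes IRON and
%% ABSORBTION are equal-length lists of numbers, as used above.
\begin{frame}[fragile]{Aside: what the wrapper computes (sketch)}
  For intuition only -- a plain Common Lisp least-squares fit with a
  single covariate; the real work above is done by LAPACK:
\begin{verbatim}
(defun ols-slope-intercept (xs ys)
  "Least-squares fit of y = a + b*x for lists XS, YS."
  (let* ((n (length xs))
         (mx (/ (reduce #'+ xs) n))
         (my (/ (reduce #'+ ys) n))
         (sxy (reduce #'+
               (mapcar (lambda (x y)
                         (* (- x mx) (- y my)))
                       xs ys)))
         (sxx (reduce #'+
               (mapcar (lambda (x) (expt (- x mx) 2))
                       xs)))
         (b (/ sxy sxx)))
    (values (- my (* b mx)) b)))  ; intercept, slope

;; e.g. (ols-slope-intercept iron absorbtion)
\end{verbatim}
\end{frame}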
\subsection{Numerical Descriptions}
\label{sec:work:numdesc}

\begin{frame}[fragile]{Descriptives}
\begin{verbatim}
(mean iron)
\end{verbatim}
\end{frame}
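
%% Added sketch using only functions shown in this talk (MEAN here,
%% VARIANCE defined on a later slide); whether CLS currently ships a
%% separate standard-deviation function is an assumption, so it is
%% computed from VARIANCE instead.
\begin{frame}[fragile]{Descriptives, continued (sketch)}
  A few more calls, using only functions that appear in this talk:
\begin{verbatim}
(mean iron)             ; central tendency
(variance iron)         ; spread (defined later in this talk)
(sqrt (variance iron))  ; standard deviation, by hand
\end{verbatim}
\end{frame}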
\subsection{Data Manip/Mgmt}
\label{sec:work:data}

\begin{frame}[fragile]{DataFrames}
\end{frame}

\begin{frame}[fragile]{Numerical Matrices}
\end{frame}

\begin{frame}{Managing / Manipulating Data}
\end{frame}
\section{Graphics}
\label{sec:practice}

\begin{frame}[fragile]{Copying existing graphics}
  And we generated the figure shown on the earlier slide by:
\begin{verbatim}
(xlib-image-context-to-png
 (context *f1*)
 "/home/tony/test1.png")
\end{verbatim}
\end{frame}
\section{Common Lisp Statistics}

\begin{frame}{Why CLS?}
  \begin{itemize}
  \item a component-based structure for statistical computing
  \item ability to leverage non-statisticians interested in computing
    technologies (compilers, protocols, interfaces)
  \end{itemize}
\end{frame}
\begin{frame}{Current Functionality}
  \begin{itemize}
  \item basic dataframes (similar to R); indexing/slicing API under
    development
  \item basic regression (similar to XLispStat)
  \item matrix storage in both foreign and Lisp-centric memory
  \item LAPACK bindings (a small but growing subset), working with both
    matrix storage types
  \item static graphics (X11), including preliminary grid functionality
    based on Cairo; generation of PNG files from graphics windows
  \item CSV file support
  \item Common Lisp!
  \end{itemize}
\end{frame}
\begin{frame}[fragile]{Computational Environments Supported}
  \begin{itemize}
  \item Should work on Linux, with recent SBCL versions.
  \item Definitely works on bleeding-edge Debian (unstable).
  \item Has worked in 4 different people's environments (not quite,
    but it sort of requires a \verb+/home/tony/+!)
  \end{itemize}
\end{frame}
\begin{frame}{Common Lisp}
  advanced iteration (sketched on the next slide)
\end{frame}
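
%% Added illustration: standard Common Lisp LOOP, nothing CLS-specific,
%% to make "advanced iteration" concrete.
\begin{frame}[fragile]{Advanced iteration: \texttt{loop} (sketch)}
  The standard \texttt{loop} macro covers most iteration patterns used
  in data analysis:
\begin{verbatim}
;; running sum and count in one pass
(loop for x in '(1 2 3 4 5)
      sum x into total
      count x into n
      finally (return (/ total n)))   ; => 3

;; collect transformed values
(loop for i from 0 below 5
      collect (* i i))                ; => (0 1 4 9 16)
\end{verbatim}
\end{frame}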
\begin{frame}[fragile]{Finding out things}
  \begin{itemize}
  \item CL-NUMLIB: \\
    \verb+num-sequence :from LOW :to HIGH :length SEQ-LENGTH+ \\
    (the analogue of R's \verb+seq(from, to, by/length)+)
  \item
\begin{verbatim}
(documentation
 'cl-numlib:num-sequence
 'function)
\end{verbatim}
  \item This
  \end{itemize}
\end{frame}
\section{Computable Statistics}

\begin{frame}{What does NOT work?}
  Primarily, the reason that we are doing this:

  \textbf{Computable and Executable Statistics}
\end{frame}

\begin{frame}{Can we compute with them?}
  3 examples:
  \begin{itemize}
  \item Research
  \item Consulting
  \item Reimplementation
  \end{itemize}
  Consider whether one can ``compute'' with the information given.
\end{frame}
\begin{frame}[fragile]{Example 1: Theory\ldots}
  \label{example1}
  Let $f(x;\theta)$ describe the likelihood of XX under the following
  assumptions.
  \begin{enumerate}
  \item assumption-1
  \item assumption-2
  \end{enumerate}
  Then if we use the following algorithm:
  \begin{enumerate}
  \item step-1
  \item step-2
  \end{enumerate}
  then $\hat{\theta}$ should be $N(0,\hat\sigma^2)$ with the following
  characteristics\ldots
\end{frame}
\begin{frame}
  \frametitle{Can we compute, using this description?}
  Given the information at hand:
  \begin{itemize}
  \item we ought to have a framework for initial coding of the actual
    simulations (test-first!)
  \item the implementation is somewhat clear
  \item we should ask: what theorems have similar assumptions?
  \item we should ask: what theorems have similar conclusions but
    different assumptions?
  \end{itemize}
\end{frame}
\begin{frame}[fragile]{Realizing Theory}
{\small
\begin{verbatim}
(define-theorem my-proposed-theorem
  (:theorem-type '(distribution-properties
                   frequentist
                   likelihood))
  (:assumes '(assumption-1 assumption-2))
  (:likelihood-form
   (defun likelihood (data theta gamma)
     (exponential-family theta gamma)))
  (:compute-by
   '(progn
      (compute-starting-values thetahat gammahat)
      (until (convergence)
        (setf convergence
              (or (step-1 thetahat)
                  (step-2 gammahat))))))
  (:claim (assert
           (and (equal-distribution thetahat 'normal)
                (equal-distribution gammahat 'normal)))))
\end{verbatim}
}
\end{frame}
\begin{frame}[fragile]{It would be nice to have}
\begin{verbatim}
(theorem-veracity 'my-proposed-theorem)
\end{verbatim}
\end{frame}

\begin{frame}[fragile]{and why not...?}
\begin{verbatim}
(when (theorem-veracity
       'my-proposed-theorem)
  (write-paper 'my-proposed-theorem
               :style :JASA
               :output-format
               '(LaTeX MSWord)))
\end{verbatim}
\end{frame}
\begin{frame}{Comments}
  \begin{itemize}
  \item The general problem is very difficult.
  \item Some progress has been made in small areas of basic
    statistics: currently working on linear regression (least-squares
    and Normal-Bayesian) and the $t$-test.
  \item Areas targeted for the medium-term future: resampling methods
    and similar algorithms.
  \end{itemize}
\end{frame}
\begin{frame}
  \frametitle{Example 2: Practice\ldots}
  \label{example2}
  The dataset comes from a series of clinical trials. We model the
  primary endpoint, ``relief'', as a binary random variable. There is
  a random trial effect on relief, as well as on severity, due to
  differences in recruitment and inclusion/exclusion criteria.
\end{frame}

\begin{frame}
  \frametitle{Can we compute, using this description?}
  \begin{itemize}
  \item With a real description of this kind, it is clear what some of
    the potential models for this dataset might be.
  \item It should also be clear how to start thinking of a data
    dictionary for this problem.
  \end{itemize}
\end{frame}
\begin{frame}[fragile]{Can we compute?}
\begin{verbatim}
(dataset-metadata paper-1
  :context 'clinical-trials
  :variables '((relief :model-type dependent
                       :distribution binary)
               (trial :model-type independent
                      :distribution categorical)
               (disease-severity))
  :metadata '(inclusion-criteria
              exclusion-criteria
              recruitment-rate))

(propose-analysis paper-1)
; => '(tables
;      (logistic regression))
\end{verbatim}
\end{frame}
\begin{frame}{Example 3: The Round-trip\ldots}
  \label{example3}
  The first two examples describe ``ideas $\rightarrow$ code''.

  Consider the last time you read someone else's implementation of a
  statistical procedure (e.g., R package code). When you read the
  code, could you see:
  \begin{itemize}
  \item the assumptions used?
  \item the algorithm implemented?
  \item practical guidance for when you might select the algorithm
    over others?
  \item practical guidance for when you might select the
    implementation over others?
  \end{itemize}
  These are usually components of any reasonable journal article.
  \textit{(Q: have you actually read an R package that wasn't yours?)}
\end{frame}
\begin{frame}{Exercise left to the reader!}
  (aside: I have been looking at the \textbf{stats} and \textbf{lme4}
  packages recently -- \textit{for me}, very clear numerically, much
  less so statistically)
\end{frame}
\section{Discussion}

\begin{frame}{Conclusion}
  \begin{itemize}
  \item Numerics: linear algebra basics done -- full development
  \item Static graphics: progress being made; we have a partial grid
    solution
  \end{itemize}
\end{frame}

\end{document}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Preliminaries}

\subsection{Context}

\begin{frame}{Goals for this Talk}{(define, strategic approach, justify)}
  \begin{itemize}
  \item To describe the concept of \alert{computable and executable
      statistics}, placing it in a historical context.

  \item To demonstrate that \alert{a research program}
    implemented through simple steps can increase the efficiency of
    statistical computing approaches by clearly describing both:
    \begin{itemize}
    \item numerical characteristics of procedures,
    \item statistical concepts driving them.
    \end{itemize}

  \item To justify that the \alert{approach is worthwhile} and
    represents a staged effort towards \alert{increased use of best
      practices}.
  \end{itemize}
  (unfortunately, the last is still incomplete)
\end{frame}
\begin{frame}{Historical Computing Languages}
  \begin{itemize}
  \item FORTRAN: FORmula TRANslator. The original numerical computing
    language, designed for clean implementation of numerical
    algorithms.
  \item LISP: LISt Processor. Associated with symbolic manipulation,
    AI, and knowledge-based approaches.
  \end{itemize}

  They represent the two general needs of statistical computing,
  which could be summarized as
  \begin{itemize}
  \item algorithms/numerics,
  \item elicitation, communication, and generation of knowledge (``data
    analysis'').
  \end{itemize}
\end{frame}
\begin{frame}{Statistical Computing Environments}
  Past:
  \begin{itemize}
  \item SPSS / BMDP / SAS
  \item S (S, S-PLUS, R)
  \item LispStat (XLispStat, ViSta, ARC, CommonLispStat); QUAIL
  \item XGobi (Orca / GGobi / Statistical Reality Engine)
  \item Minitab
  \item Stata
  \item DataDesk
  \item Augsburg Impressionist series (MANET, \ldots)
  \item Excel
  \end{itemize}
  many others...
\end{frame}
\begin{frame}{How many are left?}
  \begin{itemize}
  \item R
  \item SAS
  \item SPSS
  \item Stata
  \item Minitab
  \item very few others...
  \end{itemize}
  ``R is the Microsoft of the statistical computing world'' -- anonymous.
\end{frame}
\begin{frame}{Selection Pressure}
  \begin{itemize}
  \item the R user population is growing rapidly, fueled by critical
    mass, quality, and value
  \item R is a great system for applied data analysis
  \item R is not such a great system for research into statistical
    computing (backwards compatibility, inertia due to the user
    population)
  \end{itemize}
  There is a need for alternative experiments in which to develop new
  approaches/ideas/concepts.
\end{frame}
\begin{frame}{Philosophically, why Common Lisp?}
  \begin{itemize}
  \item Lisp can cleanly present computational intentions, both
    symbolically and numerically.
  \item Semantics and context are important: well supported by Lisp
    paradigms.
  \item Lisp's parentheses describe singular, multi-scale,
    \alert{complete thoughts}.
  \end{itemize}
\end{frame}
\begin{frame}{Technically, why Common Lisp?}
  \begin{itemize}
  \item an interactive COMPILED language (``R with a compiler'')
  \item CLOS is R's S4 object system ``done right'' (sketched on the
    next slide)
  \item clean semantics: modality and typing can be expressed the way
    one wants
  \item programs are data, data are programs -- leading to, for
    example, ``executable XML''
  \item most modern computing tools available (XML, WWW technologies)
  \end{itemize}
  Common Lisp is very close in usage to how people currently use R
  (mostly interactive, some batch, and a wish for compilation
  efficiency).
\end{frame}
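
%% Added illustration: generic Common Lisp (CLOS) only, to make the
%% "S4 done right" comparison concrete.  The class and method names
%% are invented for this sketch and are not part of CLS.
\begin{frame}[fragile]{CLOS in one slide (sketch)}
  Generic functions dispatch on argument classes, much as S4 methods do:
\begin{verbatim}
(defclass linear-model ()
  ((coefficients :initarg :coefficients
                 :reader coefficients)))

(defgeneric summarize (model)
  (:documentation "Print a short summary of MODEL."))

(defmethod summarize ((model linear-model))
  (format t "coefficients: ~a~%"
          (coefficients model)))

(summarize (make-instance 'linear-model
                          :coefficients '(1.2 -0.4)))
;; prints: coefficients: (1.2 -0.4)
\end{verbatim}
\end{frame}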
\subsection{Background}

\begin{frame}
  \frametitle{Desire: Semantics and Statistics}
  \begin{itemize}
  \item The semantic web (content which is self-descriptive) is an
    interesting and potentially useful idea.

  \item Biological informatics support (GO, Entrez) has allowed for
    precise definitions of concepts in biology.

  \item It is a shame that a field like statistics, requiring such
    precision, has less such support than an imprecise and temporally
    unstable field such as biology\ldots
  \end{itemize}

  How can we express statistical work (research, applied work) so that
  it is both human- and computer-readable (perhaps subject to
  transformations first)?
\end{frame}
% \subsection{Context}

% \begin{frame}{Context}{(where I'm coming from, my ``priors'')}
%   \begin{itemize}
%   \item Pharmaceutical Industry
%   \item Modeling and Simulation uses mathematical models/constructs to
%     record beliefs (biology, pharmacology, clinical science) for
%     explication, clinical team alignment, decision support, and
%     quality.
%   \item My work at Novartis is at the intersection of biomedical
%     informatics, statistics, and mathematical modeling.
%   \item As manager: I need a mix of applications and novel research
%     development to solve our challenges better, faster, more efficiently.
%   \item Data analysis is a specialized approach to computer
%     programming, \alert{different} than applications programming or
%     systems programming.
%   \end{itemize}
% \end{frame}
\subsection{Literate Programming is insufficient}

\begin{frame}{Literate Statistical Practice}
  \begin{enumerate}
  \item Literate Programming applied to data analysis (Rossini, 1997/2001).
  \item Among the \alert{most annoying} techniques to integrate into a
    work-flow if one is not perfectly methodical.
  \item Some tools:
    \begin{itemize}
    \item ESS: supports interactive creation of literate programs.
    \item Sweave: a tool which exemplifies the reporting context;
      odfWeave primarily simplifies reporting.
    \item Roxygen: primarily supports a literate-programming
      documentation style, not a literate data analysis programming
      style.
    \end{itemize}
  \item ROI demonstrated in specialized cases: Bioconductor.
  \item \alert{Usually done after the fact} (final step of the
    work-flow) as a documentation/computational-reproducibility
    technique, rarely integrated into the work-flow itself.
  \end{enumerate}
  Many contributors:
  Knuth, Claerbout, Carey, de Leeuw, Leisch, Gentleman, Temple-Lang,
  \ldots{}
\end{frame}
\begin{frame}
  \frametitle{Literate Programming}
  \framesubtitle{Why isn't it enough for Data Analysis?}
  Only two contexts: (executable) code and documentation. Fine for
  application programming, but for data analysis we could benefit
  from:
  \begin{itemize}
  \item classification of statistical procedures
  \item descriptions of assumptions
  \item pragmatic recommendations
  \item inheritance of structure through the work-flow of a
    statistical methodology or data analysis project
  \item datasets and metadata
  \end{itemize}
  Concept: ontologies describing mathematical assumptions, applications
  of methods, work-flow, and statistical data structures can enable
  machine communication (i.e.\ an informatics framework \`a la
  biology). A sketch of what this could look like follows on the next
  slide.
\end{frame}
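
%% Added hypothetical sketch: there is no ANALYSIS-STEP form in CLS
%% today.  It only illustrates, as a plain s-expression, the kind of
%% structure listed on the previous slide (in the spirit of the
%% dataset-metadata and define-theorem forms shown earlier).
\begin{frame}[fragile]{What extra structure might look like (sketch)}
  A data-analysis step annotated with the pieces listed above,
  written as an ordinary s-expression (hypothetical notation):
\begin{verbatim}
;; hypothetical form -- not implemented in CLS
(analysis-step two-group-comparison
  :procedure-class '(inference frequentist t-test)
  :assumptions     '(independent-observations
                     approximate-normality)
  :recommendation  "check variance homogeneity first"
  :data            'trial-dataset
  :metadata        '(units inclusion-criteria))
\end{verbatim}
\end{frame}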
\begin{frame}{Communication in Statistical Practice}{\ldots is essential for \ldots}
  \begin{itemize}
  \item finding
  \item explanations
  \item agreement
  \item receiving information
  \end{itemize}
  \alert{``machine-readable'' communication/computation lets the
    computer help} \\
  Semantic Web is about ``machine-enabled computability''.
\end{frame}
\begin{frame}
  \frametitle{Semantics}
  \framesubtitle{One definition: description and context}
  Interoperability is the key, with respect to
  \begin{itemize}
  \item ``Finding things''
  \item Applications and activities with related functionality
    \begin{itemize}
    \item moving information from one state to another (paper, journal
      article, computer program)
    \item computer programs which implement solutions to similar tasks
    \end{itemize}
  \end{itemize}
\end{frame}
\begin{frame}{Statistical Practice is somewhat restricted}
  {...but in a good sense, enabling potential for semantics...}
  There is a restricted set of intended actions for what can be done
  -- the critical goal is to make a difference by accelerating
  activities that should be ``computable'':
  \begin{itemize}
  \item restricted natural language processing
  \item mathematical translation
  \item common description of activities for simpler programming/data
    analysis (the S approach to objects and methods)
  \end{itemize}
  R is a good basic start (model formulation approach, simple
  ``programming with data'' paradigm); we should see if we can do
  better!
\end{frame}
\begin{frame}{Computable and Executable Statistics requires}
  \begin{itemize}
  \item approaches to describe data and metadata (``data'')
    \begin{itemize}
    \item semantic WWW
    \item metadata management and integration, driving data integration
    \end{itemize}
  \item approaches to describe data analysis methods (``models'')
    \begin{itemize}
    \item quantitatively: many ontologies (AMS, etc.), few meeting
      statistical needs
    \item many substantive fields have implementations
      (bioinformatics, etc.), but these are not well focused on
      statistical needs
    \end{itemize}
  \item approaches to describe the specific form of interaction
    (``instances of models'')
    \begin{itemize}
    \item the original idea behind ``Literate Statistical Analysis''
    \item that idea is suboptimal; more structure is needed (not
      necessarily built upon existing tools...)
    \end{itemize}
  \end{itemize}
\end{frame}
\subsection{Common Lisp Statistics}

\begin{frame}
  \frametitle{Interactive Programming}
  \framesubtitle{Everything goes back to being Lisp-like}
  \begin{itemize}
  \item Interactive programming (as originating with Lisp) works
    extremely well for data analysis (Lisp being the original
    ``programming with data'' language).
  \item Theories/methods for how to do this are reflected in styles
    for using R.
  \end{itemize}
\end{frame}
\begin{frame}[fragile]
  \frametitle{Lisp}
  Lisp (LISt Processor) is different from most high-level computing
  languages, and is very old (1958). Lisp is built on lists of things
  which are evaluatable.
\begin{verbatim}
(functionName data1 data2 data3)
\end{verbatim}
  or ``quoted'':
\begin{verbatim}
'(functionName data1 data2 data3)
\end{verbatim}
  which is shorthand for
\begin{verbatim}
(quote (functionName data1 data2 data3))
\end{verbatim}
  The difference is important -- the second and third are lists of
  data, not (yet?!) a function applied to (an unencapsulated list of)
  data, as the first is.
\end{frame}
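
%% Added illustration: plain Common Lisp, showing the
%% evaluation/quotation distinction from the previous slide.
\begin{frame}[fragile]{Evaluation vs.\ quotation (sketch)}
\begin{verbatim}
(+ 1 2 3)          ; => 6          a function applied to data
'(+ 1 2 3)         ; => (+ 1 2 3)  just a list, not applied
(eval '(+ 1 2 3))  ; => 6          data turned back into a program
\end{verbatim}
\end{frame}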
\begin{frame}
  \frametitle{Features}
  \begin{itemize}
  \item Data and Functions semantically the same
  \item Natural interactive use through functional programming with
    side effects
  \item Batch is a simplification of interactive -- not a special mode!
  \end{itemize}
\end{frame}
\begin{frame}[fragile]{Representation: XML and Lisp}{executing your data}
  Many people are familiar with XML:
\begin{verbatim}
<name phone="+41793674557">Tony Rossini</name>
\end{verbatim}
  which is shorter in Lisp:
\begin{verbatim}
(name "Tony Rossini" :phone "+41613674557")
\end{verbatim}
  \begin{itemize}
  \item Lisp ``parens'', universally hated by unbelievers, are
    wonderful for denoting when a ``concept is complete''.
  \item Why can't your data self-execute?
  \end{itemize}
\end{frame}
\begin{frame}[fragile]{Numerics with Lisp}
  \begin{itemize}
  \item rational numbers and exact arithmetic
  \item example for the mean:
\begin{verbatim}
(defun mean (x)
  (check-type x vector-like)
  (/ (loop for i from 0 to (- (nelts x) 1)
           summing (vref x i))
     (nelts x)))
\end{verbatim}
  \item example for the variance:
\begin{verbatim}
(defun variance (x)
  (let ((meanx (mean x))
        (nm1 (1- (nelts x))))
    (/ (loop for i from 0 to nm1
             summing (expt (- (vref x i) meanx) 2))
       nm1)))
\end{verbatim}
  \item But through macros, \verb+(vref x i)+ could be
    \verb+#V(X[i])+ or your favorite syntax.
  \end{itemize}
\end{frame}
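
%% Added usage sketch: LIST->VECTOR-LIKE appears earlier in this talk;
%% the exact constructor behavior assumed here is an assumption.  The
%% expected values follow from the MEAN/VARIANCE definitions above.
\begin{frame}[fragile]{Using \texttt{mean} and \texttt{variance} (sketch)}
\begin{verbatim}
;; assumes list->vector-like accepts a plain list, as used
;; in the regression example earlier
(defparameter *x*
  (list->vector-like '(2 4 4 4 5 5 7 9)))

(mean *x*)      ; => 5
(variance *x*)  ; => 32/7   (exact rational, sample variance)
\end{verbatim}
\end{frame}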
\begin{frame}{Common Lisp Statistics 1}
  \begin{itemize}
  \item Originally based on LispStat (reusability)
  \item Re-factored structure (some numerics worked with a 1990-era
    code base)
  \item Current activities:
    \begin{enumerate}
    \item numerics redone using CFFI-based BLAS/LAPACK (cl-blapack)
    \item matrix interface based on MatLisp
    \item starting design of a user interface system (interfaces,
      visuals)
    \item general framework for model specification (regression,
      likelihood, ODEs)
    \item general framework for algorithm specification (bootstrap,
      MLE, algorithmic data analysis methods)
    \end{enumerate}
  \end{itemize}
\end{frame}
\begin{frame}{Common Lisp Statistics 2}
  \begin{itemize}
  \item Implemented using SBCL; contributed fixes for
    Clozure/OpenMCL; goal to also target CLISP.
  \item Supports the LispStat prototype object system.
  \item Package-based design -- only use the components you need, or
    the components whose API you like.
  \end{itemize}
\end{frame}
\section{Discussion}

\begin{frame}
  \frametitle{Outlook}
  \begin{itemize}
  \item Semantics and computability have captured a great deal of
    attention in the informatics and business computing R\&D worlds.
  \item Statistically-driven decision making and knowledge discovery
    is, with high likelihood, the next challenging stage after data
    integration.
  \item Statistical practice (theory and application) can be enhanced
    and made more efficient, providing increased benefit to
    organizations and groups using appropriate methods.
  \item Lisp, as a language, shares characteristics of both Latin
    (a difficult dead language useful for classical training) and
    German (a difficult living language useful for general life). Of
    course, for some people, they are not difficult.
  \end{itemize}
\end{frame}
\begin{frame}
  The research program described in this talk is currently driving the
  design of Common Lisp Statistics, which leverages concepts and
  approaches from the dead and moribund LispStat project.
  \begin{itemize}
  \item \url{http://repo.or.cz/w/CommonLispStat.git/}
  \item \url{http://www.github.com/blindglobe/}
  \end{itemize}
\end{frame}
\begin{frame}{Final Comment}
  \begin{itemize}
  \item In the pharma industry, it is all about getting the right
    drugs to the patient faster. Data analysis systems seriously
    impact this process, being potentially an impediment or an
    accelerator.
    \begin{itemize}
    \item \alert{Information technologies can increase the efficiency
        of statistical practice}, though innovation and change
      management must be taken into account (i.e.\ statistical
      practice, while considered by some an ``art form'', can benefit
      from industrialization).
    \item \alert{Lisp's features match the basic requirements we need}
      (dichotomy: programs as data, data as programs). A sales pitch,
      though...
    \item Outlook: lots of work and experimentation to do!
    \end{itemize}
  \end{itemize}
\end{frame}
% % All of the following is optional and typically not needed.
% \appendix

% \section<presentation>*{\appendixname}

% \begin{frame} \frametitle{Complements and Backup}
%   No more, stop here. Questions? (now or later).
% \end{frame}
% \begin{frame}{The Industrial Challenge.}{Getting the Consulting Right.}
%   % - A title should summarize the slide in an understandable fashion
%   %   for anyone how does not follow everything on the slide itself.
%   \begin{itemize}
%   \item Recording assumptions for the next data analyst, reviewer.
%     Use \texttt{itemize} a lot.
%   \item Use very short sentences or short phrases.
%   \end{itemize}
% \end{frame}

% \begin{frame}{The Industrial Challenge.}{Getting the Right Research Fast.}
%   % - A title should summarize the slide in an understandable fashion
%   %   for anyone how does not follow everything on the slide itself.
%   \begin{itemize}
%   \item Use \texttt{itemize} a lot.
%   \item Use very short sentences or short phrases.
%   \end{itemize}
% \end{frame}

% \begin{frame}{Explicating the Work-flow}{QA/QC-based improvements.}
% \end{frame}
% \section{Motivation}

% \subsection{IT Can Speed up Deliverables in Statistical Practice}

% \begin{frame}{Our Generic Work-flow and Life-cycle}
%   {describing most data analytic activities}
%   Workflow:
%   \begin{enumerate}
%   \item Scope out the problem
%   \item Sketch out a potential solution
%   \item Implement until road-blocks appear
%   \item Deliver results
%   \end{enumerate}

%   Lifecycle:
%   \begin{enumerate}
%   \item paper sketch
%   \item 1st e-draft of text/code/data (iterate to \#1, discarding)
%   \item cycle through work
%   \item publish
%   \item ``throw-away''
%   \end{enumerate}
%   but there is valuable information that could enable the next
%   generation!
% \end{frame}

% \begin{frame}[fragile]{Paper $\rightarrow$ Computer $\rightarrow$ Article $\rightarrow$ Computer}{Cut and Paste makes for large errors.}
%   \begin{itemize}
%   \item Problems in a regulatory setting
%   \item Regulatory issues are just ``best practices''
%   \end{itemize}

%   Why do we ``copy/paste'', or analogously, restart our work?

%   pro:
%   \begin{itemize}
%   \item every time we repeat, we reinforce the idea in our brain
%   \item review of ideas can help improve them
%   \end{itemize}
%   con:
%   \begin{itemize}
%   \item inefficiency
%   \item introduction of mistakes
%   \item loss of historical context
%   \item changes to earlier work (on a different development branch)
%     cannot propagate.
%   \end{itemize}
% \end{frame}
% \section{Semantics and Statistical Practice}

% \begin{frame}
%   \frametitle{Statistical Activity Leads to Reports}
%   \framesubtitle{You read what you know, do you understand it?}
%   How can we improve the communication of the ideas we have?
%   Precision of communication?
% \end{frame}

% \begin{frame} \frametitle{Communication Requires Context}
%   \framesubtitle{Intentions imply more than one might like...}
%   \begin{itemize}
%   \item Consideration of what we might do
%   \item Applications with related functionality
%   \end{itemize}
% \end{frame}

% \begin{frame}
%   \frametitle{Design Patterns}
%   \framesubtitle{Supporting Work-flow Transitions}
%   (joint work with H Wickham): The point of this research program is
%   not to describe what to do at any particular stage of work, but to
%   encourage researchers and practitioners to consider the translation
%   and transfer of information between stages so that work is not lost.

%   Examples of stages in a work-flow:
%   \begin{itemize}
%   \item planning, execution, reporting;
%   \item scoping, illustrative examples or counter examples,
%     algorithmic construction, article writing.
%   \item descriptive statistics, preliminary inferential analysis,
%     model/assumption checking, final inferential analysis,
%     communication of scientific results
%   \end{itemize}
%   Description of work-flows is essential to initiating discussions on
%   quality/efficiency of approaches to work.
% \end{frame}

% \section{Design Challenges}

% \begin{frame}
%   \frametitle{Activities are enhanced by support}
%   \begin{itemize}
%   \item Mathematical manipulation can be enhanced by symbolic
%     computation
%   \item Statistical programming can be enabled by examples and related
%     algorithm implementation
%   \item Datasets, to a limited extent, can self-describe.
%   \end{itemize}
% \end{frame}

% \begin{frame}
%   \frametitle{Executable and Computable Science}
%   Use of algorithms and construction to describe how things work.
%   Support for agent-based approaches
% \end{frame}

% \begin{frame}
%   \frametitle{What is Data? Metadata?}
%   Data: what we've observed
%   MetaData: context for observations, enables semantics.
% \end{frame}
% % \begin{frame}[fragile]
% %   \frametitle{Defining Variables}
% %   \framesubtitle{Setting variables}
% % \begin{verbatim}
% % (setq <variable> <value>)
% % \end{verbatim}
% %   Example:
% % \begin{verbatim}
% % (setq ess-source-directory
% %       "/home/rossini/R-src")
% % \end{verbatim}
% % \end{frame}

% % \begin{frame}[fragile]
% %   \frametitle{Defining on the fly}
% % \begin{verbatim}
% % (setq ess-source-directory
% %       (lambda () (file-name-as-directory
% %                   (expand-file-name
% %                    (concat (default-directory)
% %                            ess-suffix "-src")))))
% % \end{verbatim}
% %   (Lambda-expressions are anonymous functions, i.e. ``instant-functions'')
% % \end{frame}

% % \begin{frame}[fragile]
% %   \frametitle{Function Reuse}
% %   By naming the function, we could make the previous example reusable
% %   (if possible):
% % \begin{verbatim}
% % (defun my-src-directory ()
% %   (file-name-as-directory
% %    (expand-file-name
% %     (concat (default-directory)
% %             ess-suffix "-src"))))
% % \end{verbatim}
% %   Example:
% % \begin{verbatim}
% % (setq ess-source-directory (my-src-directory))
% % \end{verbatim}
% % \end{frame}

% % \begin{frame}
% %   \frametitle{Equality Among Packages}
% %   \begin{itemize}
% %   \item more/less equal can be described specifically through
% %     overriding imports.
% %   \end{itemize}
% % \end{frame}
% \subsection<presentation>*{For Further Reading}

% \begin{frame}[allowframebreaks]
%   \frametitle<presentation>{Related Material}
%   \begin{thebibliography}{10}

%   \beamertemplatebookbibitems
%   % Start with overview books.
%   \bibitem{LispStat1990}
%     L.~Tierney.
%     \newblock {\em LispStat}.

%   \beamertemplatearticlebibitems
%   % Followed by interesting articles. Keep the list short.
%   \bibitem{Rossini2001}
%     A.J.~Rossini.
%     \newblock Literate Statistical Practice.
%     \newblock {\em Proceedings of the Conference on Distributed
%       Statistical Computing}, 2001.

%   \bibitem{RossiniLeisch2003}
%     A.J.~Rossini and F.~Leisch.
%     \newblock Literate Statistical Practice.
%     \newblock {\em Technical Report Series, University of Washington
%       Department of Biostatistics}, 2003.

%   \beamertemplatearrowbibitems
%   \bibitem{CLS}
%     Common Lisp Stat, 2008.
%     \newblock \url{http://repo.or.cz/CommonLispStat.git/}

%   \end{thebibliography}
% \end{frame}