Doc/talks/Rossini-DSC-July2009.tex

   1 \documentclass{beamer}
   2
   3 \mode<presentation>
   4 {
   5   \usetheme{classic}
   6   \setbeamercovered{transparent}
   7 }
   8
   9 \usepackage[english]{babel}
  10 \usepackage[latin1]{inputenc}
  11 \usepackage{times}
  12 \usepackage[T1]{fontenc}
  13
  14
  15 \title[CLS]{Common Lisp Statistics}
  16 \subtitle{Using History to design better data analysis environments}
  17 \author[Rossini]{Anthony~(Tony)~Rossini}
  18
  19 \institute[Novartis and University of Washington] % (optional, but mostly needed)
  20 {
  21   Group Head, Modeling and Simulation\\
  22   Novartis Pharma AG, Switzerland
  23   \and
  24   Affiliate Assoc Prof, Biomedical and Health Informatics\\
  25   University of Washington, USA}
  26
  27 \date[DSC2009]{DSC 2009, Copenhagen}
  28 \subject{Statistical Computing Environments}
  29
  30 \begin{document}
  31
  32 \begin{frame}
  33   \titlepage
  34 \end{frame}
  35
  36 \begin{frame}{Outline}
  37   \tableofcontents
  38 \end{frame}
  39
  40 % Structuring a talk is a difficult task and the following structure
  41 % may not be suitable. Here are some rules that apply for this
  42 % solution:
  43
  44 % - Exactly two or three sections (other than the summary).
  45 % - At *most* three subsections per section.
  46 % - Talk about 30s to 2min per frame. So there should be between about
  47 %   15 and 30 frames, all told.
  48
  49 % - A conference audience is likely to know very little of what you
  50 %   are going to talk about. So *simplify*!
  51 % - In a 20min talk, getting the main ideas across is hard
  52 %   enough. Leave out details, even if it means being less precise than
  53 %   you think necessary.
  54 % - If you omit details that are vital to the proof/implementation,
  55 %   just say so once. Everybody will be happy with that.
  56
  57 \section{What Works?}
  58
  59 \begin{frame}{Silly Visualization Example}
  60 \includegraphics[width=3in,height=3in]{/home/tony/test1.png}
  61 \end{frame}
  62
  63
  64 \begin{frame}{Linear Regression}
  65
  66 \end{frame}
  67
  68
  69 \begin{frame}{Descriptives}
  70
  71 \end{frame}
  72
  73 \begin{frame}{Data Management}
  74
  75 \end{frame}
  76
  77
  78 \begin{frame}{Workflow Management}
  79
  80 \end{frame}
  81
  82
  83 \section{The Practical}
  84 \label{sec:practice}
  85
  86
  87 \begin{frame}[fragile]{Graphics Device}
  88 \begin{verbatim}
  89 (defparameter *frame2*
  90    (as-frame
  91       (create-xlib-image-context 200 200)
  92         :background-color +white+))
  93 (bind ((#2A((f1 f2) (f3 f4))
  94          (split-frame *frame2*
  95                       (percent 50) (percent 50))))
  96   (defparameter *f1* f1)  ; bottom left
  97   (defparameter *f2* f2)  ; bottom right  3  4
  98   (defparameter *f3* f3)  ; top left      1  2
  99   (defparameter *f4* f4)) ; top right
 100 \end{verbatim}
 101 \end{frame}
 102
 103 \begin{frame}[fragile]{Functions to Plot}
 104 \begin{verbatim}
 105 (plot-function *f1* #'sin
 106   (interval-of 0 2)
 107   :x-title "x" :y-title "sin(x)")
 108 (plot-function *f2* #'cos (interval-of 0 2)
 109   :x-title "x" :y-title "cos(x)")
 110 (plot-function *f3* #'tan (interval-of 0 2)
 111   :x-title "x" :y-title "tan(x)")
 112 \end{verbatim}
 113 \end{frame}
 114
 115 \begin{frame}[fragile]{Things to Plot}
 116 \begin{verbatim}
 117 (let* ((n 500)
 118        (xs (num-sequence
 119              :from 0 :to 10 :length n))
 120        (ys (map 'vector
 121               #'(lambda (x) (+ x 8 (random 4.0)))
 122               xs))
 123        (weights
 124           (replicate #'(lambda () (1+ (random 10)))
 125                      n 'fixnum))
 126        (da (plot-simple *f4*
 127              (interval-of 0 10) (interval-of 10 20)
 128              :x-title "x" :y-title "y")))
 129   (draw-symbols da xs ys :weights weights))
 130 \end{verbatim}
 131 \end{frame}
 132
 133 \begin{frame}[fragile]{Copying existing graphics}
 134   And we generated the figure on the first page by:
 135 \begin{verbatim}
 136 (xlib-image-context-to-png
 137    (context *f1*)
 138    "/home/tony/test1.png")
 139 \end{verbatim}
 140 \end{frame}
 141
 142
 143 \section{Common Lisp Statistics}
 144
 145 \begin{frame}{Why CLS?}
 146   \begin{itemize}
 147   \item a component-based structure for statistical computing
 148   \item ability to leverage non-statisticians interested in computing
 149     technologies (compilers, protocols, interfaces)
 150   \end{itemize}
 151 \end{frame}
 152
 153 \begin{frame}{Current Functionality}
 154   \begin{itemize}
 155   \item dataframes (similar to R)
 156   \item Basic regression (similar to XLispStat)
 157   \item matrix storage both in foreign and lisp-centric areas.
 158   \item LAPACK (small percentage, increasing), working with both
 159     matrix storage types
 160   \item static graphics (X11) including preliminary grid functionality based
 161     on CAIRO.  Generation of PNG files from graphics windows.
 162   \item CSV file support
 163   \item Common Lisp!
 164   \end{itemize}
 165 \end{frame}
 166
 167 \begin{frame}{Computational Environment Supported}
 168   \begin{itemize}
 169   \item Should  work on Linux, with recent SBCL versions
 170   \item Definitely works on bleeding edge Debian (unstable).
 171   \item Has worked on 4 different people's environments (not quite,
 172     but sort of requires a \verb+/home/tony/+ !)
 173   \item
 174   \end{itemize}
 175 \end{frame}
 176
 177 \begin{frame}{Common Lisp}
 178   advanced iteration
 179 \end{frame}
 180
 181
 182 \begin{frame}[fragile]{Finding out things}
 183   \begin{itemize}
 184   \item CL-NUMLIB
 185      num-sequence :from LOW :to HIGH :length SEQ-LENGTH
 186      seq(from,to,by/length)
 187    \item
 188 \begin{verbatim}
 189 (documentation
 190      'cl-numlib:num-sequence
 191      'function)
 192 \end{verbatim}
 193    \item This
 194   \end{itemize}
 195 \end{frame}
 196
 197
 198 \section{Computable Statistics}
 199
 200 \begin{frame}{Why are we doing this?}
 201     Computable and Executable Statistics
 202 \end{frame}
 203
 204 \begin{frame}{Can we compute with them?}
 205   3 Examples:
 206   \begin{itemize}
 207   \item Research
 208   \item Consulting
 209   \item Reimplementation
 210   \end{itemize}
 211   Consider whether one can ``compute'' with the information given.
 212 \end{frame}
 213
 214 \begin{frame}[fragile]{Example 1: Theory\ldots}
 215   \label{example1}
 216   Let $f(x;\theta)$ describe the likelihood of XX under the following
 217   assumptions.
 218   \begin{enumerate}
 219   \item assumption-1
 220   \item assumption-2
 221   \end{enumerate}
 222   Then if we use the following algorithm:
 223   \begin{enumerate}
 224   \item step-1
 225   \item step-2
 226   \end{enumerate}
 227   then $\hat{\theta}$ should be $N(0,\hat\sigma^2)$ with the following
 228   characteristics\ldots
 229 \end{frame}
 230
 231 \begin{frame}
 232   \frametitle{Can we compute, using this description?}
 233   Given the information at hand:
 234   \begin{itemize}
 235   \item we ought to have a framework for initial coding for the
 236     actual simulations (test-first!)
 237   \item the implementation is somewhat clear
 238   \item We should ask: what theorems have similar assumptions?
 239   \item We should ask: what theorems have similar conclusions but
 240     different assumptions?
 241   \end{itemize}
 242 \end{frame}
 243
 244 \begin{frame}[fragile]{Realizing Theory}
 245   {\small
 246 \begin{verbatim}
 247 (define-theorem my-proposed-theorem
 248    (:theorem-type '(distribution-properties
 249                     frequentist
 250                     likelihood))
 251    (:assumes '(assumption-1 assumption-2))
 252    (:likelihood-form
 253       (defun likelihood (data theta gamma)
 254         (exponential-family theta gamma)))
 255    (:compute-by
 256       '(progn
 257          (compute-starting-values thetahat gammahat)
 258          (until (convergence)
 259            (setf convergence
 260                  (or (step-1 thetahat)
 261                      (step-2 gammahat))))))
 262    (:claim (assert
 263              (and (equal-distribution thetahat 'normal)
 264                   (equal-distribution gammahat 'normal)))))
 265 \end{verbatim}
 266   }
 267 \end{frame}
 268
 269 \begin{frame}[fragile]{It would be nice to have}
 270 \begin{verbatim}
 271    (theorem-veracity 'my-proposed-theorem)
 272 \end{verbatim}
 273 \end{frame}
 274
 275 \begin{frame}[fragile]{and why not...?}
 276 \begin{verbatim}
 277    (when (theorem-veracity
 278               'my-proposed-theorem)
 279       (write-paper 'my-proposed-theorem
 280                    :style :JASA
 281                    :output-format
 282                          '(LaTeX MSWord)))
 283 \end{verbatim}
 284 \end{frame}
 285
 286 \begin{frame}{Comments}
 287   \begin{itemize}
 288   \item The general problem is very difficult
 289   \item Some progress has been made in small areas of basic
 290     statistics: currently working on linear regression (LS-based,
 291     Normal-bayesian) and the T-test.
 292   \item Areas targeted for medium-term future: resampling methods and
 293     similar algorithms.
 294   \end{itemize}
 295
 296 \end{frame}
 297
 298 \begin{frame}
 299   \frametitle{Example 2: Practice\ldots}
 300   \label{example2}
 301   The dataset comes from a series of clinical trials.  We model the
 302   primary endpoint, ``relief'', as a binary random variable.  There is
 303   a random trial effect on relief as well as severity due to
 304   differences in recruitment and inclusion/exclusion criteria.
 305 \end{frame}
 306
 307 \begin{frame}
 308   \frametitle{Can we compute, using this description?}
 309   \begin{itemize}
 310   \item With a real description of this kind, it is clear what some of the
 311     potential models might be for this dataset
 312   \item It should be clear how to start thinking of a data dictionary
 313     for this problem.
 314   \end{itemize}
 315 \end{frame}
 316
 317 \begin{frame}[fragile]{Can we compute?}
 318 \begin{verbatim}
 319   (dataset-metadata paper-1
 320     :context 'clinical-trials
 321     :variables '((relief :model-type dependent
 322                          :distribution binary)
 323                  (trial  :model-type independent
 324                          :distribution categorical)
 325                  (disease-severity))
 326     :metadata '(inclusion-criteria
 327                 exclusion-criteria
 328                 recruitment-rate))
 329   (propose-analysis paper-1)
 330      ; => '(tables
 331      ;      (logistic regression))
 332 \end{verbatim}
 333 \end{frame}
 334
 335 \begin{frame}{Example 3: The Round-trip\ldots}
 336   \label{example3}
 337   The first examples describe ``ideas $\rightarrow$ code''
 338
 339   Consider the last time you read someone else's implementation of a
 340   statistical procedure (e.g., R package code).  When you read the
 341   code, could you see:
 342   \begin{itemize}
 343   \item the assumptions used?
 344   \item the algorithm implemented?
 345   \item practical guidance for when you might select the algorithm
 346     over others?
 347   \item practical guidance for when you might select the
 348     implementation over others?
 349   \end{itemize}
 350   These are usually components of any reasonable journal article.
 351   \textit{(Q: have you actually read an R package that wasn't yours?)}
 352 \end{frame}
 353
 354 \begin{frame}{Exercise left to the reader!}
 355
 356   (aside: I have been looking at the \textbf{stats} and \textbf{lme4}
 357   packages recently -- \textit{for me}, very clear numerically, much
 358   less so statistically)
 359 \end{frame}
 360
 361
 362
 363 \section{Discussion}
 364
 365 \begin{frame}{Conclusion}
 366   \begin{itemize}
 367   \item Numerics: Linear algebra basics done -- full development
 368   \end{itemize}
 369 \end{frame}
 370
 371
 372 \end{document}
 373
 374 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 375
 376
 377 \section{Preliminaries}
 378
 379 \subsection{Context}
 380
 381 \begin{frame}{Goals for this Talk}{(define, strategic approach,
 382     justify)}
 383
 384   \begin{itemize}
 385   \item To describe the concept of \alert{computable and executable
 386       statistics}, placing it in a historical context.
 387
 388   \item To demonstrate that \alert{a research program}
 389     implemented through  simple steps can increase the efficiency  of
 390     statistical computing approaches by  clearly describing both:
 391     \begin{itemize}
 392     \item numerical characteristics of procedures,
 393     \item statistical concepts driving them.
 394     \end{itemize}
 395
 396   \item To justify that the \alert{approach is worthwhile} and
 397     represents a staged effort towards \alert{increased use of best
 398       practices}.
 399   \end{itemize}
 400   (unfortunately, the last is still incomplete)
 401 \end{frame}
 402
 403
 404 \begin{frame}{Historical Computing Languages}
 405   \begin{itemize}
 406   \item FORTRAN : FORmula TRANslator.  Original numerical computing
 407     language, designed for clean implementation of numerical
 408     algorithms
 409   \item LISP : LISt Processor.  Associated with symbolic
 410     manipulation, AI, and knowledge approaches
 411   \end{itemize}
 412
 413   They represent the 2 generalized needs of statistical computing,
 414   which could be summarized as
 415   \begin{itemize}
 416   \item algorithms/numerics,
 417   \item elicitation, communication, and generation of knowledge (``data
 418     analysis'')
 419   \end{itemize}
 420 \end{frame}
 421
 422 \begin{frame}{Statistical Computing Environments}
 423
 424   Past:
 425   \begin{itemize}
 426   \item SPSS / BMDP / SAS
 427   \item S ( S, S-PLUS, R)
 428   \item LispStat ( XLispStat,  ViSta, ARC , CommonLispStat ) ; QUAIL
 429   \item XGobi (Orca / GGobi / Statistical Reality Engine)
 430   \item MiniTab
 431   \item Stata
 432   \item DataDesk
 433   \item Augsburg Impressionist series (MANET, \ldots)
 434   \item Excel
 435   \end{itemize}
 436   many others...
 437
 438 \end{frame}
 439
 440 \begin{frame}{How many are left?}
 441
 442   \begin{itemize}
 443   \item R
 444   \item SAS
 445   \item SPSS
 446   \item Stata
 447   \item Minitab
 448   \item very few others...
 449   \end{itemize}
 450   ``R is the Microsoft of the statistical computing world'' -- anonymous.
 451 \end{frame}
 452
 453 \begin{frame}{Selection Pressure}
 454   \begin{itemize}
 455   \item the R user population is growing rapidly, fueled by critical
 456     mass, quality, and value
 457   \item R is a great system for applied data analysis
 458   \item R is not such a great system for research into statistical
 459     computing (backwards compatibility, inertia due to user population)
 460   \end{itemize}
 461   There is a need for alternative experiments for developing new
 462   approaches/ideas/concepts.
 463 \end{frame}
 464
 465 \begin{frame}{Philosophically, why Common Lisp?}
 466   Philosophically:
 467   \begin{itemize}
 468   \item Lisp can cleanly present computational intentions, both
 469     symbolically and numerically.
 470   \item Semantics and context are important: well supported by Lisp
 471     paradigms.
 472   \item Lisp's parentheses describe singular, multi-scale,
 473     \alert{complete thoughts}.
 474   \end{itemize}
 475
 476 \end{frame}
 477
 478 \begin{frame}{Technically, why Common Lisp?}
 479   \begin{itemize}
 480   \item interactive COMPILED language (``R with a compiler'')
 481   \item CLOS is R's S4 object system ``done right''.
 482   \item clean semantics: modality, typing, can be expressed the way
 483     one wants it.
 484   \item programs are data, data are programs, leading to
 485   \item Most modern computing tools available (XML, WWW technologies)
 486   \item ``executable XML''
 487   \end{itemize}
 488   Common Lisp is very close in usage to how people currently use R
 489   (mostly interactive, some batch, and a wish for compilation efficiency).
 490 \end{frame}
 491
 492 \subsection{Background}
 493
 494 \begin{frame}
 495   \frametitle{Desire: Semantics and Statistics}
 496   \begin{itemize}
 497   \item The semantic web (content which is self-descriptive) is an
 498     interesting and potentially useful idea.
 499
 500   \item
 501     Biological informatics support (GO, Entrez) has allowed for
 502     precise definitions of concepts in biology.
 503
 504   \item It is a shame that a field like statistics, requiring such
 505     precision, has less of this than an imprecise and temporally unstable
 506     field such as biology\ldots
 507   \end{itemize}
 508
 509   How can we express statistical work (research, applied work) which
 510   is both human and computer readable (perhaps subject to
 511   transformations first)?
 512 \end{frame}
 513
 514
 515 % \subsection{Context}
 516
 517 % \begin{frame}{Context}{(where I'm coming from, my ``priors'')}
 518 %   \begin{itemize}
 519 %   \item Pharmaceutical Industry
 520 %   \item Modeling and Simulation uses mathematical models/constructs to
 521 %     record beliefs (biology, pharmacology, clinical science) for
 522 %     explication, clinical team alignment, decision support, and
 523 %     quality.
 524 %   \item My work at Novartis is at the intersection of biomedical
 525 %     informatics, statistics, and mathematical modeling.
 526 %   \item As manager: I need a mix of applications and novel research development to
 527 %     solve our challenges better, faster, more efficiently.
 528 %   \item Data analysis is a specialized approach to computer
 529 %     programming, \alert{different} than applications programming or
 530 %     systems programming.
 531 %   \end{itemize}
 532 % \end{frame}
 533
 534
 535 \subsection{Literate Programming is insufficient}
 536
 537 \begin{frame}{Literate Statistical Practice.}
 538   \begin{enumerate}
 539   \item Literate Programming applied to data analysis (Rossini, 1997/2001)
 540   \item among the \alert{most annoying} techniques to integrate into
 541     work-flow if one is not perfectly methodical.
 542   \item Some tools:
 543     \begin{itemize}
 544     \item ESS: supports interactive creation of literate programs.
 545     \item Sweave: tool which exemplifies reporting context; odfWeave
 546       primarily simplifies reporting.
 547     \item Roxygen: primarily supports a literate programming
 548       documentation style, not a literate data analysis programming
 549       style.
 550   \end{itemize}
 551   \item ROI demonstrated in specialized cases: BioConductor.
 552   \item \alert{usually done after the fact} (final step of work-flow)
 553     as a documentation/computational reproducibility technique, rarely
 554     integrated into work-flow.
 555   \end{enumerate}
 556   Many contributors:
 557   Knuth, Claerbout, Carey, de Leeuw, Leisch, Gentleman, Temple-Lang,
 558   \ldots{}
 559 \end{frame}
 560
 561 \begin{frame}
 562   \frametitle{Literate Programming}
 563   \framesubtitle{Why isn't it enough for Data Analysis?}
 564
 565   Only 2 contexts: (executable) code and documentation.  Fine for
 566   application programming,  but for data analysis, we could benefit
 567   from:
 568   \begin{itemize}
 569   \item classification of statistical procedures
 570   \item descriptions of assumptions
 571   \item pragmatic recommendations
 572   \item inheritance of structure through the work-flow of a
 573     statistical methodology or data analysis project
 574   \item datasets and metadata
 575   \end{itemize}
 576   Concept: ontologies describing mathematical assumptions, applications
 577   of methods, work-flow, and statistical data structures can enable
 578   machine communication.
 579
 580   (i.e., an informatics framework \`a la biology)
 581 \end{frame}
 582
 583
 584 \begin{frame}{Communication in Statistical Practice}{\ldots is essential for \ldots}
 585   \begin{itemize}
 586   \item finding
 587   \item explanations
 588   \item agreement
 589   \item receiving information
 590   \end{itemize}
 591   \alert{``machine-readable'' communication/computation lets the
 592     computer help} \\
 593   Semantic Web is about ``machine-enabled computability''.
 594 \end{frame}
 595
 596 \begin{frame}  \frametitle{Semantics}
 597   \framesubtitle{One definition: description and context}
 598
 599   Interoperability is the key, with respect to
 600   \begin{itemize}
 601   \item ``Finding things''
 602   \item Applications and activities with related functionality
 603     \begin{itemize}
 604     \item moving information from one state to another (paper, journal
 605       article, computer program)
 606     \item computer programs which implement solutions to similar tasks
 607     \end{itemize}
 608   \end{itemize}
 609 \end{frame}
 610
 611
 612 \begin{frame}{Statistical Practice is somewhat restricted}
 613   {...but in a good sense, enabling potential for semantics...}
 614
 615   There is a restrictable set of intended actions for what can be done
 616   -- the critical goal is to be able to make a difference by
 617   accelerating activities that should be ``computable'':
 618   \begin{itemize}
 619   \item restricted natural language processing
 620   \item mathematical translation
 621   \item common description of activities for simpler programming/data
 622     analysis (S approach to objects and methods)
 623   \end{itemize}
 624   R is a good basic start (model formulation approach, simple
 625   ``programming with data'' paradigm); we should see if we can do
 626   better!
 627 \end{frame}
 628
 629 \begin{frame}{Computable and Executable Statistics requires}
 630
 631   \begin{itemize}
 632   \item approaches to describe data and metadata (``data'')
 633     \begin{itemize}
 634     \item semantic WWW
 635     \item metadata management and integration, driving
 636     \item data integration
 637     \end{itemize}
 638   \item approaches to describe data analysis methods (``models'')
 639     \begin{itemize}
 640     \item quantitatively: many ontologies (AMS, etc), few meeting
 641       statistical needs.
 642     \item many substantive fields have implementations
 643       (bioinformatics, etc) but not well focused.
 644     \end{itemize}
 645   \item approaches to describe the specific form of interaction
 646     (``instances of models'')
 647     \begin{itemize}
 648     \item Original idea behind ``Literate Statistical Analysis''.
 649     \item That idea is suboptimal, more structure needed (not
 650       necessarily built upon existing...).
 651     \end{itemize}
 652   \end{itemize}
 653 \end{frame}
 654
 655 \subsection{Common Lisp Statistics}
 656
 657 \begin{frame}
 658   \frametitle{Interactive Programming}
 659   \framesubtitle{Everything goes back to being Lisp-like}
 660   \begin{itemize}
 661   \item Interactive programming (as originating with Lisp): works
 662     extremely well for data analysis (Lisp being the original
 663     ``programming with data'' language).
 664   \item Theories/methods for how to do this are reflected in styles
 665     for using R.
 666   \end{itemize}
 667 \end{frame}
 668
 669 \begin{frame}[fragile]
 670   \frametitle{Lisp}
 671
 672   Lisp (LISt Processor) is different than most high-level computing
 673   languages, and is very old (1958).  Lisp is built on lists of things
 674   which are evaluatable.
 675 \begin{verbatim}
 676 (functionName data1 data2 data3)
 677 \end{verbatim}
 678   or ``quoted'':
 679 \begin{verbatim}
 680 '(functionName data1 data2 data3)
 681 \end{verbatim}
 682   which is shorthand for
 683 \begin{verbatim}
 684 (quote (functionName data1 data2 data3))
 685 \end{verbatim}
 686   The difference is important -- lists of data (the second/third) are
 687   not (yet?!) functions applied to (unencapsulated lists of) data (the first).
 688 \end{frame}
 689
 690 \begin{frame}
 691   \frametitle{Features}
 692   \begin{itemize}
 693   \item Data and Functions semantically the same
 694   \item Natural interactive use through functional programming with
 695     side effects
 696   \item Batch is a simplification of interactive -- not a special mode!
 697   \end{itemize}
 698 \end{frame}
 699
 700
 701
 702 \begin{frame}[fragile]{Representation: XML and Lisp}{executing your data}
 703   Many people are familiar with XML:
 704 \begin{verbatim}
 705 <name phone="+41793674557">Tony Rossini</name>
 706 \end{verbatim}
 707   which is shorter in Lisp:
 708 \begin{verbatim}
 709 (name "Tony Rossini" :phone "+41613674557")
 710 \end{verbatim}
 711   \begin{itemize}
 712   \item Lisp ``parens'', universally hated by unbelievers, are
 713     wonderful for denoting when a ``concept is complete''.
 714   \item Why can't your data self-execute?
 715   \end{itemize}
 716 \end{frame}
 717
 718 \begin{frame}[fragile]{Numerics with Lisp}
 719   \begin{itemize}
 720   \item addition of rational numbers and arithmetic
 721   \item example for mean
 722 \begin{verbatim}
 723  (defun mean (x)
 724     (checktype x 'vector-like)
 725     (/ (loop for i from 0 to (- (nelts *x*) 1)
 726           summing (vref *x* i))
 727        (nelts *x*)))
 728 \end{verbatim}
 729   \item example for variance
 730 \begin{verbatim}
 731 (defun variance (x)
 732   (let ((meanx (mean x))
 733         (nm1 (1- (nelts x))))
 734      (/ (loop for i from 0 to nm1
 735            summing (power (- (vref *x* i) meanx) 2))
 736         nm1)))
 737 \end{verbatim}
 738   \item But through macros, \verb+(vref *x* i)+ could be
 739     \verb+#V(X[i])+ or your favorite syntax.
 740   \end{itemize}
 741
 742 \end{frame}
 743
 744
 745 \begin{frame}{Common Lisp Statistics 1}
 746   \begin{itemize}
 747   \item Originally based on LispStat (reusability)
 748   \item Re-factored structure (some numerics worked with a 1990-era code base).
 749   \item Current activities:
 750     \begin{enumerate}
 751     \item numerics redone using CFFI-based BLAS/LAPACK (cl-blapack)
 752     \item matrix interface based on MatLisp
 753     \item starting design of a user interface system (interfaces,
 754       visuals).
 755     \item general framework for model specification (regression,
 756       likelihood, ODEs)
 757     \item general framework for algorithm specification (bootstrap,
 758       MLE, algorithmic data analysis methods).
 759     \end{enumerate}
 760   \end{itemize}
 761 \end{frame}
 762
 763 \begin{frame}{Common Lisp Statistics 2}
 764
 765   \begin{itemize}
 766   \item Implemented using SBCL.  Contributed fixes for
 767     Clozure/OpenMCL. Goal to target CLISP
 768   \item Supports LispStat prototype object system
 769   \item Package-based design -- only use the components you need, or
 770     the components whose API you like.
 771   \end{itemize}
 772 \end{frame}
 773
 774 \section{Discussion}
 775
 776 \begin{frame}
 777   \frametitle{Outlook}
 778   \begin{itemize}
 779   \item Semantics and Computability have captured a great deal of
 780     attention in the informatics and business computing R\&D worlds
 781   \item Statistically-driven Decision Making and Knowledge Discovery
 782     is, with high likelihood, the next challenging stage after data
 783     integration.
 784   \item Statistical practice (theory and application) can be enhanced,
 785     made more efficient, providing  increased benefit to organizations
 786     and groups using appropriate methods.
 787   \item Lisp, as a language, shares characteristics of both Latin
 788     (difficult dead language useful for classical training) and German
 789     (difficult living language useful for general life).  Of course,
 790     for some people, they are not difficult.
 791   \end{itemize}
 792
 793 \end{frame}
 794
 795 \begin{frame}
 796   The research program described in this talk is currently driving the
 797   design of CommonLisp Stat, which leverages concepts and approaches
 798   from the dead and moribund LispStat project.
 799
 800   \begin{itemize}
 801   \item \url{http://repo.or.cz/w/CommonLispStat.git/}
 802   \item \url{http://www.github.com/blindglobe/}
 803   \end{itemize}
 804
 805 \end{frame}
 806 \begin{frame}{Final Comment}
 807
 808   \begin{itemize}
 809   \item In the Pharma industry, it is all about getting the right
 810     drugs to the patient faster.  Data analysis systems seriously
 811     impact this process, being potentially an impediment or an
 812     accelerator.
 813
 814     \begin{itemize}
 815     \item \alert{Information technologies can increase the efficiency
 816         of statistical practice}, though innovation change management
 817       must be taken into account.  (i.e. Statistical practice, while
 818       considered by some an ``art form'', can benefit from
 819       industrialization).
 820     \item \alert{Lisp's features match the basic requirements we need}
 821       (dichotomy: programs as data, data as programs).  Sales pitch,
 822       though...
 823     \item Outlook: Lots of work and experimentation to do!
 824     \end{itemize}
 825   \item {\tiny Gratuitous Advert: We are hiring, have student
 826       internships (undergrad, grad students), and a visiting faculty
 827       program.  Talk with me if possibly interested.}
 828   \end{itemize}
 829 \end{frame}
 830
 831
 832 % % All of the following is optional and typically not needed.
 833 % \appendix
 834
 835
 836 % \section<presentation>*{\appendixname}
 837
 838
 839 % \begin{frame} \frametitle{Complements and Backup}
 840 %   No more, stop here.  Questions?  (now or later).
 841 % \end{frame}
 842
 843 % \begin{frame}{The Industrial Challenge.}{Getting the Consulting Right.}
 844 %   % - A title should summarize the slide in an understandable fashion
 845 %   %   for anyone who does not follow everything on the slide itself.
 846
 847 %   \begin{itemize}
 848 %   \item Recording assumptions for the next data analyst, reviewer.
 849 %     Use \texttt{itemize} a lot.
 850 %   \item
 851 %     Use very short sentences or short phrases.
 852 %   \end{itemize}
 853 % \end{frame}
 854
 855
 856 % \begin{frame}{The Industrial Challenge.}{Getting the Right Research Fast.}
 857 %   % - A title should summarize the slide in an understandable fashion
 858 %   %   for anyone who does not follow everything on the slide itself.
 859
 860 %   \begin{itemize}
 861 %   \item
 862 %     Use \texttt{itemize} a lot.
 863 %   \item
 864 %     Use very short sentences or short phrases.
 865 %   \end{itemize}
 866 % \end{frame}
 867
 868
 869 % \begin{frame}{Explicating the Work-flow}{QA/QC-based improvements.}
 870
 871
 872 % \end{frame}
 873
 874 % \section{Motivation}
 875
 876 % \subsection{IT Can Speed up Deliverables in Statistical Practice}
 877
 878 % \begin{frame}{Our Generic Work-flow and Life-cycle}
 879 %   {describing most data analytic activities}
 880 %   Workflow:
 881 %   \begin{enumerate}
 882 %   \item Scope out the problem
 883 %   \item Sketch out a potential solution
 884 %   \item Implement until road-blocks appear
 885 %   \item Deliver results
 886 %   \end{enumerate}
 887
 888 %   Lifecycle:
 889 %   \begin{enumerate}
 890 %   \item paper sketch
 891 %   \item 1st e-draft of text/code/date (iterate to \#1, discarding)
 892 %   \item cycle through work
 893 %   \item publish
 894 %   \item ``throw-away''
 895 %   \end{enumerate}
 896 %   but there is valuable information that could enable the next
 897 %   generation!
 898 % \end{frame}
 899
 900 % \begin{frame}[fragile]{Paper $\rightarrow$ Computer  $\rightarrow$ Article $\rightarrow$ Computer}{Cut and Paste makes for large errors.}
 901 %   \begin{itemize}
 902 %   \item Problems in a regulatory setting
 903 %   \item Regulatory issues are just ``best practices''
 904 %   \end{itemize}
 905
 906 %   Why do we ``copy/paste'', or analogously, restart our work?
 907
 908 %   pro:
 909 %   \begin{itemize}
 910 %   \item every time we repeat, we reinforce the idea in our brain
 911 %   \item review of ideas can help improve them
 912 %   \end{itemize}
 913 %   con:
 914 %   \begin{itemize}
 915 %   \item inefficiency
 916 %   \item introduction of mistakes
 917 %   \item loss of historical context
 918 %   \item changes to earlier work (on a different development branch)
 919 %     cannot propagate.
 920 %   \end{itemize}
 921 % \end{frame}
 922
 923 % \section{Semantics and Statistical Practice}
 924
 925
 926 % \begin{frame}
 927 %   \frametitle{Statistical Activity Leads to Reports}
 928 %   \framesubtitle{You read what you know, do you understand it?}
 929
 930 %   How can we improve the communication of the ideas we have?
 931
 932 %   Precision of communication?
 933
 934 % \end{frame}
 935
 936
 937
 938 % \begin{frame}  \frametitle{Communication Requires Context}
 939 %   \framesubtitle{Intentions imply more than one might like...}
 940
 941 %   \begin{itemize}
 942 %   \item Consideration of what we might do
 943 %   \item Applications with related functionality
 944 %   \end{itemize}
 945 % \end{frame}
 946
 947
 948
 949 % \begin{frame}
 950 %   \frametitle{Design Patterns}
 951 %   \framesubtitle{Supporting Work-flow Transitions}
 952
 953 %   (joint work with H Wickham): The point of this research program is
 954 %   not to describe what to do at any particular stage of work, but to
 955 %   encourage researchers and practitioners to consider how to
 956 %   translate and transfer information between stages so that work
 957 %   is not lost.
 958
 959 %   Examples of stages in a work-flow:
 960 %   \begin{itemize}
 961 %   \item planning, execution, reporting;
 962 %   \item scoping, illustrative examples or counter examples, algorithmic construction,
 963 %     article writing.
 964 %   \item descriptive statistics, preliminary inferential analysis,
 965 %     model/assumption checking, final inferential analysis,
 966 %     communication of scientific results
 967 %   \end{itemize}
 968 %   Description of work-flows is essential to initiating discussions on
 969 %   quality/efficiency of approaches to work.
 970 % \end{frame}
 971
 972 % \section{Design Challenges}
 973
 974 % \begin{frame}
 975 %   \frametitle{Activities are enhanced by support}
 976
 977 %   \begin{itemize}
 978 %   \item Mathematical manipulation can be enhanced by symbolic
 979 %     computation
 980 %   \item Statistical programming can be enabled by examples and related
 981 %     algorithm implementation
 982 %   \item Datasets, to a limited extent, can self-describe.
 983 %   \end{itemize}
 984 % \end{frame}
 985
 986 % \begin{frame}
 987 %   \frametitle{Executable and Computable Science}
 988
 989 %   Use of algorithms and construction to describe how things work.
 990
 991 %   Support for agent-based approaches
 992 % \end{frame}
 993
 994
 995 % \begin{frame}
 996 %   \frametitle{What is Data?  Metadata?}
 997
 998 %   Data: what we've observed
 999
1000 %   MetaData: context for observations, enables semantics.
1001 % \end{frame}
1002
1003
1004
1005
1006 % % \begin{frame}[fragile]
1007 % %   \frametitle{Defining Variables}
1008 % %   \framesubtitle{Setting variables}
1009 % % \begin{verbatim}
1010 % % (setq <variable> <value>)
1011 % % \end{verbatim}
1012 % %   Example:
1013 % % \begin{verbatim}
1014 % % (setq ess-source-directory
1015 % %       "/home/rossini/R-src")
1016 % % \end{verbatim}
1017 % % \end{frame}
1018
1019 % % \begin{frame}[fragile]
1020 % %   \frametitle{Defining on the fly}
1021 % % \begin{verbatim}
1022 % % (setq ess-source-directory
1023 % %    (lambda () (file-name-as-directory
1024 % %          (expand-file-name
1025 % %            (concat (default-directory)
1026 % %                    ess-suffix "-src")))))
1027 % % \end{verbatim}
1028 % %   (Lambda-expressions are anonymous functions, i.e. ``instant-functions'')
1029 % % \end{frame}
1030
1031
1032 % % \begin{frame}[fragile]
1033 % %   \frametitle{Function Reuse}
1034 % %   By naming the function, we could make the previous example reusable
1035 % %   (if possible):
1036 % % \begin{verbatim}
1037 % % (defun my-src-directory ()
1038 % %       (file-name-as-directory
1039 % %          (expand-file-name
1040 % %            (concat (default-directory)
1041 % %                    ess-suffix "-src"))))
1042 % % \end{verbatim}
1043 % %   Example:
1044 % % \begin{verbatim}
1045 % % (setq ess-source-directory (my-src-directory))
1046 % % \end{verbatim}
1047 % % \end{frame}
1048
1049
1050 % % \begin{frame}
1051 % %   \frametitle{Equality Among Packages}
1052 % %   \begin{itemize}
1053 % %   \item more/less equal can be described specifically through
1054 % %     overriding imports.
1055 % %   \end{itemize}
1056 % % \end{frame}
1057
1058
1059 % \subsection<presentation>*{For Further Reading}
1060
1061 % \begin{frame}[allowframebreaks]
1062 %   \frametitle<presentation>{Related Material}
1063
1064 %   \begin{thebibliography}{10}
1065
1066 %   \beamertemplatebookbibitems
1067 %   % Start with overview books.
1068
1069 %   \bibitem{LispStat1990}
1070 %     L.~Tierney
1071 %     \newblock {\em LispStat}.
1072
1073 %   \beamertemplatearticlebibitems
1074 %   % Followed by interesting articles. Keep the list short.
1075
1076 %   \bibitem{Rossini2001}
1077 %     AJ.~Rossini
1078 %     \newblock Literate Statistical Practice
1079 %     \newblock {\em Proceedings of the Conference on Distributed
1080 %       Statistical Computing}, 2001.
1081
1082 %   \bibitem{RossiniLeisch2003}
1083 %     AJ.~Rossini and F.~Leisch
1084 %     \newblock Literate Statistical Practice
1085 %     \newblock {\em Technical Report Series, University of Washington
1086 %       Department of Biostatistics}, 2003.
1087
1088 %   \beamertemplatearrowbibitems
1089 %   % Finish with software and online resources. Keep the list short.
1090
1091 %   \bibitem{CLS}
1092 %     Common Lisp Stat, 2008.
1093 %     \newblock \url{http://repo.or.cz/CommonLispStat.git/}
1094
1095 %   \end{thebibliography}
1096 % \end{frame}