small clean up of David's section, more to go.
[CommonLispStat.git] / Doc / talks / Rossini-Vandy-Oct2009.tex
blob69c479edab5b021ebec6b35d6f73d09bba32c859
1 \documentclass{beamer}
3 \mode<presentation>
5 \usetheme{classic}
6 \setbeamercovered{transparent}
9 \usepackage[english]{babel}
10 \usepackage[latin1]{inputenc}
11 \usepackage{times}
12 \usepackage[T1]{fontenc}
13 \usepackage{url}
15 \title[CLS]{Back to the Future: Life beyond R, by going back to Lisp}
16 \subtitle{Using History to Design better data analysis systems}
17 \author[Rossini]{Anthony~(Tony)~Rossini}
19 \institute[Novartis and University of Washington]{
20 Group Head, Modeling and Simulation Statistics\\
21 Novartis Pharma AG, Switzerland
22 \and
23 Affiliate Assoc Prof, Biomedical and Health Informatics\\
24 University of Washington, USA}
26 \date[Vanderbilt]{Vanderbilt, Sept 2009, Nashville USA}
27 \subject{Statistical Computing Environments}
29 \begin{document}
31 \begin{frame}
32 \titlepage
33 \end{frame}
35 \begin{frame}[fragile]{Intro to Lisp notation}
36 \begin{verbatim}
37 ;; This is a comment
39 and so is this
41 '(a list of things to become data)
42 (list a list of things to become data)
43 (what-I-execute with-one-thing with-two-thing)
44 ;; that is:
45 (my-fcn-name input1
46 input2) ; and to auto-gen input1:
47 (my-fcn-name (my-fcn-name input3 input4)
48 input2)
49 \end{verbatim}
50 \end{frame}
53 \begin{frame}{What do you do?}
54 When you begin to work on an activity (methodological, theoretical,
55 application/substantive), and go to the keyboard to work on a
56 computer, \textbf{what do you do?}
57 \end{frame}
60 \section{Computable Statistics}
62 \begin{frame}{Can we compute with them?}
63 3 Toy Examples:
64 \begin{itemize}
65 \item Statistical Research (``Annals work'')
66 \item Consulting, Applied Statistics, Scientific Honesty.
67 \item Reimplementation.
68 \end{itemize}
69 Can we ``compute'' with the information given? That is:
70 \begin{itemize}
71 \item do we have sufficient information to communicate enough for
72 the right person to understand or recreate the effort?
73 \item have we sufficient clarity to prevent misunderstandings about
74 intentions and claims?
75 \end{itemize}
77 \end{frame}
79 \begin{frame}[fragile]{Example 1: Theory\ldots}
80 \label{example1}
81 Let $f(x;\theta)$ describe the likelihood of XX under the following
82 assumptions.
83 \begin{enumerate}
84 \item assumption-1
85 \item assumption-2
86 \end{enumerate}
87 Then if we use the following algorithm:
88 \begin{enumerate}
89 \item step-1
90 \item step-2
91 \end{enumerate}
92 then $\hat{\theta}$ should be $N(0,\hat\sigma^2)$ with the following
93 characteristics\ldots
94 \end{frame}
96 \begin{frame}
97 \frametitle{Can we compute, using this description?}
98 Given the information at hand:
99 \begin{itemize}
100 \item we ought to have a framework for initial coding for the
101 actual simulations (test-first!)
102 \item the implementation is somewhat clear
103 \item We should ask: what theorems have similar assumptions?
104 \item We should ask: what theorems have similar conclusions but
105 different assumptions?
106 \end{itemize}
107 \end{frame}
108 \begin{frame}[fragile]{Realizing Theory}
109 \small{
110 \begin{verbatim}
111 (define-theorem my-proposed-theorem
112 (:theorem-type '(distribution-properties
113 frequentist likelihood))
114 (:assumes '(assumption-1 assumption-2))
115 (:likelihood-form
116 (defun likelihood (data theta gamma)
117 (exponential-family theta gamma)))
118 (:compute-by
119 '(progn
120 (compute-start-values thetahat gammahat)
121 (until (convergence)
122 (setf convergence
123 (or (step-1 thetahat)
124 (step-2 gammahat))))))
125 (:claim (equal-distr '(thetahat gammahat) 'normal))))
126 \end{verbatim}
128 \end{frame}
130 \begin{frame}[fragile]{It would be nice to have}
131 \begin{verbatim}
132 (theorem-veracity 'my-proposed-theorem)
133 \end{verbatim}
134 returning some indication of how well it met given computable claims,
135 modulo what proportion of computable claims could be tested.
136 \begin{itemize}
137 \item and have it run some illustrative simulations which suggest
138 which might be problematic in real situations, and real situations
139 for which there are no problems.
140 \item and work through some of the logic based on related claims using
141 identical assumptions to confirm some of the results
142 \end{itemize}
143 \end{frame}
145 \begin{frame}[fragile]{and why not...?}
146 \begin{verbatim}
147 (when (> (theorem-veracity
148 'my-proposed-theorem)
149 0.8)
150 (make-draft-paper 'my-proposed-theorem
151 :style :JASA
152 :output-formats
153 '(LaTeX MSWord)))
154 \end{verbatim}
155 \end{frame}
157 \begin{frame}{Comments}
158 \begin{itemize}
159 \item Of course the general problem is very difficult, but one must
160 start somewhere.
161 \item I'm working on some basic statistical proofs of concept (not
162 finished): T-Test, linear regression (LS-based, Normal-Normal Bayesian)
163 \item Areas targeted for medium-term future: resampling methods and
164 similar algorithms.
165 \end{itemize}
166 \end{frame}
168 \begin{frame}
169 \frametitle{Example 2: Practice\ldots}
170 \label{example2}
171 The dataset comes from a series of clinical trials, some with active
172 control and others using placebo control. We model the primary
173 endpoint, ``relief'', as a binary random variable. There is a
174 random trial effect on relief as well as severity due to differences
175 in recruitment and inclusion/exclusion criteria from 2 different
176 trial networks.
177 \end{frame}
179 \begin{frame}
180 \frametitle{Can we compute, using this description?}
181 \begin{itemize}
182 \item With a real such description, it is clear what some of the
183 potential models might be for this dataset
184 \item It should be clear how to start thinking of a data dictionary
185 for this problem.
186 \end{itemize}
187 \end{frame}
189 \begin{frame}[fragile]{Can we compute?}
190 \begin{verbatim}
191 (dataset-metadata paper-1
192 :context 'clinical-trial 'randomized
193 'active-ctrl 'placebo-ctrl 'metaanalysis
194 :variables '((relief :model-type dependent
195 :distr binary)
196 (trial :model-type independent
197 :distr categorical)
198 (disease-severity))
199 :metadata '(incl-crit-net1 excl-crit-net1
200 incl-crit-net2 excl-crit-net2
201 recr-rate-net1 recr-rate-net2))
202 (propose-analysis paper-1)
203 ; => (list 'tables '(logistic-regression))
204 \end{verbatim}
205 \end{frame}
207 \begin{frame}{Example 3: The Round-trip\ldots}
208 \label{example3}
209 The first examples describe ``ideas $\rightarrow$ code''
211 Consider the last time you read someone else's implementation of a
212 statistical procedure (i.e. R package code). When you read the
213 code, could you see:
214 \begin{itemize}
215 \item the assumptions used?
216 \item the algorithm implemented?
217 \item practical guidance for when you might select the algorithm
218 over others?
219 \item practical guidance for when you might select the
220 implementation over others?
221 \end{itemize}
222 These are usually components of any reasonable journal article.
223 \textit{(Q: have you actually read an R package that wasn't yours?)}
224 \end{frame}
226 \begin{frame}{Exercise left to the reader!}
228 % (aside: I have been looking at the \textbf{stats} and \textbf{lme4}
229 % packages recently -- \textit{for me}, very clear numerically, much
230 % less so statistically)
231 \end{frame}
233 \section{Context}
235 \begin{frame}{Point of Examples}
236 \begin{itemize}
237 \item Few statistical concepts are ``computable'' with existing systems.
239 \item Some of this work is computable, let the computer do it.
241 \item There is little point in having people re-do basics
243 \item Computing environments for statistical work have been stable
244 for far too long, and limit the development and implementation of
245 better, more efficient, and more appropriate methods by allowing
246 people to be lazy (i.e. classic example of people publishing
247 papers on changes which are very minimal from a
248 methodological/theoretical view, but very difficult from an
249 implementation/practical view).
250 \end{itemize}
251 \end{frame}
253 \begin{frame}{Current State}
255 The current approaches to how statisticians interface with computers
256 to perform data analysis can be put into 2 camps
257 \begin{enumerate}
258 \item GUI with a spreadsheet (Excel/Minitab/Rcmdr without the
259 script)
260 \item applications programming
261 \end{enumerate}
262 There are different levels of sophistication, and some merging.
263 \end{frame}
265 \begin{frame}{Issues which arise when computing...}
266 \begin{enumerate}
267 \item relevant substantive issues (causality, variable independence,
268 design issues such as sampling strategies) not incorporated.
269 \item irrelevant substantive issues (coding, wide vs. long
270 collection, other non-statistical data management issues) become
271 statistically-relevant.
272 \item little support for encoding theoretical considerations (``expert
273 systems'' for guidance). Must be hard-coded in and hard-coded
274 away (``stars for p-values as evil''). Nearly impossible to
275 construct and apply philosophical opinions to ensure appropriate
276 use (and training through use) of singular or personalized
277 mixtures of statistical philosophies when doing data analysis (or
278 methodological development, or theoretical development).
279 \end{enumerate}
280 \end{frame}
282 \begin{frame}{Problem Statement}
284 How can statistical computing environments support appropriate use
285 of statistical concepts (algorithmic, knowledge-centric,
286 knowledge-managing, philosophical discipline), so that the computing
287 structure doesn't rely on data-munging or numerical skill?
289 \end{frame}
292 \begin{frame}{Goals for this Talk}{(define, strategic approach,
293 justify)}
295 \begin{itemize}
296 \item To describe the concept of \alert{computable statistics},
297 placing it in a historical context.
299 \item To demonstrate that \alert{a research program}
300 implemented through simple steps can increase the efficiency of
301 statistical computing approaches by clearly describing both:
302 \begin{itemize}
303 \item numerical characteristics of procedures,
304 \item statistical concepts driving them.
305 \end{itemize}
307 \item To justify that the \alert{approach is worthwhile} and
308 represents a staged effort towards \alert{increased use of best
309 practices} and efficient tech transfer of modern statistical
310 theory (i.e. why must we wait 10 years for Robins' estimation
311 approaches?)
312 \end{itemize}
313 (unfortunately, the last is still incomplete)
314 \end{frame}
316 \section{CLS Works?}
317 \label{sec:work}
319 \begin{frame}{Is it Vaporware? Not quite}
320 The following is possible with the help of the open source Common Lisp
321 community, who provided most of the packages, tools, and glue.
322 (Tamas Papp, Raymond Toy, Mark Hoemmen, and many, many others).
323 Most of the underlying code was written by others, and ``composed''
324 by me.
325 \end{frame}
327 \subsection{Graphics}
328 \label{sec:work:graphics}
330 \begin{frame}{Silly Visualization Example}
331 \includegraphics[width=3in,height=3in]{./test1.png}
332 \end{frame}
334 \begin{frame}[fragile]{How?}
335 \begin{verbatim}
336 (defparameter *frame2*
337 (as-frame (create-xlib-image-context 200 200)
338 :background-color +white+))
339 (bind ((#2A((f1 f2) (f3 f4))
340 (split-frame *frame2*
341 (percent 50)
342 (percent 50))))
343 (defparameter *f1* f1) ; lower left
344 (defparameter *f2* f2) ; lower right f3 f4
345 (defparameter *f3* f3) ; top left f1 f2
346 (defparameter *f4* f4)); top right
347 \end{verbatim}
348 \end{frame}
350 \begin{frame}[fragile]{Functions to Plot}
351 \begin{verbatim}
352 (plot-function *f1* #'sin
353 (interval-of 0 2)
354 :x-title "x" :y-title "sin(x)")
355 (plot-function *f2* #'cos (interval-of 0 2)
356 :x-title "x" :y-title "cos(x)")
357 (plot-function *f3* #'tan (interval-of 0 2)
358 :x-title "x" :y-title "tan(x)")
359 \end{verbatim}
360 \end{frame}
362 \begin{frame}[fragile]{Things to Plot}
363 \small{
364 \begin{verbatim}
365 (let* ((n 500)
366 (xs (num-sequence
367 :from 0 :to 10 :length n))
368 (ys (map 'vector
369 #'(lambda (x) (+ x 8 (random 4.0)))
370 xs))
371 (weights
372 (replicate #'(lambda () (1+ (random 10)))
373 n 'fixnum))
374 (da (plot-simple *f4*
375 (interval-of 0 10)
376 (interval-of 10 20)
377 :x-title "x" :y-title "y")))
378 (draw-symbols da xs ys :weights weights))
379 \end{verbatim}
381 \end{frame}
383 \begin{frame}[fragile]{Copying existing graphics}
384 And we generated the figure on the first page by:
385 \begin{verbatim}
386 (xlib-image-context-to-png
387 (context *f1*)
388 "/home/tony/test1.png")
389 \end{verbatim}
390 \end{frame}
392 \subsection{Statistical Models}
393 \label{sec:work:statmod}
395 \begin{frame}[fragile]{Linear Regression}
396 \small{
397 \begin{verbatim}
398 ;; Worse than LispStat, wrapping LAPACK's dgelsy:
399 (defparameter *result1*
400 (lm (list->vector-like iron)
401 (list->vector-like absorbtion)))
402 *result1* =>
403 ((#<LA-SIMPLE-VECTOR-DOUBLE (2 x 1)
404 -11.504913191235342
405 0.23525771181009483>
408 #<LA-SIMPLE-MATRIX-DOUBLE 2 x 2
409 9.730392177126686e-6 -0.001513787114206932
410 -0.001513787114206932 0.30357851215706255>
412 13 2)
413 \end{verbatim}
415 \end{frame}
417 \subsection{Data Manip/Mgmt}
418 \label{sec:work:data}
420 \begin{frame}[fragile]{DataFrames}
421 \small{
422 \begin{verbatim}
423 (defparameter *my-df-1*
424 (make-instance 'dataframe-array
425 :storage #2A((1 2 3 4 5) (10 20 30 40 50))
426 :doc "This is a boring dataframe-array"
427 :case-labels (list "x" "y")
428 :var-labels (list "a" "b" "c" "d" "e")))
430 (xref *my-df-1* 0 0) ; API change in progress
432 (setf (xref *my-df-1* 0 0) -1d0)
433 \end{verbatim}
435 \end{frame}
437 \begin{frame}[fragile]{Numerical Matrices}
438 \small{
439 \begin{verbatim}
440 (defparameter *mat-1*
441 (make-matrix 3 3
442 :initial-contents #2A((2d0 3d0 -4d0)
443 (3d0 2d0 -4d0)
444 (4d0 4d0 -5d0))))
446 (xref *mat-1* 2 0) ; => 4d0 ; API change
447 (setf (xref *mat-1* 2 0) -4d0)
449 (defparameter *xv*
450 (make-vector 4 :type :row
451 :initial-contents '((1d0 3d0 2d0 4d0))))
452 \end{verbatim}
454 \end{frame}
456 \begin{frame}[fragile]{Macros make the above tolerable}
457 \begin{verbatim}
458 (defparameter *xv*
459 (make-vector 4 :type :row
460 :initial-contents '((1d0 3d0 2d0 4d0))))
462 ; can use defmacro for the following syntax =>
464 (make-row-vector *xv* '((1d0 3d0 2d0 4d0)))
466 ; or reader macros for the following:
467 #mrv(*xv* '((1d0 3d0 2d0 4d0)))
468 \end{verbatim}
469 \end{frame}
471 \section{Common Lisp Statistics}
472 \label{sec:CLS}
474 \begin{frame}{Why CLS?}
475 \begin{itemize}
476 \item a component-based structure for statistical computing,
477 allowing for small and specific specification.
478 \item a means to drive philosophically customized data analysis, and
479 enforce a structure to allow simple comparisons between
480 methodologies.
481 \item This is a ``customization'' through packages to support
482 statistical computing, not an independent language. ``Ala Carte'',
483 not ``Menu''.
484 \end{itemize}
485 \end{frame}
487 \subsection{Implementation Plans}
488 \label{sec:CLS:impl}
491 \begin{frame}{Current Functionality}
492 \begin{itemize}
493 \item basic dataframes (similar to R); subsetting API under
494 development.
495 \item Basic regression (similar to XLispStat)
496 \item matrix storage both in foreign and lisp-centric areas.
497 \item LAPACK (small percentage, increasing), working with both
498 matrix storage types
499 \item static graphics (X11) including preliminary grid functionality based
500 on CAIRO. Generation of PNG files from graphics windows.
501 \item CSV file support
502 \item Common Lisp!
503 \end{itemize}
504 \end{frame}
506 \begin{frame}[fragile]{Computational Environment Supported}
507 \begin{itemize}
508 \item works on Linux, with recent SBCL versions
509 \item Definitely works on bleeding edge Debian (unstable).
510 \item Has worked for weak definitions of ``work'' on 4 different
511 people's computers (not quite, but sort of requires a
512 \verb+/home/tony/+ !)
513 \end{itemize}
514 \end{frame}
516 \begin{frame}{Goals}
517 Short Term
518 \begin{itemize}
519 \item Better integration of data structures with statistical routines
520 (auto-handling with dataframes, rather than manual parsing).
521 \item dataframe to model-matrix tools (leveraging old XlispStat GEE
522 package)
523 \end{itemize}
524 Medium/Long Term
525 \begin{itemize}
526 \item Support for other Common Lisps
527 \item Cleaner front-end API to matrices and numerical algorithms
528 \item constraint system for different statistical algorithm
529 development, as well as for interactive GUIs and graphics
530 \item LispStat compatible (object system in-progress, GUI to do)
531 \item Integrated invisible parallelization when more efficient
532 (multicore, threading, and user-space systems)
533 \end{itemize}
534 \end{frame}
537 \section{Vaporware?}
539 \begin{frame}[fragile]{What does NOT work?}
540 Primarily, the reason that we are doing this:
542 \textbf{Computable and Executable Statistics}
544 but consider XML:
545 \begin{verbatim}
546 <car brand="honda" engine="4cyl">accord</car>
547 \end{verbatim}
548 becomes
549 \begin{verbatim}
550 ; data follows keywords...
551 (car :brand 'honda :engine "4cyl" accord)
552 \end{verbatim}
553 \end{frame}
555 \section{Discussion}
560 \begin{frame}{Why use Common Lisp?}
561 \begin{itemize}
562 \item Parens provide clear delineation of a \textbf{Complete
563 Thought} (functional programming with side effects).
564 \item Lisp-2 (symbols represent a different function and variable)
565 \item ANSI standard (built by committee, but the committee was
566 reasonably smart)
567 \item Many implementations
568 \item Most implementations are interactive \textbf{compiled}
569 languages (few are interpreted, nearly all byte-compiled).
570 \item The Original \emph{Programming with Data} Language
571 (\emph{Programs are Data} and \emph{Data are Executable} apply).
572 \item advanced, powerful, first-class macros (macros functionally
573 re-write code, allowing for structural clarity and complete
574 destruction of syntax, should that be reasonable)
575 \end{itemize}
576 \end{frame}
578 \begin{frame}{Available Common Lisp Packages}
579 (They are packages and called packages, not libraries. Some people
580 can rejoice!)
581 \begin{itemize}
582 \item infrastructure \emph{enhancements}: infix-notation, data
583 structures, control and flow structures
584 \item numerics, graphics, GUIs,
585 \item primitive R to CL compiler (which could also be considered an
586 object-code compiler for R); 3 interfaces which embed R within CL.
587 \item Web 2.0 support and TeX-like reporting facilities for PDF
588 output.
589 \end{itemize}
590 See \url{http://www.common-lisp.net/} and
591 \url{http://www.cliki.org/}. CLS sources can be found on
592 \url{http://github.com/blindglobe/}
593 \end{frame}
595 \begin{frame}{Why not use R?}
596 \begin{itemize}
597 \item the R programming language is incomplete and under constant
598 redefinition. Common Lisp is standardized (for many years), with
599 many implementations
600 \item R isn't compiled and application delivery can be tough (package
601 delivery is mostly solved)
602 \item Without parens, Common Lisp could be R (interactive, or batch,
603 or through ``compiled applications'').
604 \item R is the Microsoft of statistical computing.
605 \item many ``warts'' that R has, can't be fixed due to sizeable user
606 populations or heavy-weight vested interests.
607 \item Evolutionary development requires strawmen to use as a basis
608 for development.
609 \end{itemize}
611 \end{frame}
613 \begin{frame}{Conclusion}
615 This slowly developing research program aims at a statistical
616 computing system which enables sophisticated statistical research
617 that can be readily transferred to applications and is supportable.
619 Related numerical/statistical projects:
620 \begin{itemize}
621 \item Incanter : R/LispStat/Omegahat-like system for Clojure (Lisp
622 on the JVM)
623 \item FEMLisp : system/workshop for finite-element analysis modeling
624 using Lisp
625 \item matlisp/LispLab : LAPACK-based numerical linear algebra packages
626 \item GSLL : GNU Scientific Library, Lisp interface.
627 \item RCL, RCLG, CLSR (embedding R within Common Lisp)
628 \end{itemize}
629 \end{frame}
631 \begin{frame}{What can you do to follow up?}
633 \begin{itemize}
634 \item Read: Introduction to Common Lisp: Paul Graham's ANSI Common Lisp,
635 enjoyable book with boring title, best intro to S4 classes
636 around. Practical Common Lisp, by Peter Seibel
637 \item Consider: how a computing environment could better support
638 features in the research you do (event-time data, design,
639 longitudinal data modeling, missing and coarsened data, multiple
640 comparisons, feature selection).
641 \end{itemize}
642 The next stage of reproducible research will require computable
643 statistics (code that explains itself and can be parsed to generate
644 knowledge about its claims; ``XML's promise'').
645 \end{frame}
647 \end{document}
649 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
653 \section{BACKUPS}
656 \section{Common Lisp}
658 \begin{frame}[fragile]{Finding out things}
659 \begin{itemize}
660 \item CL-NUMLIB
661 num-sequence :from LOW :to HIGH :length SEQ-LENGTH
662 seq(from,to,by/length)
663 \item
664 \begin{verbatim}
665 (documentation
666 'cl-numlib:num-sequence
667 'function)
668 \end{verbatim}
669 \item This
670 \end{itemize}
671 \end{frame}
674 \begin{frame}{Historical Computing Languages}
675 \begin{itemize}
676 \item FORTRAN : FORmula TRANslator. Original numerical computing
677 language, designed for clean implementation of numerical
678 algorithms
679 \item LISP : LISt Processor. Associated with symbolic
680 manipulation, AI, and knowledge approaches
681 \end{itemize}
683 They represent the 2 generalized needs of statistical computing,
684 which could be summarized as
685 \begin{itemize}
686 \item algorithms/numerics,
687 \item elicitation, communication, and generation of knowledge (``data
688 analysis'')
689 \end{itemize}
690 \end{frame}
692 \begin{frame}{Statistical Computing Environments}
694 Past:
695 \begin{itemize}
696 \item SPSS / BMDP / SAS
697 \item S ( S, S-PLUS, R)
698 \item LispStat ( XLispStat, ViSta, ARC , CommonLispStat ) ; QUAIL
699 \item XGobi (Orca / GGobi / Statistical Reality Engine)
700 \item MiniTab
701 \item Stata
702 \item DataDesk
703 \item Augsburg Impressionist series (MANET, \ldots)
704 \item Excel
705 \end{itemize}
706 many others...
708 \end{frame}
710 \begin{frame}{How many are left?}
712 \begin{itemize}
713 \item R
714 \item SAS
715 \item SPSS
716 \item Stata
717 \item Minitab
718 \item very few others...
719 \end{itemize}
720 ``R is the Microsoft of the statistical computing world'' -- anonymous.
721 \end{frame}
723 \begin{frame}{Selection Pressure}
724 \begin{itemize}
725 \item the R user population is growing rapidly, fueled by critical
726 mass, quality, and value
727 \item R is a great system for applied data analysis
728 \item R is not such a great system for research into statistical
729 computing (backwards compatibility, inertia due to user population)
730 \end{itemize}
731 There is a need for alternative experiments for developing new
732 approaches/ideas/concepts.
733 \end{frame}
735 \begin{frame}{Philosophically, why Common Lisp?}
736 Philosophically:
737 \begin{itemize}
738 \item Lisp can cleanly present computational intentions, both
739 symbolically and numerically.
740 \item Semantics and context are important: well supported by Lisp
741 paradigms.
742 \item Lisp's parentheses describe singular, multi-scale,
743 \alert{complete thoughts}.
744 \end{itemize}
746 \end{frame}
748 \begin{frame}{Technically, why Common Lisp?}
749 \begin{itemize}
750 \item interactive COMPILED language (``R with a compiler'')
751 \item CLOS is R's S4 object system ``done right''.
752 \item clean semantics: modality, typing, can be expressed the way
753 one wants it.
754 \item programs are data, data are programs, leading to
755 \item Most modern computing tools available (XML, WWW technologies)
756 \item ``executable XML''
757 \end{itemize}
758 Common Lisp is very close in usage to how people currently use R
759 (mostly interactive, some batch, and a wish for compilation efficiency).
760 \end{frame}
762 \subsection{Background}
764 \begin{frame}
765 \frametitle{Desire: Semantics and Statistics}
766 \begin{itemize}
767 \item The semantic web (content which is self-descriptive) is an
768 interesting and potentially useful idea.
770 \item
771 Biological informatics support (GO, Entrez) has allowed for
772 precise definitions of concepts in biology.
774 \item It is a shame that a field like statistics, requiring such
775 precision, has less such support than an imprecise and temporally unstable
776 field such as biology\ldots
777 \end{itemize}
779 How can we express statistical work (research, applied work) which
780 is both human and computer readable (perhaps subject to
781 transformations first)?
782 \end{frame}
785 % \subsection{Context}
787 % \begin{frame}{Context}{(where I'm coming from, my ``priors'')}
788 % \begin{itemize}
789 % \item Pharmaceutical Industry
790 % \item Modeling and Simulation uses mathematical models/constructs to
791 % record beliefs (biology, pharmacology, clinical science) for
792 % explication, clinical team alignment, decision support, and
793 % quality.
794 % \item My work at Novartis is at the intersection of biomedical
795 % informatics, statistics, and mathematical modeling.
796 % \item As manager: I need a mix of applications and novel research development to
797 % solve our challenges better, faster, more efficiently.
798 % \item Data analysis is a specialized approach to computer
799 % programming, \alert{different} than applications programming or
800 % systems programming.
801 % \end{itemize}
802 % \end{frame}
805 \subsection{Literate Programming is insufficient}
807 \begin{frame}{Literate Statistical Practice.}
808 \begin{enumerate}
809 \item Literate Programming applied to data analysis (Rossini, 1997/2001)
810 \item among the \alert{most annoying} techniques to integrate into
811 work-flow if one is not perfectly methodological.
812 \item Some tools:
813 \begin{itemize}
814 \item ESS: supports interactive creation of literate programs.
815 \item Sweave: tool which exemplifies reporting context; odfWeave
816 primarily simplifies reporting.
817 \item Roxygen: primarily supports a literate programming
818 documentation style, not a literate data analysis programming
819 style.
820 \end{itemize}
821 \item ROI demonstrated in specialized cases: BioConductor.
822 \item \alert{usually done after the fact} (final step of work-flow)
823 as a documentation/computational reproducibility technique, rarely
824 integrated into work-flow.
825 \end{enumerate}
826 Many contributors:
827 Knuth, Claerbout, Carey, de Leeuw, Leisch, Gentleman, Temple-Lang,
828 \ldots{}
829 \end{frame}
831 \begin{frame}
832 \frametitle{Literate Programming}
833 \framesubtitle{Why isn't it enough for Data Analysis?}
835 Only 2 contexts: (executable) code and documentation. Fine for
836 application programming, but for data analysis, we could benefit
837 from:
838 \begin{itemize}
839 \item classification of statistical procedures
840 \item descriptions of assumptions
841 \item pragmatic recommendations
842 \item inheritance of structure through the work-flow of a
843 statistical methodology or data analysis project
844 \item datasets and metadata
845 \end{itemize}
846 Concept: ontologies describing mathematical assumptions, applications
847 of methods, work-flow, and statistical data structures can enable
848 machine communication.
850 (i.e. informatics framework ala biology)
851 \end{frame}
854 \begin{frame}{Communication in Statistical Practice}{\ldots is essential for \ldots}
855 \begin{itemize}
856 \item finding
857 \item explanations
858 \item agreement
859 \item receiving information
860 \end{itemize}
861 \alert{``machine-readable'' communication/computation lets the
862 computer help} \\
863 Semantic Web is about ``machine-enabled computability''.
864 \end{frame}
866 \begin{frame} \frametitle{Semantics}
867 \framesubtitle{One definition: description and context}
869 Interoperability is the key, with respect to
870 \begin{itemize}
871 \item ``Finding things''
872 \item Applications and activities with related functionality
873 \begin{itemize}
874 \item moving information from one state to another (paper, journal
875 article, computer program)
876 \item computer programs which implement solutions to similar tasks
877 \end{itemize}
878 \end{itemize}
879 \end{frame}
882 \begin{frame}{Statistical Practice is somewhat restricted}
883 {...but in a good sense, enabling potential for semantics...}
885 There is a restrictable set of intended actions for what can be done
886 -- the critical goal is to be able to make a difference by
887 accelerating activities that should be ``computable'':
888 \begin{itemize}
889 \item restricted natural language processing
890 \item mathematical translation
891 \item common description of activities for simpler programming/data
892 analysis (S approach to objects and methods)
893 \end{itemize}
894 R is a good basic start (model formulation approach, simple
895 ``programming with data'' paradigm); we should see if we can do
896 better!
897 \end{frame}
899 \begin{frame}{Computable and Executable Statistics requires}
901 \begin{itemize}
902 \item approaches to describe data and metadata (``data'')
903 \begin{itemize}
904 \item semantic WWW
905 \item metadata management and integration, driving
906 \item data integration
907 \end{itemize}
908 \item approaches to describe data analysis methods (``models'')
909 \begin{itemize}
910 \item quantitatively: many ontologies (AMS, etc), few meeting
911 statistical needs.
912 \item many substantive fields have implementations
913 (bioinformatics, etc) but not well focused.
914 \end{itemize}
915 \item approaches to describe the specific form of interaction
916 (``instances of models'')
917 \begin{itemize}
918 \item Original idea behind ``Literate Statistical Analysis''.
919 \item That idea is suboptimal, more structure needed (not
920 necessarily built upon existing...).
921 \end{itemize}
922 \end{itemize}
923 \end{frame}
925 \subsection{Common Lisp Statistics}
927 \begin{frame}
928 \frametitle{Interactive Programming}
929 \framesubtitle{Everything goes back to being Lisp-like}
930 \begin{itemize}
931 \item Interactive programming (as originating with Lisp): works
932 extremely well for data analysis (Lisp being the original
933 ``programming with data'' language).
934 \item Theories/methods for how to do this are reflected in styles
935 for using R.
936 \end{itemize}
937 \end{frame}
939 \begin{frame}[fragile]
940 \frametitle{Lisp}
942 Lisp (LISt Processor) is different than most high-level computing
943 languages, and is very old (1958). Lisp is built on lists of things
944 which are evaluatable.
945 \begin{verbatim}
946 (functionName data1 data2 data3)
947 \end{verbatim}
948 or ``quoted'':
949 \begin{verbatim}
950 '(functionName data1 data2 data3)
951 \end{verbatim}
952 which is shorthand for
953 \begin{verbatim}
954 (list functionName data1 data2 data3)
955 \end{verbatim}
956 The difference is important -- lists of data (the second/third) are
957 not (yet?!) functions applied to (unencapsulated lists of) data (the first).
958 \end{frame}
960 \begin{frame}
961 \frametitle{Features}
962 \begin{itemize}
963 \item Data and Functions semantically the same
964 \item Natural interactive use through functional programming with
965 side effects
966 \item Batch is a simplification of interactive -- not a special mode!
967 \end{itemize}
968 \end{frame}
972 \begin{frame}[fragile]{Representation: XML and Lisp}{executing your data}
973 Many people are familiar with XML:
974 \begin{verbatim}
975 <name phone="+41793674557">Tony Rossini</name>
976 \end{verbatim}
977 which is shorter in Lisp:
978 \begin{verbatim}
979 (name "Tony Rossini" :phone "+41793674557")
980 \end{verbatim}
981 \begin{itemize}
982 \item Lisp ``parens'', universally hated by unbelievers, are
983 wonderful for denoting when a ``concept is complete''.
984 \item Why can't your data self-execute?
985 \end{itemize}
986 \end{frame}
988 \begin{frame}[fragile]{Numerics with Lisp}
989 \begin{itemize}
990 \item addition of rational numbers and arithmetic
991 \item example for mean
992 \begin{verbatim}
993 (defun mean (x)
994 (check-type x vector-like)
995 (/ (loop for i from 0 to (- (nelts x) 1)
996 summing (vref x i))
997 (nelts x)))
998 \end{verbatim}
999 \item example for variance
1000 \begin{verbatim}
1001 (defun variance (x)
1002 (let ((meanx (mean x))
1003 (nm1 (1- (nelts x))))
1004 (/ (loop for i from 0 to nm1
1005 summing (power (- (vref x i) meanx) 2))
1006 nm1)))
1007 \end{verbatim}
1008 \item But through macros, \verb+(vref x i)+ could be
1009 \verb+#V(X[i])+ or your favorite syntax.
1010 \end{itemize}
1012 \end{frame}
1015 \begin{frame}{Common Lisp Statistics 1}
1016 \begin{itemize}
1017 \item Originally based on LispStat (reusability)
1018 \item Re-factored structure (some numerics worked with a 1990-era code base).
1019 \item Current activities:
1020 \begin{enumerate}
1021 \item numerics redone using CFFI-based BLAS/LAPACK (cl-blapack)
1022 \item matrix interface based on MatLisp
1023 \item starting design of a user interface system (interfaces,
1024 visuals).
1025 \item general framework for model specification (regression,
1026 likelihood, ODEs)
1027 \item general framework for algorithm specification (bootstrap,
1028 MLE, algorithmic data analysis methods).
1029 \end{enumerate}
1030 \end{itemize}
1031 \end{frame}
1033 \begin{frame}{Common Lisp Statistics 2}
1035 \begin{itemize}
1036 \item Implemented using SBCL. Contributed fixes for
1037 Clozure/OpenMCL. Goal to target CLISP
1038 \item Supports LispStat prototype object system
1039 \item Package-based design -- only use the components you need, or
1040 the components whose API you like.
1041 \end{itemize}
1042 \end{frame}
1044 \section{Discussion}
1046 \begin{frame}
1047 \frametitle{Outlook}
1048 \begin{itemize}
1049 \item Semantics and Computability have captured a great deal of
1050 attention in the informatics and business computing R\&D worlds
1051 \item Statistically-driven Decision Making and Knowledge Discovery
1052 is, with high likelihood, the next challenging stage after data
1053 integration.
1054 \item Statistical practice (theory and application) can be enhanced,
1055 made more efficient, providing increased benefit to organizations
1056 and groups using appropriate methods.
1057 \item Lisp, as a language, shares characteristics of both Latin
1058 (difficult dead language useful for classical training) and German
1059 (difficult living language useful for general life). Of course,
1060 for some people, they are not difficult.
1061 \end{itemize}
1063 \end{frame}
1065 \begin{frame}
1066 The research program described in this talk is currently driving the
1067 design of CommonLisp Stat, which leverages concepts and approaches
1068 from the dead and moribund LispStat project.
1070 \begin{itemize}
1071 \item \url{http://repo.or.cz/w/CommonLispStat.git/}
1072 \item \url{http://www.github.com/blindglobe/}
1073 \end{itemize}
1075 \end{frame}
1076 \begin{frame}{Final Comment}
1078 \begin{itemize}
1079 \item In the Pharma industry, it is all about getting the right
1080 drugs to the patient faster. Data analysis systems seriously
1081 impact this process, being potentially an impediment or an
1082 accelerator.
1084 \begin{itemize}
1085 \item \alert{Information technologies can increase the efficiency
1086 of statistical practice}, though innovation change management
1087 must be taken into account. (i.e. Statistical practice, while
1088 considered by some an ``art form'', can benefit from
1089 industrialization).
1090 \item \alert{Lisp's features match the basic requirements we need}
1091 (dichotomy: programs as data, data as programs). Sales pitch,
1092 though...
1093 \item Outlook: Lots of work and experimentation to do!
1094 \end{itemize}
1095 \end{itemize}
1096 \end{frame}
1099 % % All of the following is optional and typically not needed.
1100 % \appendix
1103 % \section<presentation>*{\appendixname}
1106 % \begin{frame} \frametitle{Complements and Backup}
1107 % No more, stop here. Questions? (now or later).
1108 % \end{frame}
1110 % \begin{frame}{The Industrial Challenge.}{Getting the Consulting Right.}
1111 % % - A title should summarize the slide in an understandable fashion
1112 % % for anyone who does not follow everything on the slide itself.
1114 % \begin{itemize}
1115 % \item Recording assumptions for the next data analyst, reviewer.
1116 % Use \texttt{itemize} a lot.
1117 % \item
1118 % Use very short sentences or short phrases.
1119 % \end{itemize}
1120 % \end{frame}
1123 % \begin{frame}{The Industrial Challenge.}{Getting the Right Research Fast.}
1124 % % - A title should summarize the slide in an understandable fashion
1125 % % for anyone who does not follow everything on the slide itself.
1127 % \begin{itemize}
1128 % \item
1129 % Use \texttt{itemize} a lot.
1130 % \item
1131 % Use very short sentences or short phrases.
1132 % \end{itemize}
1133 % \end{frame}
1136 % \begin{frame}{Explicating the Work-flow}{QA/QC-based improvements.}
1139 % \end{frame}
1141 % \section{Motivation}
1143 % \subsection{IT Can Speed up Deliverables in Statistical Practice}
1145 % \begin{frame}{Our Generic Work-flow and Life-cycle}
1146 % {describing most data analytic activities}
1147 % Workflow:
1148 % \begin{enumerate}
1149 % \item Scope out the problem
1150 % \item Sketch out a potential solution
1151 % \item Implement until road-blocks appear
1152 % \item Deliver results
1153 % \end{enumerate}
1155 % Lifecycle:
1156 % \begin{enumerate}
1157 % \item paper sketch
1158 % \item 1st e-draft of text/code/data (iterate to \#1, discarding)
1159 % \item cycle through work
1160 % \item publish
1161 % \item ``throw-away''
1162 % \end{enumerate}
1163 % but there is valuable information that could enable the next
1164 % generation!
1165 % \end{frame}
1167 % \begin{frame}[fragile]{Paper $\rightarrow$ Computer $\rightarrow$ Article $\rightarrow$ Computer}{Cut and Paste makes for large errors.}
1168 % \begin{itemize}
1169 % \item Problems in a regulatory setting
1170 % \item Regulatory issues are just ``best practices''
1171 % \end{itemize}
1173 % Why do we ``copy/paste'', or analogously, restart our work?
1175 % pro:
1176 % \begin{itemize}
1177 % \item every time we repeat, we reinforce the idea in our brain
1178 % \item review of ideas can help improve them
1179 % \end{itemize}
1180 % con:
1181 % \begin{itemize}
1182 % \item inefficiency
1183 % \item introduction of mistakes
1184 % \item loss of historical context
1185 % \item changes to earlier work (on a different development branch)
1186 % can not propagate.
1187 % \end{itemize}
1188 % \end{frame}
1190 % \section{Semantics and Statistical Practice}
1193 % \begin{frame}
1194 % \frametitle{Statistical Activity Leads to Reports}
1195 % \framesubtitle{You read what you know, do you understand it?}
1197 % How can we improve the communication of the ideas we have?
1199 % Precision of communication?
1201 % \end{frame}
1205 % \begin{frame} \frametitle{Communication Requires Context}
1206 % \framesubtitle{Intentions imply more than one might like...}
1208 % \begin{itemize}
1209 % \item Consideration of what we might do
1210 % \item Applications with related functionality
1211 % \end{itemize}
1212 % \end{frame}
1216 % \begin{frame}
1217 % \frametitle{Design Patterns}
1218 % \framesubtitle{Supporting Work-flow Transitions}
1220 % (joint work with H Wickham): The point of this research program is
1221 % not to describe what to do at any particular stage of work, but to
1222 % encourage researchers and practitioners to consider how to
1223 % translate and transfer information between stages so that work
1224 % is not lost.
1226 % Examples of stages in a work-flow:
1227 % \begin{itemize}
1228 % \item planning, execution, reporting;
1229 % \item scoping, illustrative examples or counter examples, algorithmic construction,
1230 % article writing.
1231 % \item descriptive statistics, preliminary inferential analysis,
1232 % model/assumption checking, final inferential analysis,
1233 % communication of scientific results
1234 % \end{itemize}
1235 % Description of work-flows is essential to initiating discussions on
1236 % quality/efficiency of approaches to work.
1237 % \end{frame}
1239 % \section{Design Challenges}
1241 % \begin{frame}
1242 % \frametitle{Activities are enhanced by support}
1244 % \begin{itemize}
1245 % \item Mathematical manipulation can be enhanced by symbolic
1246 % computation
1247 % \item Statistical programming can be enabled by examples and related
1248 % algorithm implementation
1249 % \item Datasets, to a limited extent, can self-describe.
1250 % \end{itemize}
1251 % \end{frame}
1253 % \begin{frame}
1254 % \frametitle{Executable and Computable Science}
1256 % Use of algorithms and construction to describe how things work.
1258 % Support for agent-based approaches
1259 % \end{frame}
1262 % \begin{frame}
1263 % \frametitle{What is Data? Metadata?}
1265 % Data: what we've observed
1267 % MetaData: context for observations, enables semantics.
1268 % \end{frame}
1273 % % \begin{frame}[fragile]
1274 % % \frametitle{Defining Variables}
1275 % % \framesubtitle{Setting variables}
1276 % % \begin{verbatim}
1277 % % (setq <variable> <value>)
1278 % % \end{verbatim}
1279 % % Example:
1280 % % \begin{verbatim}
1281 % % (setq ess-source-directory
1282 % % "/home/rossini/R-src")
1283 % % \end{verbatim}
1284 % % \end{frame}
1286 % % \begin{frame}[fragile]
1287 % % \frametitle{Defining on the fly}
1288 % % \begin{verbatim}
1289 % % (setq ess-source-directory
1290 % % (lambda () (file-name-as-directory
1291 % % (expand-file-name
1292 % % (concat (default-directory)
1293 % % ess-suffix "-src")))))
1294 % % \end{verbatim}
1295 % % (Lambda-expressions are anonymous functions, i.e. ``instant-functions'')
1296 % % \end{frame}
1299 % % \begin{frame}[fragile]
1300 % % \frametitle{Function Reuse}
1301 % % By naming the function, we could make the previous example reusable
1302 % % (if possible):
1303 % % \begin{verbatim}
1304 % % (defun my-src-directory ()
1305 % % (file-name-as-directory
1306 % % (expand-file-name
1307 % % (concat (default-directory)
1308 % % ess-suffix "-src"))))
1309 % % \end{verbatim}
1310 % % Example:
1311 % % \begin{verbatim}
1312 % % (setq ess-source-directory (my-src-directory))
1313 % % \end{verbatim}
1314 % % \end{frame}
1317 % % \begin{frame}
1318 % % \frametitle{Equality Among Packages}
1319 % % \begin{itemize}
1320 % % \item more/less equal can be described specifically through
1321 % % overriding imports.
1322 % % \end{itemize}
1323 % % \end{frame}
1326 % \subsection<presentation>*{For Further Reading}
1328 % \begin{frame}[allowframebreaks]
1329 % \frametitle<presentation>{Related Material}
1331 % \begin{thebibliography}{10}
1333 % \beamertemplatebookbibitems
1334 % % Start with overview books.
1336 % \bibitem{LispStat1990}
1337 % L.~Tierney
1338 % \newblock {\em LispStat}.
1340 % \beamertemplatearticlebibitems
1341 % % Followed by interesting articles. Keep the list short.
1343 % \bibitem{Rossini2001}
1344 % AJ.~Rossini
1345 % \newblock Literate Statistical Practice
1346 % \newblock {\em Proceedings of the Conference on Distributed
1347 % Statistical Computing}, 2001.
1349 % \bibitem{RossiniLeisch2003}
1350 % AJ.~Rossini and F.~Leisch
1351 % \newblock Literate Statistical Practice
1352 % \newblock {\em Technical Report Series, University of Washington
1353 % Department of Biostatistics}, 2003.
1355 % \beamertemplatearrowbibitems
1356 % % Followed by interesting articles. Keep the list short.
1358 % \bibitem{CLS}
1359 % Common Lisp Stat, 2008.
1360 % \newblock \url{http://repo.or.cz/CommonLispStat.git/}
1362 % \end{thebibliography}
1363 % \end{frame}