fixed the Clojure project name.
[CommonLispStat.git] / Doc / talks / Rossini-DSC-July2009.tex
blob6413c748784cbfcf350ecb97aed53fb24d2d4073
1 \documentclass{beamer}
3 \mode<presentation>
5 \usetheme{classic}
6 \setbeamercovered{transparent}
9 \usepackage[english]{babel}
10 \usepackage[latin1]{inputenc}
11 \usepackage{times}
12 \usepackage[T1]{fontenc}
13 \usepackage{url}
15 \title[CLS]{Common Lisp Statistics}
16 \subtitle{Using History to design better data analysis environments}
17 \author[Rossini]{Anthony~(Tony)~Rossini}
19 \institute[Novartis and University of Washington]{
20 Group Head, Modeling and Simulation Statistics\\
21 Novartis Pharma AG, Switzerland
22 \and
23 Affiliate Assoc Prof, Biomedical and Health Informatics\\
24 University of Washington, USA}
26 \date[DSC2009]{DSC 2009, Copenhagen}
27 \subject{Statistical Computing Environments}
29 \begin{document}
31 \begin{frame}
32 \titlepage
33 \end{frame}
35 % Structuring a talk is a difficult task and the following structure
36 % may not be suitable. Here are some rules that apply for this
37 % solution:
39 % - Exactly two or three sections (other than the summary).
40 % - At *most* three subsections per section.
41 % - Talk about 30s to 2min per frame. So there should be between about
42 % 15 and 30 frames, all told.
44 % - A conference audience is likely to know very little of what you
45 % are going to talk about. So *simplify*!
46 % - In a 20min talk, getting the main ideas across is hard
47 % enough. Leave out details, even if it means being less precise than
48 % you think necessary.
49 % - If you omit details that are vital to the proof/implementation,
50 % just say so once. Everybody will be happy with that.
52 \section{What Works?}
53 \label{sec:work}
55 \begin{frame}{Is it Vaporware?}
57 Not quite...
58 \end{frame}
60 \subsection{Graphics}
61 \label{sec:work:graphics}
63 \begin{frame}{Silly Visualization Example}
64 \includegraphics[width=3in,height=3in]{/home/tony/test1.png}
65 \end{frame}
67 \begin{frame}[fragile]{Graphics Device}
68 \begin{verbatim}
69 (defparameter *frame2*
70 (as-frame (create-xlib-image-context 200 200)
71 :background-color +white+))
72 (bind ((#2A((f1 f2) (f3 f4))
73 (split-frame *frame2*
74 (percent 50)
75 (percent 50))))
76 (defparameter *f1* f1) ; lower left
77 (defparameter *f2* f2) ; lower right f3 f4
78 (defparameter *f3* f3) ; top left f1 f2
79 (defparameter *f4* f4)); top right
80 \end{verbatim}
81 \end{frame}
83 \begin{frame}[fragile]{Functions to Plot}
84 \begin{verbatim}
85 (plot-function *f1* #'sin
86 (interval-of 0 2)
87 :x-title "x" :y-title "sin(x)")
88 (plot-function *f2* #'cos (interval-of 0 2)
89 :x-title "x" :y-title "cos(x)")
90 (plot-function *f3* #'tan (interval-of 0 2)
91 :x-title "x" :y-title "tan(x)")
92 \end{verbatim}
93 \end{frame}
95 \begin{frame}[fragile]{Things to Plot}
96 \small{
97 \begin{verbatim}
98 (let* ((n 500)
99 (xs (num-sequence
100 :from 0 :to 10 :length n))
101 (ys (map 'vector
102 #'(lambda (x) (+ x 8 (random 4.0)))
103 xs))
104 (weights
105 (replicate #'(lambda () (1+ (random 10)))
106 n 'fixnum))
107 (da (plot-simple *f4*
108 (interval-of 0 10)
109 (interval-of 10 20)
110 :x-title "x" :y-title "y")))
111 (draw-symbols da xs ys :weights weights))
112 \end{verbatim}
113 }
114 \end{frame}
116 \begin{frame}[fragile]{Copying existing graphics}
117 And we generated the figure on the first page by:
118 \begin{verbatim}
119 (xlib-image-context-to-png
120 (context *f1*)
121 "/home/tony/test1.png")
122 \end{verbatim}
123 \end{frame}
125 \subsection{Statistical Models}
126 \label{sec:work:statmod}
128 \begin{frame}[fragile]{Linear Regression}
129 Primitive LispStat, a wrapper around LAPACK's \texttt{dgelsy}:
130 \small{
131 \begin{verbatim}
132 (defparameter *result1*
133 (regression-model
134 (list->vector-like iron)
135 (list->vector-like absorbtion)))
136 *result1* =>
137 \end{verbatim}
138 }
139 \end{frame}
141 \subsection{Numerical Descriptions}
142 \label{sec:work:numdesc}
144 \begin{frame}[fragile]{Descriptives}
145 (mean iron)
147 \end{frame}
149 \subsection{Data Manip/Mgmt}
150 \label{sec:work:data}
152 \begin{frame}[fragile]{DataFrames}
154 \end{frame}
156 \begin{frame}[fragile]{Numerical Matrices}
158 \end{frame}
160 \begin{frame}{Managing / Manipulating Data}
162 \end{frame}
165 \begin{frame}{Outline}
166 \tableofcontents
167 \end{frame}
169 \section{Common Lisp Statistics}
170 \label{sec:CLS}
172 \begin{frame}{Why CLS?}
173 \begin{itemize}
174 \item a component-based structure for statistical computing
175 \item Common Lisp provides a simple, \emph{primitive}, syntax
176 \item Common Lisp provides an amazing number of advanced features
177 that keep getting reinvented in other languages.
178 \item Common Lisp has linkages to many amazing features developed in
179 other languages.
180 \item ability to leverage non-statisticians interested in computing
181 technologies (compilers, protocols, interfaces, libraries,
182 functionality which can be reused for statistical purposes)
183 \item This is a ``customization'' through packages to support
184 statistical computing, not an independent language. ``\`A la Carte'',
185 not ``Menu''.
186 \end{itemize}
187 \end{frame}
189 \subsection{Implementation Plans}
190 \label{sec:CLS:impl}
193 \begin{frame}{Current Functionality}
194 \begin{itemize}
195 \item basic dataframes (similar to R); indexing/slicing API under
196 development.
197 \item Basic regression (similar to XLispStat)
198 \item matrix storage both in foreign and lisp-centric areas.
199 \item LAPACK (small percentage, increasing), working with both
200 matrix storage types
201 \item static graphics (X11) including preliminary grid functionality based
202 on CAIRO. Generation of PNG files from graphics windows.
203 \item CSV file support
204 \item Common Lisp!
205 \end{itemize}
206 \end{frame}
208 \begin{frame}[fragile]{Computational Environment Supported}
209 \begin{itemize}
210 \item Should work on Linux, with recent SBCL versions
211 \item Definitely works on bleeding edge Debian (unstable).
212 \item Has worked for weak definitions of ``work'' on 4 different
213 people's environments (not quite, but sort of requires a
214 \verb+/home/tony/+ !)
215 \item Threaded support on threaded lisps (SBCL/CCL, soon CLISP).
216 But not yet integrated.
217 \end{itemize}
218 \end{frame}
220 \begin{frame}{Goals}
221 Short Term
222 \begin{itemize}
223 \item Better integration of data structures with statistical routines
224 (auto-handling with dataframes, rather than manual parsing).
225 \end{itemize}
226 Medium/Long Term
227 \begin{itemize}
228 \item Support for CLISP (byte-compiled interpreted lisp) and Clozure
229 Common Lisp (formerly OpenMCL)
230 \item high-level Front-end API to a number of matrix and numerical
231 packages and numerical structures
232 \item constraint system for interactive GUIs and graphics
233 \item full LispStat compatibility (object system partially works;
234 GUI support coming).
235 \item Integrated threading via Bordeaux threads (portable CL API package).
236 \end{itemize}
237 \end{frame}
239 \subsection{Common Lisp}
240 \label{sec:CLS:lisp}
242 \begin{frame}{Common Lisp}
243 \begin{itemize}
244 \item Lisp-2 (symbols can denote both a separate function and a value)
245 \item ANSI standard (built by committee, but the committee was
246 reasonably smart)
247 \item Many implementations
248 \item Most implementations are interactive \textbf{compiled}
249 languages (few are interpreted, and those are usually
250 byte-compiled).
251 \item Parens provide clear delineation of a \textbf{Complete
252 Thought} (functional programming with side effects).
253 \item The Original \emph{Programming with Data} Language
254 (\emph{Programs are Data} and \emph{Data are Executable} also
255 apply).
256 \item advanced, powerful, first-class macros (macros functionally
257 re-write code)
258 \item
259 \end{itemize}
260 \end{frame}
262 \begin{frame}{Common Lisp Packages}
263 (They are packages and called packages, not libraries. Some people
264 can rejoice!)
265 \begin{itemize}
266 \item infrastructure enhancement: infix-notation, data structures,
267 control and flow structures
268 \item numerics, graphics, GUIs,
269 \item primitive R to CL compiler (which could also be considered an
270 object-code compiler for R); 3 interfaces which embed R within CL.
271 \item
272 \end{itemize}
273 See \url{http://www.common-lisp.net/} and
274 \url{http://www.cliki.org/}. CLS sources can be found on
275 \url{http://github.com/blindglobe/}
276 \end{frame}
278 \section{What else about CLS is still Vaporware?}
280 \begin{frame}{What does NOT work?}
281 Primarily, the reason that we are doing this:
283 \textbf{Computable and Executable Statistics}
285 (which is the subject of another talk, slides in the backup).
286 \end{frame}
290 \section{Discussion}
292 \begin{frame}{Conclusion}
293 Active but slow development, spanning the range of needs:
294 \begin{itemize}
295 \item Numerics: Linear algebra basics done -- full development
296 \item Static graphics: progress being made, have a partial
297 grid-solution, need interactive graphics
298 \item LispStat emulation needs to be finished
299 \item Model specification and unification
300 \end{itemize}
301 Related numerical/statistical projects:
302 \begin{itemize}
303 \item Incanter : R/LispStat/Omegahat-like system for Clojure (Lisp
304 on the JVM)
305 \item FEMLisp : system/workshop for finite-element analysis modeling
306 using Lisp
307 \item matlisp/LispLab : LAPACK-based numerical linear algebra packages
308 \item GSLL : GNU Scientific Library, Lisp interface.
309 \end{itemize}
310 Finally: support for a new statistical programming environment
311 modality (subject for another talk).
312 \end{frame}
315 \end{document}
317 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
321 \section{BACKUPS}
324 \section{Common Lisp}
326 \begin{frame}[fragile]{Finding out things}
327 \begin{itemize}
328 \item CL-NUMLIB
329 num-sequence :from LOW :to HIGH :length SEQ-LENGTH
330 seq(from,to,by/length)
331 \item
332 \begin{verbatim}
333 (documentation
334 'cl-numlib:num-sequence
335 'function)
336 \end{verbatim}
337 \item This
338 \end{itemize}
339 \end{frame}
341 \section{Computable Statistics}
343 \begin{frame}{Can we compute with them?}
344 3 Examples, of which we only present the first
345 \begin{itemize}
346 \item Research.
347 \item Consulting, Applied Statistics, Scientific Honesty.
348 \item Reimplementation.
349 \end{itemize}
350 Consider whether one can ``compute'' with the information given.
351 That is:
352 \begin{itemize}
353 \item do we have sufficient information to communicate enough
354 for the right person to recreate the analysis?
355 \item have we sufficient clarity to prevent misunderstandings about
356 intentions and claims?
357 \end{itemize}
359 \end{frame}
361 \begin{frame}[fragile]{Example 1: Theory\ldots}
362 \label{example1}
363 Let $f(x;\theta)$ describe the likelihood of XX under the following
364 assumptions.
365 \begin{enumerate}
366 \item assumption-1
367 \item assumption-2
368 \end{enumerate}
369 Then if we use the following algorithm:
370 \begin{enumerate}
371 \item step-1
372 \item step-2
373 \end{enumerate}
374 then $\hat{\theta}$ should be $N(0,\hat\sigma^2)$ with the following
375 characteristics\ldots
376 \end{frame}
378 \begin{frame}
379 \frametitle{Can we compute, using this description?}
380 Given the information at hand:
381 \begin{itemize}
382 \item we ought to have a framework for initial coding for the
383 actual simulations (test-first!)
384 \item the implementation is somewhat clear
385 \item We should ask: what theorems have similar assumptions?
386 \item We should ask: what theorems have similar conclusions but
387 different assumptions?
388 \end{itemize}
389 \end{frame}
391 \begin{frame}[fragile]{Realizing Theory}
392 \small{
393 \begin{verbatim}
394 (define-theorem my-proposed-theorem
395 (:theorem-type '(distribution-properties
396 frequentist
397 likelihood))
398 (:assumes '(assumption-1 assumption-2))
399 (:likelihood-form
400 (defun likelihood (data theta gamma)
401 (exponential-family theta gamma)))
402 (:compute-by
403 '(progn
404 (compute-starting-values thetahat gammahat)
405 (until (convergence)
406 (setf convergence
407 (or (step-1 thetahat)
408 (step-2 gammahat))))))
409 (:claim (assert
410 (and (equal-distribution thetahat 'normal)
411 (equal-distribution gammahat 'normal)))))
412 \end{verbatim}
413 }
414 \end{frame}
416 \begin{frame}[fragile]{It would be nice to have}
417 \begin{verbatim}
418 (theorem-veracity 'my-proposed-theorem)
419 \end{verbatim}
420 \end{frame}
422 \begin{frame}[fragile]{and why not...?}
423 \begin{verbatim}
424 (when (theorem-veracity
425 'my-proposed-theorem)
426 (write-paper 'my-proposed-theorem
427 :style :JASA
428 :output-format
429 '(LaTeX MSWord)))
430 \end{verbatim}
431 \end{frame}
433 \begin{frame}{Comments}
434 \begin{itemize}
435 \item The general problem is very difficult
436 \item I'm working on some basic statistical proof of concepts (not
437 finished): linear regression (LS-based, Normal-bayesian) and the
438 T-test.
439 \item Areas targeted for medium-term future: resampling methods and
440 similar algorithms.
441 \end{itemize}
442 \end{frame}
444 \begin{frame}
445 \frametitle{Example 2: Practice\ldots}
446 \label{example2}
447 The dataset comes from a series of clinical trials. We model the
448 primary endpoint, ``relief'', as a binary random variable. There is
449 a random trial effect on relief as well as severity due to
450 differences in recruitment and inclusion/exclusion criteria.
451 \end{frame}
453 \begin{frame}
454 \frametitle{Can we compute, using this description?}
455 \begin{itemize}
456 \item With a real such description, it is clear what some of the
457 potential models might be for this dataset
458 \item It should be clear how to start thinking of a data dictionary
459 for this problem.
460 \end{itemize}
461 \end{frame}
463 \begin{frame}[fragile]{Can we compute?}
464 \begin{verbatim}
465 (dataset-metadata paper-1
466 :context 'clinical-trials
467 :variables '((relief :model-type dependent
468 :distribution binary)
469 (trial :model-type independent
470 :distribution categorical)
471 (disease-severity))
472 :metadata '(inclusion-criteria
473 exclusion-criteria
474 recruitment-rate))
475 (propose-analysis paper-1)
476 ; => '(tables
477 ; (logistic regression))
478 \end{verbatim}
479 \end{frame}
481 \begin{frame}{Example 3: The Round-trip\ldots}
482 \label{example3}
483 The first examples describe ``ideas $\rightarrow$ code''
485 Consider the last time you read someone else's implementation of a
486 statistical procedure (i.e. R package code). When you read the
487 code, could you see:
488 \begin{itemize}
489 \item the assumptions used?
490 \item the algorithm implemented?
491 \item practical guidance for when you might select the algorithm
492 over others?
493 \item practical guidance for when you might select the
494 implementation over others?
495 \end{itemize}
496 These are usually components of any reasonable journal article.
497 \textit{(Q: have you actually read an R package that wasn't yours?)}
498 \end{frame}
500 \begin{frame}{Exercise left to the reader!}
502 % (aside: I have been looking at the \textbf{stats} and \textbf{lme4}
503 % packages recently -- \textit{for me}, very clear numerically, much
504 % less so statistically)
505 \end{frame}
509 \section{Context}
511 \begin{frame}{Goals for this Talk}{(define, strategic approach,
512 justify)}
514 \begin{itemize}
515 \item To describe the concept of \alert{computable and executable
516 statistics}, placing it in a historical context.
518 \item To demonstrate that \alert{a research program}
519 implemented through simple steps can increase the efficiency of
520 statistical computing approaches by clearly describing both:
521 \begin{itemize}
522 \item numerical characteristics of procedures,
523 \item statistical concepts driving them.
524 \end{itemize}
526 \item To justify that the \alert{approach is worthwhile} and
527 represents a staged effort towards \alert{increased use of best
528 practices}.
529 \end{itemize}
530 (unfortunately, the last is still incomplete)
531 \end{frame}
534 \begin{frame}{Historical Computing Languages}
535 \begin{itemize}
536 \item FORTRAN : FORmula TRANslator. Original numerical computing
537 language, designed for clean implementation of numerical
538 algorithms
539 \item LISP : LISt Processor. Associated with symbolic
540 manipulation, AI, and knowledge approaches
541 \end{itemize}
543 They represent the 2 generalized needs of statistical computing,
544 which could be summarized as
545 \begin{itemize}
546 \item algorithms/numerics,
547 \item elicitation, communication, and generation of knowledge (``data
548 analysis'')
549 \end{itemize}
550 \end{frame}
552 \begin{frame}{Statistical Computing Environments}
554 Past:
555 \begin{itemize}
556 \item SPSS / BMDP / SAS
557 \item S ( S, S-PLUS, R)
558 \item LispStat ( XLispStat, ViSta, ARC , CommonLispStat ) ; QUAIL
559 \item XGobi (Orca / GGobi / Statistical Reality Engine)
560 \item MiniTab
561 \item Stata
562 \item DataDesk
563 \item Augsburg Impressionist series (MANET, \ldots)
564 \item Excel
565 \end{itemize}
566 many others...
568 \end{frame}
570 \begin{frame}{How many are left?}
572 \begin{itemize}
573 \item R
574 \item SAS
575 \item SPSS
576 \item Stata
577 \item Minitab
578 \item very few others...
579 \end{itemize}
580 ``R is the Microsoft of the statistical computing world'' -- anonymous.
581 \end{frame}
583 \begin{frame}{Selection Pressure}
584 \begin{itemize}
585 \item the R user population is growing rapidly, fueled by critical
586 mass, quality, and value
587 \item R is a great system for applied data analysis
588 \item R is not such a great system for research into statistical
589 computing (backwards compatibility, inertia due to user population)
590 \end{itemize}
591 There is a need for alternative experiments for developing new
592 approaches/ideas/concepts.
593 \end{frame}
595 \begin{frame}{Philosophically, why Common Lisp?}
596 Philosophically:
597 \begin{itemize}
598 \item Lisp can cleanly present computational intentions, both
599 symbolically and numerically.
600 \item Semantics and context are important: well supported by Lisp
601 paradigms.
602 \item Lisp's parentheses describe singular, multi-scale,
603 \alert{complete thoughts}.
604 \end{itemize}
606 \end{frame}
608 \begin{frame}{Technically, why Common Lisp?}
609 \begin{itemize}
610 \item interactive COMPILED language (``R with a compiler'')
611 \item CLOS is R's S4 object system ``done right''.
612 \item clean semantics: modality, typing, can be expressed the way
613 one wants it.
614 \item programs are data, data are programs, leading to
615 \item Most modern computing tools available (XML, WWW technologies)
616 \item ``executable XML''
617 \end{itemize}
618 Common Lisp is very close in usage to how people currently use R
619 (mostly interactive, some batch, and a wish for compilation efficiency).
620 \end{frame}
622 \subsection{Background}
624 \begin{frame}
625 \frametitle{Desire: Semantics and Statistics}
626 \begin{itemize}
627 \item The semantic web (content which is self-descriptive) is an
628 interesting and potentially useful idea.
630 \item
631 Biological informatics support (GO, Entrez) has allowed for
632 precise definitions of concepts in biology.
634 \item It is a shame that a field like statistics, requiring such
635 precision, has less such support than an imprecise and temporally unstable
636 field such as biology\ldots
637 \end{itemize}
639 How can we express statistical work (research, applied work) which
640 is both human and computer readable (perhaps subject to
641 transformations first)?
642 \end{frame}
645 % \subsection{Context}
647 % \begin{frame}{Context}{(where I'm coming from, my ``priors'')}
648 % \begin{itemize}
649 % \item Pharmaceutical Industry
650 % \item Modeling and Simulation uses mathematical models/constructs to
651 % record beliefs (biology, pharmacology, clinical science) for
652 % explication, clinical team alignment, decision support, and
653 % quality.
654 % \item My work at Novartis is at the intersection of biomedical
655 % informatics, statistics, and mathematical modeling.
656 % \item As manager: I need a mix of applications and novel research development to
657 % solve our challenges better, faster, more efficiently.
658 % \item Data analysis is a specialized approach to computer
659 % programming, \alert{different} than applications programming or
660 % systems programming.
661 % \end{itemize}
662 % \end{frame}
665 \subsection{Literate Programming is insufficient}
667 \begin{frame}{Literate Statistical Practice.}
668 \begin{enumerate}
669 \item Literate Programming applied to data analysis (Rossini, 1997/2001)
670 \item among the \alert{most annoying} techniques to integrate into
671 work-flow if one is not perfectly methodical.
672 \item Some tools:
673 \begin{itemize}
674 \item ESS: supports interactive creation of literate programs.
675 \item Sweave: tool which exemplifies reporting context; odfWeave
676 primarily simplifies reporting.
677 \item Roxygen: primarily supports a literate programming
678 documentation style, not a literate data analysis programming
679 style.
680 \end{itemize}
681 \item ROI demonstrated in specialized cases: BioConductor.
682 \item \alert{usually done after the fact} (final step of work-flow)
683 as a documentation/computational reproducibility technique, rarely
684 integrated into work-flow.
685 \end{enumerate}
686 Many contributors:
687 Knuth, Claerbout, Carey, de Leeuw, Leisch, Gentleman, Temple-Lang,
688 \ldots{}
689 \end{frame}
691 \begin{frame}
692 \frametitle{Literate Programming}
693 \framesubtitle{Why isn't it enough for Data Analysis?}
695 Only 2 contexts: (executable) code and documentation. Fine for
696 application programming, but for data analysis, we could benefit
697 from:
698 \begin{itemize}
699 \item classification of statistical procedures
700 \item descriptions of assumptions
701 \item pragmatic recommendations
702 \item inheritance of structure through the work-flow of a
703 statistical methodology or data analysis project
704 \item datasets and metadata
705 \end{itemize}
706 Concept: ontologies describing mathematical assumptions, applications
707 of methods, work-flow, and statistical data structures can enable
708 machine communication.
710 (i.e., an informatics framework \`a la biology)
711 \end{frame}
714 \begin{frame}{Communication in Statistical Practice}{\ldots is essential for \ldots}
715 \begin{itemize}
716 \item finding
717 \item explanations
718 \item agreement
719 \item receiving information
720 \end{itemize}
721 \alert{``machine-readable'' communication/computation lets the
722 computer help} \\
723 Semantic Web is about ``machine-enabled computability''.
724 \end{frame}
726 \begin{frame} \frametitle{Semantics}
727 \framesubtitle{One definition: description and context}
729 Interoperability is the key, with respect to
730 \begin{itemize}
731 \item ``Finding things''
732 \item Applications and activities with related functionality
733 \begin{itemize}
734 \item moving information from one state to another (paper, journal
735 article, computer program)
736 \item computer programs which implement solutions to similar tasks
737 \end{itemize}
738 \end{itemize}
739 \end{frame}
742 \begin{frame}{Statistical Practice is somewhat restricted}
743 {...but in a good sense, enabling potential for semantics...}
745 There is a restrictable set of intended actions for what can be done
746 -- the critical goal is to be able to make a difference by
747 accelerating activities that should be ``computable'':
748 \begin{itemize}
749 \item restricted natural language processing
750 \item mathematical translation
751 \item common description of activities for simpler programming/data
752 analysis (S approach to objects and methods)
753 \end{itemize}
754 R is a good basic start (model formulation approach, simple
755 ``programming with data'' paradigm); we should see if we can do
756 better!
757 \end{frame}
759 \begin{frame}{Computable and Executable Statistics requires}
761 \begin{itemize}
762 \item approaches to describe data and metadata (``data'')
763 \begin{itemize}
764 \item semantic WWW
765 \item metadata management and integration, driving
766 \item data integration
767 \end{itemize}
768 \item approaches to describe data analysis methods (``models'')
769 \begin{itemize}
770 \item quantitatively: many ontologies (AMS, etc), few meeting
771 statistical needs.
772 \item many substantive fields have implementations
773 (bioinformatics, etc) but not well focused.
774 \end{itemize}
775 \item approaches to describe the specific form of interaction
776 (``instances of models'')
777 \begin{itemize}
778 \item Original idea behind ``Literate Statistical Analysis''.
779 \item That idea is suboptimal, more structure needed (not
780 necessarily built upon existing...).
781 \end{itemize}
782 \end{itemize}
783 \end{frame}
785 \subsection{Common Lisp Statistics}
787 \begin{frame}
788 \frametitle{Interactive Programming}
789 \framesubtitle{Everything goes back to being Lisp-like}
790 \begin{itemize}
791 \item Interactive programming (as originating with Lisp): works
792 extremely well for data analysis (Lisp being the original
793 ``programming with data'' language).
794 \item Theories/methods for how to do this are reflected in styles
795 for using R.
796 \end{itemize}
797 \end{frame}
799 \begin{frame}[fragile]
800 \frametitle{Lisp}
802 Lisp (LISt Processor) is different than most high-level computing
803 languages, and is very old (1956). Lisp is built on lists of things
804 which are evaluatable.
805 \begin{verbatim}
806 (functionName data1 data2 data3)
807 \end{verbatim}
808 or ``quoted'':
809 \begin{verbatim}
810 '(functionName data1 data2 data3)
811 \end{verbatim}
812 which is shorthand for
813 \begin{verbatim}
814 (quote (functionName data1 data2 data3))
815 \end{verbatim}
816 The difference is important -- lists of data (the second/third) are
817 not (yet?!) functions applied to (unencapsulated lists of) data (the first).
818 \end{frame}
820 \begin{frame}
821 \frametitle{Features}
822 \begin{itemize}
823 \item Data and Functions semantically the same
824 \item Natural interactive use through functional programming with
825 side effects
826 \item Batch is a simplification of interactive -- not a special mode!
827 \end{itemize}
828 \end{frame}
832 \begin{frame}[fragile]{Representation: XML and Lisp}{executing your data}
833 Many people are familiar with XML:
834 \begin{verbatim}
835 <name phone="+41793674557">Tony Rossini</name>
836 \end{verbatim}
837 which is shorter in Lisp:
838 \begin{verbatim}
839 (name "Tony Rossini" :phone "+41613674557")
840 \end{verbatim}
841 \begin{itemize}
842 \item Lisp ``parens'', universally hated by unbelievers, are
843 wonderful for denoting when a ``concept is complete''.
844 \item Why can't your data self-execute?
845 \end{itemize}
846 \end{frame}
848 \begin{frame}[fragile]{Numerics with Lisp}
849 \begin{itemize}
850 \item addition of rational numbers and arithmetic
851 \item example for mean
852 \begin{verbatim}
853 (defun mean (x)
854 (checktype x 'vector-like)
855 (/ (loop for i from 0 to (- (nelts x) 1)
856 summing (vref x i))
857 (nelts x)))
858 \end{verbatim}
859 \item example for variance
860 \begin{verbatim}
861 (defun variance (x)
862 (let ((meanx (mean x))
863 (nm1 (1- (nelts x))))
864 (/ (loop for i from 0 to nm1
865 summing (power (- (vref x i) meanx) 2))
866 nm1)))
867 \end{verbatim}
868 \item But through macros, \verb+(vref *x* i)+ could be
869 \verb+#V(X[i])+ or your favorite syntax.
870 \end{itemize}
872 \end{frame}
875 \begin{frame}{Common Lisp Statistics 1}
876 \begin{itemize}
877 \item Originally based on LispStat (reusability)
878 \item Re-factored structure (some numerics worked with a 1990-era code base).
879 \item Current activities:
880 \begin{enumerate}
881 \item numerics redone using CFFI-based BLAS/LAPACK (cl-blapack)
882 \item matrix interface based on MatLisp
883 \item starting design of a user interface system (interfaces,
884 visuals).
885 \item general framework for model specification (regression,
886 likelihood, ODEs)
887 \item general framework for algorithm specification (bootstrap,
888 MLE, algorithmic data analysis methods).
889 \end{enumerate}
890 \end{itemize}
891 \end{frame}
893 \begin{frame}{Common Lisp Statistics 2}
895 \begin{itemize}
896 \item Implemented using SBCL. Contributed fixes for
897 Clozure/OpenMCL. Goal to target CLISP
898 \item Supports LispStat prototype object system
899 \item Package-based design -- only use the components you need, or
900 the components whose API you like.
901 \end{itemize}
902 \end{frame}
904 \section{Discussion}
906 \begin{frame}
907 \frametitle{Outlook}
908 \begin{itemize}
909 \item Semantics and Computability have captured a great deal of
910 attention in the informatics and business computing R\&D worlds
911 \item Statistically-driven Decision Making and Knowledge Discovery
912 is, with high likelihood, the next challenging stage after data
913 integration.
914 \item Statistical practice (theory and application) can be enhanced,
915 made more efficient, providing increased benefit to organizations
916 and groups using appropriate methods.
917 \item Lisp, as a language, shares characteristics of both Latin
918 (difficult dead language useful for classical training) and German
919 (difficult living language useful for general life). Of course,
920 for some people, they are not difficult.
921 \end{itemize}
923 \end{frame}
925 \begin{frame}
926 The research program described in this talk is currently driving the
927 design of CommonLisp Stat, which leverages concepts and approaches
928 from the dead and moribund LispStat project.
930 \begin{itemize}
931 \item \url{http://repo.or.cz/w/CommonLispStat.git/}
932 \item \url{http://www.github.com/blindglobe/}
933 \end{itemize}
935 \end{frame}
936 \begin{frame}{Final Comment}
938 \begin{itemize}
939 \item In the Pharma industry, it is all about getting the right
940 drugs to the patient faster. Data analysis systems seriously
941 impact this process, being potentially an impediment or an
942 accelerator.
944 \begin{itemize}
945 \item \alert{Information technologies can increase the efficiency
946 of statistical practice}, though innovation change management
947 must be taken into account. (i.e. Statistical practice, while
948 considered by some an ``art form'', can benefit from
949 industrialization).
950 \item \alert{Lisp's features match the basic requirements we need}
951 (dichotomy: programs as data, data as programs). Sales pitch,
952 though...
953 \item Outlook: Lots of work and experimentation to do!
954 \end{itemize}
955 \end{itemize}
956 \end{frame}
959 % % All of the following is optional and typically not needed.
960 % \appendix
963 % \section<presentation>*{\appendixname}
966 % \begin{frame} \frametitle{Complements and Backup}
967 % No more, stop here. Questions? (now or later).
968 % \end{frame}
970 % \begin{frame}{The Industrial Challenge.}{Getting the Consulting Right.}
971 % % - A title should summarize the slide in an understandable fashion
972 % % for anyone who does not follow everything on the slide itself.
974 % \begin{itemize}
975 % \item Recording assumptions for the next data analyst, reviewer.
976 % Use \texttt{itemize} a lot.
977 % \item
978 % Use very short sentences or short phrases.
979 % \end{itemize}
980 % \end{frame}
983 % \begin{frame}{The Industrial Challenge.}{Getting the Right Research Fast.}
984 % % - A title should summarize the slide in an understandable fashion
985 % % for anyone who does not follow everything on the slide itself.
987 % \begin{itemize}
988 % \item
989 % Use \texttt{itemize} a lot.
990 % \item
991 % Use very short sentences or short phrases.
992 % \end{itemize}
993 % \end{frame}
996 % \begin{frame}{Explicating the Work-flow}{QA/QC-based improvements.}
999 % \end{frame}
1001 % \section{Motivation}
1003 % \subsection{IT Can Speed up Deliverables in Statistical Practice}
1005 % \begin{frame}{Our Generic Work-flow and Life-cycle}
1006 % {describing most data analytic activities}
1007 % Workflow:
1008 % \begin{enumerate}
1009 % \item Scope out the problem
1010 % \item Sketch out a potential solution
1011 % \item Implement until road-blocks appear
1012 % \item Deliver results
1013 % \end{enumerate}
1015 % Lifecycle:
1016 % \begin{enumerate}
1017 % \item paper sketch
1018 % \item 1st e-draft of text/code/data (iterate to \#1, discarding)
1019 % \item cycle through work
1020 % \item publish
1021 % \item ``throw-away''
1022 % \end{enumerate}
1023 % but there is valuable information that could enable the next
1024 % generation!
1025 % \end{frame}
1027 % \begin{frame}[fragile]{Paper $\rightarrow$ Computer $\rightarrow$ Article $\rightarrow$ Computer}{Cut and Paste makes for large errors.}
1028 % \begin{itemize}
1029 % \item Problems in a regulatory setting
1030 % \item Regulatory issues are just ``best practices''
1031 % \end{itemize}
1033 % Why do we ``copy/paste'', or analogously, restart our work?
1035 % pro:
1036 % \begin{itemize}
1037 % \item every time we repeat, we reinforce the idea in our brain
1038 % \item review of ideas can help improve them
1039 % \end{itemize}
1040 % con:
1041 % \begin{itemize}
1042 % \item inefficiency
1043 % \item introduction of mistakes
1044 % \item loss of historical context
1045 % \item changes to earlier work (on a different development branch)
1046 % can not propagate.
1047 % \end{itemize}
1048 % \end{frame}
1050 % \section{Semantics and Statistical Practice}
1053 % \begin{frame}
1054 % \frametitle{Statistical Activity Leads to Reports}
1055 % \framesubtitle{You read what you know, do you understand it?}
1057 % How can we improve the communication of the ideas we have?
1059 % Precision of communication?
1061 % \end{frame}
1065 % \begin{frame} \frametitle{Communication Requires Context}
1066 % \framesubtitle{Intentions imply more than one might like...}
1068 % \begin{itemize}
1069 % \item Consideration of what we might do
1070 % \item Applications with related functionality
1071 % \end{itemize}
1072 % \end{frame}
1076 % \begin{frame}
1077 % \frametitle{Design Patterns}
1078 % \framesubtitle{Supporting Work-flow Transitions}
1080 % (joint work with H Wickham): The point of this research program is
1081 % not to describe what to do at any particular stage of work, but to
1082 % encourage researchers and practitioners to consider how to handle the
1083 % translation and transfer of information between stages so that work
1084 % is not lost.
1086 % Examples of stages in a work-flow:
1087 % \begin{itemize}
1088 % \item planning, execution, reporting;
1089 % \item scoping, illustrative examples or counter examples, algorithmic construction,
1090 % article writing.
1091 % \item descriptive statistics, preliminary inferential analysis,
1092 % model/assumption checking, final inferential analysis,
1093 % communication of scientific results
1094 % \end{itemize}
1095 % Description of work-flows is essential to initiating discussions on
1096 % quality/efficiency of approaches to work.
1097 % \end{frame}
1099 % \section{Design Challenges}
1101 % \begin{frame}
1102 % \frametitle{Activities are enhanced by support}
1104 % \begin{itemize}
1105 % \item Mathematical manipulation can be enhanced by symbolic
1106 % computation
1107 % \item Statistical programming can be enabled by examples and related
1108 % algorithm implementation
1109 % \item Datasets, to a limited extent, can self-describe.
1110 % \end{itemize}
1111 % \end{frame}
1113 % \begin{frame}
1114 % \frametitle{Executable and Computable Science}
1116 % Use of algorithms and construction to describe how things work.
1118 % Support for agent-based approaches
1119 % \end{frame}
1122 % \begin{frame}
1123 % \frametitle{What is Data? Metadata?}
1125 % Data: what we've observed
1127 % MetaData: context for observations, enables semantics.
1128 % \end{frame}
1133 % % \begin{frame}[fragile]
1134 % % \frametitle{Defining Variables}
1135 % % \framesubtitle{Setting variables}
1136 % % \begin{verbatim}
1137 % % (setq <variable> <value>)
1138 % % \end{verbatim}
1139 % % Example:
1140 % % \begin{verbatim}
1141 % % (setq ess-source-directory
1142 % % "/home/rossini/R-src")
1143 % % \end{verbatim}
1144 % % \end{frame}
1146 % % \begin{frame}[fragile]
1147 % % \frametitle{Defining on the fly}
1148 % % \begin{verbatim}
1149 % % (setq ess-source-directory
1150 % % (lambda () (file-name-as-directory
1151 % % (expand-file-name
1152 % % (concat (default-directory)
1153 % % ess-suffix "-src")))))
1154 % % \end{verbatim}
1155 % % (Lambda-expressions are anonymous functions, i.e. ``instant-functions'')
1156 % % \end{frame}
1159 % % \begin{frame}[fragile]
1160 % % \frametitle{Function Reuse}
1161 % % By naming the function, we could make the previous example reusable
1162 % % (if possible):
1163 % % \begin{verbatim}
1164 % % (defun my-src-directory ()
1165 % % (file-name-as-directory
1166 % % (expand-file-name
1167 % % (concat (default-directory)
1168 % % ess-suffix "-src"))))
1169 % % \end{verbatim}
1170 % % Example:
1171 % % \begin{verbatim}
1172 % % (setq ess-source-directory (my-src-directory))
1173 % % \end{verbatim}
1174 % % \end{frame}
1177 % % \begin{frame}
1178 % % \frametitle{Equality Among Packages}
1179 % % \begin{itemize}
1180 % % \item more/less equal can be described specifically through
1181 % % overriding imports.
1182 % % \end{itemize}
1183 % % \end{frame}
1186 % \subsection<presentation>*{For Further Reading}
1188 % \begin{frame}[allowframebreaks]
1189 % \frametitle<presentation>{Related Material}
1191 % \begin{thebibliography}{10}
1193 % \beamertemplatebookbibitems
1194 % % Start with overview books.
1196 % \bibitem{LispStat1990}
1197 % L.~Tierney
1198 % \newblock {\em LispStat}.
1200 % \beamertemplatearticlebibitems
1201 % % Followed by interesting articles. Keep the list short.
1203 % \bibitem{Rossini2001}
1204 % AJ.~Rossini
1205 % \newblock Literate Statistical Practice
1206 % \newblock {\em Proceedings of the Conference on Distributed
1207 % Statistical Computing}, 2001.
1209 % \bibitem{RossiniLeisch2003}
1210 % AJ.~Rossini and F.~Leisch
1211 % \newblock Literate Statistical Practice
1212 % \newblock {\em Technical Report Series, University of Washington
1213 % Department of Biostatistics}, 2003.
1215 % \beamertemplatearrowbibitems
1216 % % Followed by interesting articles. Keep the list short.
1218 % \bibitem{CLS}
1219 % Common Lisp Stat, 2008.
1220 % \newblock \url{http://repo.or.cz/CommonLispStat.git/}
1222 % \end{thebibliography}
1223 % \end{frame}