more computational encoding of probability theory
[CommonLispStat.git] / Doc / talks / Rossini-DSC-July2009.tex
blob4fff19784b09076c11ce0a9e63e1a93532f8371b
1 \documentclass{beamer}
3 \mode<presentation>
5 \usetheme{classic}
6 \setbeamercovered{transparent}
9 \usepackage[english]{babel}
10 \usepackage[latin1]{inputenc}
11 \usepackage{times}
12 \usepackage[T1]{fontenc}
13 \usepackage{url}
15 \title[CLS]{Common Lisp Statistics}
16 \subtitle{Using History to design better data analysis environments}
17 \author[Rossini]{Anthony~(Tony)~Rossini}
19 \institute[Novartis and University of Washington]{
20 Group Head, Modeling and Simulation Statistics\\
21 Novartis Pharma AG, Switzerland
22 \and
23 Affiliate Assoc Prof, Biomedical and Health Informatics\\
24 University of Washington, USA}
26 \date[DSC2009]{DSC 2009, Copenhagen}
27 \subject{Statistical Computing Environments}
29 \begin{document}
31 \begin{frame}
32 \titlepage
33 \end{frame}
35 \section{What Works?}
36 \label{sec:work}
38 \begin{frame}{Is it Vaporware? Not quite}
39 The following is possible with the help of the open source Common Lisp
40 community, who provided most of the packages, tools, and glue.
41 (Tamas Papp, Raymond Toy, Mark Hoemmen, and many, many others).
42 Most of the underlying code was written by others, and ``composed''
43 by me.
44 \end{frame}
46 \subsection{Graphics}
47 \label{sec:work:graphics}
49 \begin{frame}{Silly Visualization Example}
50 \includegraphics[width=3in,height=3in]{/home/tony/test1.png}
51 \end{frame}
53 \begin{frame}[fragile]{How?}
54 \begin{verbatim}
55 (defparameter *frame2*
56 (as-frame (create-xlib-image-context 200 200)
57 :background-color +white+))
58 (bind ((#2A((f1 f2) (f3 f4))
59 (split-frame *frame2*
60 (percent 50)
61 (percent 50))))
62 (defparameter *f1* f1) ; lower left
63 (defparameter *f2* f2) ; lower right f3 f4
64 (defparameter *f3* f3) ; top left f1 f2
65 (defparameter *f4* f4)); top right
66 \end{verbatim}
67 \end{frame}
69 \begin{frame}[fragile]{Functions to Plot}
70 \begin{verbatim}
71 (plot-function *f1* #'sin
72 (interval-of 0 2)
73 :x-title "x" :y-title "sin(x)")
74 (plot-function *f2* #'cos (interval-of 0 2)
75 :x-title "x" :y-title "cos(x)")
76 (plot-function *f3* #'tan (interval-of 0 2)
77 :x-title "x" :y-title "tan(x)")
78 \end{verbatim}
79 \end{frame}
81 \begin{frame}[fragile]{Things to Plot}
82 \small{
83 \begin{verbatim}
84 (let* ((n 500)
85 (xs (num-sequence
86 :from 0 :to 10 :length n))
87 (ys (map 'vector
88 #'(lambda (x) (+ x 8 (random 4.0)))
89 xs))
90 (weights
91 (replicate #'(lambda () (1+ (random 10)))
92 n 'fixnum))
93 (da (plot-simple *f4*
94 (interval-of 0 10)
95 (interval-of 10 20)
96 :x-title "x" :y-title "y")))
97 (draw-symbols da xs ys :weights weights))
98 \end{verbatim}
99 }
100 \end{frame}
102 \begin{frame}[fragile]{Copying existing graphics}
103 And we generated the figure on the first page by:
104 \begin{verbatim}
105 (xlib-image-context-to-png
106 (context *f1*)
107 "/home/tony/test1.png")
108 \end{verbatim}
109 \end{frame}
111 \subsection{Statistical Models}
112 \label{sec:work:statmod}
114 \begin{frame}[fragile]{Linear Regression}
115 \small{
116 \begin{verbatim}
117 ;; Worse than LispStat, wrapping LAPACK's dgelsy:
118 (defparameter *result1*
119 (lm (list->vector-like iron)
120 (list->vector-like absorbtion)))
121 *result1* =>
122 ((#<LA-SIMPLE-VECTOR-DOUBLE (2 x 1)
123 -11.504913191235342
124 0.23525771181009483>
127 #<LA-SIMPLE-MATRIX-DOUBLE 2 x 2
128 9.730392177126686e-6 -0.001513787114206932
129 -0.001513787114206932 0.30357851215706255>
131 13 2)
132 \end{verbatim}
133 }
134 \end{frame}
136 \subsection{Data Manip/Mgmt}
137 \label{sec:work:data}
139 \begin{frame}[fragile]{DataFrames}
140 \small{
141 \begin{verbatim}
142 (defparameter *my-df-1*
143 (make-instance 'dataframe-array
144 :storage #2A((1 2 3 4 5) (10 20 30 40 50))
145 :doc "This is a boring dataframe-array"
146 :case-labels (list "x" "y")
147 :var-labels (list "a" "b" "c" "d" "e")))
149 (xref *my-df-1* 0 0) ; API change in progress
151 (setf (xref *my-df-1* 0 0) -1d0)
152 \end{verbatim}
153 }
154 \end{frame}
156 \begin{frame}[fragile]{Numerical Matrices}
157 \small{
158 \begin{verbatim}
159 (defparameter *mat-1*
160 (make-matrix 3 3
161 :initial-contents #2A((2d0 3d0 -4d0)
162 (3d0 2d0 -4d0)
163 (4d0 4d0 -5d0))))
165 (xref *mat-1* 2 0) ; => 4d0 ; API change
166 (setf (xref *mat-1* 2 0) -4d0)
168 (defparameter *xv*
169 (make-vector 4 :type :row
170 :initial-contents '((1d0 3d0 2d0 4d0))))
171 \end{verbatim}
172 }
173 \end{frame}
175 \begin{frame}[fragile]{Macros make the above tolerable}
176 \begin{verbatim}
177 (defparameter *xv*
178 (make-vector 4 :type :row
179 :initial-contents '((1d0 3d0 2d0 4d0))))
181 ; can use defmacro for the following syntax =>
183 (make-row-vector *xv* '((1d0 3d0 2d0 4d0)))
185 ; or reader macros for the following:
186 #mrv(*xv* '((1d0 3d0 2d0 4d0)))
187 \end{verbatim}
188 \end{frame}
190 \begin{frame}{Outline}
191 \tableofcontents
192 \end{frame}
194 \section{Common Lisp Statistics}
195 \label{sec:CLS}
197 \begin{frame}{Why CLS?}
198 \begin{itemize}
199 \item a component-based structure for statistical computing
200 \item Common Lisp provides a simple, \emph{primitive}, syntax
201 \item Common Lisp provides an amazing number of advanced features
202 that keep getting reinvented in other languages.
203 \item Common Lisp has linkages to many amazing features developed in
204 other languages.
205 \item ability to leverage non-statisticians interested in computing
206 technologies (compilers, protocols, interfaces, libraries,
207 functionality which can be reused for statistical purposes)
208 \item This is a ``customization'' through packages to support
209 statistical computing, not an independent language. ``Ala Carte'',
210 not ``Menu''.
211 \end{itemize}
212 \end{frame}
214 \subsection{Implementation Plans}
215 \label{sec:CLS:impl}
218 \begin{frame}{Current Functionality}
219 \begin{itemize}
220 \item basic dataframes (similar to R); indexing/slicing API under
221 development.
222 \item Basic regression (similar to XLispStat)
223 \item matrix storage both in foreign and lisp-centric areas.
224 \item LAPACK (small percentage, increasing), working with both
225 matrix storage types
226 \item static graphics (X11) including preliminary grid functionality based
227 on CAIRO. Generation of PNG files from graphics windows.
228 \item CSV file support
229 \item Common Lisp!
230 \end{itemize}
231 \end{frame}
233 \begin{frame}[fragile]{Computational Environment Supported}
234 \begin{itemize}
235 \item Should work on Linux, with recent SBCL versions
236 \item Definitely works on bleeding edge Debian (unstable).
237 \item Has worked for weak definitions of ``work'' on 4 different
238 people's environments (not quite, but sort of requires a
239 \verb+/home/tony/+ !)
240 \item Threaded support on threaded lisps (SBCL/CCL, soon CLISP).
241 But not yet integrated.
242 \end{itemize}
243 \end{frame}
245 \begin{frame}{Goals}
246 Short Term
247 \begin{itemize}
248 \item Better integration of data structures with statistical routines
249 (auto-handling with dataframes, rather than manual parsing).
250 \end{itemize}
251 Medium/Long Term
252 \begin{itemize}
253 \item Support for CLISP (byte-compiled interpreted lisp) and Clozure
254 Common Lisp (formerly OpenMCL)
255 \item high-level Front-end API to a number of matrix and numerical
256 packages and numerical structures
257 \item constraint system for interactive GUIs and graphics
258 \item full LispStat compatibility (object system partially works;
259 GUI support coming).
260 \item Integrated threading via Bordeaux threads (portable CL API package).
261 \end{itemize}
262 \end{frame}
264 \subsection{Common Lisp}
265 \label{sec:CLS:lisp}
267 \begin{frame}{Common Lisp}
268 \begin{itemize}
269 \item Parens provide clear delineation of a \textbf{Complete
270 Thought} (functional programming with side effects).
271 \item Lisp-2 (symbols can denote both a separate function and a value)
272 \item ANSI standard (built by committee, but the committee was
273 reasonably smart)
274 \item Many implementations
275 \item Most implementations are interactive \textbf{compiled}
276 languages (few are interpreted, and those are usually
277 byte-compiled).
278 \item The Original \emph{Programming with Data} Language
279 (\emph{Programs are Data} and \emph{Data are Executable} also
280 apply).
281 \item advanced, powerful, first-class macros (macros functionally
282 re-write code, allowing for structural clarity and complete
283 destruction of syntax, should that be reasonable)
284 \end{itemize}
285 \end{frame}
287 \begin{frame}{Common Lisp Packages}
288 (They are packages and called packages, not libraries. Some people
289 can rejoice!)
290 \begin{itemize}
291 \item infrastructure enhancement: infix-notation, data structures,
292 control and flow structures
293 \item numerics, graphics, GUIs,
294 \item primitive R to CL compiler (which could also be considered an
295 object-code compiler for R); 3 interfaces which embed R within CL.
296 \item Web 2.0 support and reporting facilities (similar to TeX) for PDF.
297 \end{itemize}
298 See \url{http://www.common-lisp.net/} and
299 \url{http://www.cliki.org/}. CLS sources can be found on
300 \url{http://github.com/blindglobe/}
301 \end{frame}
303 \section{What else about CLS is still Vaporware?}
305 \begin{frame}[fragile]{What does NOT work?}
306 Primarily, the reason that we are doing this:
308 \textbf{Computable and Executable Statistics}
310 (which is the subject of another talk, slides in the backup). But
311 consider XML:
312 \begin{verbatim}
313 <car brand="honda" engine="4cyl">accord</car>
314 \end{verbatim}
315 becomes
316 \begin{verbatim}
317 ; data follows keywords...
318 (car :brand 'honda :engine "4cyl" accord)
319 \end{verbatim}
320 \end{frame}
322 \section{Discussion}
324 \begin{frame}{Conclusion}
325 Active but slow development, spanning the range of needs:
326 \begin{itemize}
327 \item Numerics: Linear algebra basics done -- full development
328 \item Static graphics: progress being made, have a partial
329 grid-solution, need interactive graphics
330 \item LispStat emulation needs to be finished
331 \item Model specification and unification
332 \end{itemize}
333 Related numerical/statistical projects:
334 \begin{itemize}
335 \item Incanter : R/LispStat/Omegahat-like system for Clojure (Lisp
336 on the JVM)
337 \item FEMLisp : system/workshop for finite-element analysis modeling
338 using Lisp
339 \item matlisp/LispLab : LAPACK-based numerical linear algebra packages
340 \item GSLL : GNU Scientific Library, Lisp interface.
341 \end{itemize}
342 \end{frame}
344 \begin{frame}{Followup}
345 I'd be happy to talk with anyone on the following topics:
346 \begin{itemize}
347 \item Introduction to Common Lisp
348 \item support for new statistical programming environment modalities
349 (subject for another talk).
350 \item computable and executable statistics (code that explains
351 itself and can be parsed to generate knowledge about its claims;
352 ``XML's promise'')
353 \end{itemize}
354 and if you are interested in getting involved, or trying it out.
355 \end{frame}
357 \end{document}
359 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
363 \section{BACKUPS}
366 \section{Common Lisp}
368 \begin{frame}[fragile]{Finding out things}
369 \begin{itemize}
370 \item CL-NUMLIB
371 num-sequence :from LOW :to HIGH :length SEQ-LENGTH
372 seq(from,to,by/length)
373 \item
374 \begin{verbatim}
375 (documentation
376 'cl-numlib:num-sequence
377 'function)
378 \end{verbatim}
379 \item This
380 \end{itemize}
381 \end{frame}
383 \section{Computable Statistics}
385 \begin{frame}{Can we compute with them?}
386 3 Examples, of which we only present the first
387 \begin{itemize}
388 \item Research.
389 \item Consulting, Applied Statistics, Scientific Honesty.
390 \item Reimplementation.
391 \end{itemize}
392 Consider whether one can ``compute'' with the information given?
393 That is:
394 \begin{itemize}
395 \item do we have sufficient information to communicate enough
396 for the right person to recreate the analysis?
397 \item have we sufficient clarity to prevent misunderstandings about
398 intentions and claims?
399 \end{itemize}
401 \end{frame}
403 \begin{frame}[fragile]{Example 1: Theory\ldots}
404 \label{example1}
405 Let $f(x;\theta)$ describe the likelihood of XX under the following
406 assumptions.
407 \begin{enumerate}
408 \item assumption-1
409 \item assumption-2
410 \end{enumerate}
411 Then if we use the following algorithm:
412 \begin{enumerate}
413 \item step-1
414 \item step-2
415 \end{enumerate}
416 then $\hat{\theta}$ should be $N(0,\hat\sigma^2)$ with the following
417 characteristics\ldots
418 \end{frame}
420 \begin{frame}
421 \frametitle{Can we compute, using this description?}
422 Given the information at hand:
423 \begin{itemize}
424 \item we ought to have a framework for initial coding for the
425 actual simulations (test-first!)
426 \item the implementation is somewhat clear
427 \item We should ask: what theorems have similar assumptions?
428 \item We should ask: what theorems have similar conclusions but
429 different assumptions?
430 \end{itemize}
431 \end{frame}
433 \begin{frame}[fragile]{Realizing Theory}
434 \small{
435 \begin{verbatim}
436 (define-theorem my-proposed-theorem
437 (:theorem-type '(distribution-properties
438 frequentist
439 likelihood))
440 (:assumes '(assumption-1 assumption-2))
441 (:likelihood-form
442 (defun likelihood (data theta gamma)
443 (exponential-family theta gamma)))
444 (:compute-by
445 '(progn
446 (compute-starting-values thetahat gammahat)
447 (until (convergence)
448 (setf convergence
449 (or (step-1 thetahat)
450 (step-2 gammahat))))))
451 (:claim (assert
452 (and (equal-distribution thetahat 'normal)
453 (equal-distribution gammahat 'normal)))))
454 \end{verbatim}
455 }
456 \end{frame}
458 \begin{frame}[fragile]{It would be nice to have}
459 \begin{verbatim}
460 (theorem-veracity 'my-proposed-theorem)
461 \end{verbatim}
462 \end{frame}
464 \begin{frame}[fragile]{and why not...?}
465 \begin{verbatim}
466 (when (theorem-veracity
467 'my-proposed-theorem)
468 (write-paper 'my-proposed-theorem
469 :style :JASA
470 :output-format
471 '(LaTeX MSWord)))
472 \end{verbatim}
473 \end{frame}
475 \begin{frame}{Comments}
476 \begin{itemize}
477 \item The general problem is very difficult
478 \item I'm working on some basic statistical proofs of concept (not
479 finished): linear regression (LS-based, Normal-bayesian) and the
480 T-test.
481 \item Areas targeted for medium-term future: resampling methods and
482 similar algorithms.
483 \end{itemize}
484 \end{frame}
486 \begin{frame}
487 \frametitle{Example 2: Practice\ldots}
488 \label{example2}
489 The dataset comes from a series of clinical trials. We model the
490 primary endpoint, ``relief'', as a binary random variable. There is
491 a random trial effect on relief as well as severity due to
492 differences in recruitment and inclusion/exclusion criteria.
493 \end{frame}
495 \begin{frame}
496 \frametitle{Can we compute, using this description?}
497 \begin{itemize}
498 \item With such a real description, it is clear what some of the
499 potential models might be for this dataset
500 \item It should be clear how to start thinking of a data dictionary
501 for this problem.
502 \end{itemize}
503 \end{frame}
505 \begin{frame}[fragile]{Can we compute?}
506 \begin{verbatim}
507 (dataset-metadata paper-1
508 :context 'clinical-trials
509 :variables '((relief :model-type dependent
510 :distribution binary)
511 (trial :model-type independent
512 :distribution categorical)
513 (disease-severity))
514 :metadata '(inclusion-criteria
515 exclusion-criteria
516 recruitment-rate))
517 (propose-analysis paper-1)
518 ; => '(tables
519 ; (logistic regression))
520 \end{verbatim}
521 \end{frame}
523 \begin{frame}{Example 3: The Round-trip\ldots}
524 \label{example3}
525 The first examples describe ``ideas $\rightarrow$ code''
527 Consider the last time you read someone else's implementation of a
528 statistical procedure (i.e. R package code). When you read the
529 code, could you see:
530 \begin{itemize}
531 \item the assumptions used?
532 \item the algorithm implemented?
533 \item practical guidance for when you might select the algorithm
534 over others?
535 \item practical guidance for when you might select the
536 implementation over others?
537 \end{itemize}
538 These are usually components of any reasonable journal article.
539 \textit{(Q: have you actually read an R package that wasn't yours?)}
540 \end{frame}
542 \begin{frame}{Exercise left to the reader!}
544 % (aside: I have been looking at the \textbf{stats} and \textbf{lme4}
545 % packages recently -- \textit{for me}, very clear numerically, much
546 % less so statistically)
547 \end{frame}
551 \section{Context}
553 \begin{frame}{Goals for this Talk}{(define, strategic approach,
554 justify)}
556 \begin{itemize}
557 \item To describe the concept of \alert{computable and executable
558 statistics}, placing it in a historical context.
560 \item To demonstrate that \alert{a research program}
561 implemented through simple steps can increase the efficiency of
562 statistical computing approaches by clearly describing both:
563 \begin{itemize}
564 \item numerical characteristics of procedures,
565 \item statistical concepts driving them.
566 \end{itemize}
568 \item To justify that the \alert{approach is worthwhile} and
569 represents a staged effort towards \alert{increased use of best
570 practices}.
571 \end{itemize}
572 (unfortunately, the last is still incomplete)
573 \end{frame}
576 \begin{frame}{Historical Computing Languages}
577 \begin{itemize}
578 \item FORTRAN : FORmula TRANslator. Original numerical computing
579 language, designed for clean implementation of numerical
580 algorithms
581 \item LISP : LISt Processor. Associated with symbolic
582 manipulation, AI, and knowledge approaches
583 \end{itemize}
585 They represent the 2 generalized needs of statistical computing,
586 which could be summarized as
587 \begin{itemize}
588 \item algorithms/numerics,
589 \item elicitation, communication, and generation of knowledge (``data
590 analysis'')
591 \end{itemize}
592 \end{frame}
594 \begin{frame}{Statistical Computing Environments}
596 Past:
597 \begin{itemize}
598 \item SPSS / BMDP / SAS
599 \item S ( S, S-PLUS, R)
600 \item LispStat ( XLispStat, ViSta, ARC , CommonLispStat ) ; QUAIL
601 \item XGobi (Orca / GGobi / Statistical Reality Engine)
602 \item MiniTab
603 \item Stata
604 \item DataDesk
605 \item Augsburg Impressionist series (MANET, Mondrian, \ldots)
606 \item Excel
607 \end{itemize}
608 many others...
610 \end{frame}
612 \begin{frame}{How many are left?}
614 \begin{itemize}
615 \item R
616 \item SAS
617 \item SPSS
618 \item Stata
619 \item Minitab
620 \item very few others...
621 \end{itemize}
622 ``R is the Microsoft of the statistical computing world'' -- anonymous.
623 \end{frame}
625 \begin{frame}{Selection Pressure}
626 \begin{itemize}
627 \item the R user population is growing rapidly, fueled by critical
628 mass, quality, and value
629 \item R is a great system for applied data analysis
630 \item R is not such a great system for research into statistical
631 computing (backwards compatibility, inertia due to user population)
632 \end{itemize}
633 There is a need for alternative experiments for developing new
634 approaches/ideas/concepts.
635 \end{frame}
637 \begin{frame}{Philosophically, why Common Lisp?}
638 Philosophically:
639 \begin{itemize}
640 \item Lisp can cleanly present computational intentions, both
641 symbolically and numerically.
642 \item Semantics and context are important: well supported by Lisp
643 paradigms.
644 \item Lisp's parentheses describe singular, multi-scale,
645 \alert{complete thoughts}.
646 \end{itemize}
648 \end{frame}
650 \begin{frame}{Technically, why Common Lisp?}
651 \begin{itemize}
652 \item interactive COMPILED language (``R with a compiler'')
653 \item CLOS is R's S4 object system ``done right''.
654 \item clean semantics: modality, typing, can be expressed the way
655 one wants it.
656 \item programs are data, data are programs, leading to
657 \item Most modern computing tools available (XML, WWW technologies)
658 \item ``executable XML''
659 \end{itemize}
660 Common Lisp is very close in usage to how people currently use R
661 (mostly interactive, some batch, and a wish for compilation efficiency).
662 \end{frame}
664 \subsection{Background}
666 \begin{frame}
667 \frametitle{Desire: Semantics and Statistics}
668 \begin{itemize}
669 \item The semantic web (content which is self-descriptive) is an
670 interesting and potentially useful idea.
672 \item
673 Biological informatics support (GO, Entrez) has allowed for
674 precise definitions of concepts in biology.
676 \item It is a shame that a field like statistics, requiring such
677 precision, has less of this support than an imprecise and temporally
678 unstable field such as biology\ldots
679 \end{itemize}
681 How can we express statistical work (research, applied work) which
682 is both human and computer readable (perhaps subject to
683 transformations first)?
684 \end{frame}
687 % \subsection{Context}
689 % \begin{frame}{Context}{(where I'm coming from, my ``priors'')}
690 % \begin{itemize}
691 % \item Pharmaceutical Industry
692 % \item Modeling and Simulation uses mathematical models/constructs to
693 % record beliefs (biology, pharmacology, clinical science) for
694 % explication, clinical team alignment, decision support, and
695 % quality.
696 % \item My work at Novartis is at the intersection of biomedical
697 % informatics, statistics, and mathematical modeling.
698 % \item As manager: I need a mix of applications and novel research development to
699 % solve our challenges better, faster, more efficiently.
700 % \item Data analysis is a specialized approach to computer
701 % programming, \alert{different} than applications programming or
702 % systems programming.
703 % \end{itemize}
704 % \end{frame}
707 \subsection{Literate Programming is insufficient}
709 \begin{frame}{Literate Statistical Practice.}
710 \begin{enumerate}
711 \item Literate Programming applied to data analysis (Rossini, 1997/2001)
712 \item among the \alert{most annoying} techniques to integrate into
713 work-flow if one is not perfectly methodical.
714 \item Some tools:
715 \begin{itemize}
716 \item ESS: supports interactive creation of literate programs.
717 \item Sweave: tool which exemplifies reporting context; odfWeave
718 primarily simplifies reporting.
719 \item Roxygen: primarily supports a literate programming
720 documentation style, not a literate data analysis programming
721 style.
722 \end{itemize}
723 \item ROI demonstrated in specialized cases: BioConductor.
724 \item \alert{usually done after the fact} (final step of work-flow)
725 as a documentation/computational reproducibility technique, rarely
726 integrated into work-flow.
727 \end{enumerate}
728 Many contributors:
729 Knuth, Claerbout, Carey, de Leeuw, Leisch, Gentleman, Temple-Lang,
730 \ldots{}
731 \end{frame}
733 \begin{frame}
734 \frametitle{Literate Programming}
735 \framesubtitle{Why isn't it enough for Data Analysis?}
737 Only 2 contexts: (executable) code and documentation. Fine for
738 application programming, but for data analysis, we could benefit
739 from:
740 \begin{itemize}
741 \item classification of statistical procedures
742 \item descriptions of assumptions
743 \item pragmatic recommendations
744 \item inheritance of structure through the work-flow of a
745 statistical methodology or data analysis project
746 \item datasets and metadata
747 \end{itemize}
748 Concept: ontologies describing mathematical assumptions, applications
749 of methods, work-flow, and statistical data structures can enable
750 machine communication.
752 (i.e.\ an informatics framework \`a la biology)
753 \end{frame}
756 \begin{frame}{Communication in Statistical Practice}{\ldots is essential for \ldots}
757 \begin{itemize}
758 \item finding
759 \item explanations
760 \item agreement
761 \item receiving information
762 \end{itemize}
763 \alert{``machine-readable'' communication/computation lets the
764 computer help} \\
765 Semantic Web is about ``machine-enabled computability''.
766 \end{frame}
768 \begin{frame} \frametitle{Semantics}
769 \framesubtitle{One definition: description and context}
771 Interoperability is the key, with respect to
772 \begin{itemize}
773 \item ``Finding things''
774 \item Applications and activities with related functionality
775 \begin{itemize}
776 \item moving information from one state to another (paper, journal
777 article, computer program)
778 \item computer programs which implement solutions to similar tasks
779 \end{itemize}
780 \end{itemize}
781 \end{frame}
784 \begin{frame}{Statistical Practice is somewhat restricted}
785 {...but in a good sense, enabling potential for semantics...}
787 There is a restrictable set of intended actions for what can be done
788 -- the critical goal is to be able to make a difference by
789 accelerating activities that should be ``computable'':
790 \begin{itemize}
791 \item restricted natural language processing
792 \item mathematical translation
793 \item common description of activities for simpler programming/data
794 analysis (S approach to objects and methods)
795 \end{itemize}
796 R is a good basic start (model formulation approach, simple
797 ``programming with data'' paradigm); we should see if we can do
798 better!
799 \end{frame}
801 \begin{frame}{Computable and Executable Statistics requires}
803 \begin{itemize}
804 \item approaches to describe data and metadata (``data'')
805 \begin{itemize}
806 \item semantic WWW
807 \item metadata management and integration, driving
808 \item data integration
809 \end{itemize}
810 \item approaches to describe data analysis methods (``models'')
811 \begin{itemize}
812 \item quantitatively: many ontologies (AMS, etc), few meeting
813 statistical needs.
814 \item many substantive fields have implementations
815 (bioinformatics, etc) but not well focused.
816 \end{itemize}
817 \item approaches to describe the specific form of interaction
818 (``instances of models'')
819 \begin{itemize}
820 \item Original idea behind ``Literate Statistical Analysis''.
821 \item That idea is suboptimal, more structure needed (not
822 necessarily built upon existing...).
823 \end{itemize}
824 \end{itemize}
825 \end{frame}
827 \subsection{Common Lisp Statistics}
829 \begin{frame}
830 \frametitle{Interactive Programming}
831 \framesubtitle{Everything goes back to being Lisp-like}
832 \begin{itemize}
833 \item Interactive programming (as originating with Lisp): works
834 extremely well for data analysis (Lisp being the original
835 ``programming with data'' language).
836 \item Theories/methods for how to do this are reflected in styles
837 for using R.
838 \end{itemize}
839 \end{frame}
841 \begin{frame}[fragile]
842 \frametitle{Lisp}
844 Lisp (LISt Processor) is different than most high-level computing
845 languages, and is very old (1958). Lisp is built on lists of things
846 which are evaluatable.
847 \begin{verbatim}
848 (functionName data1 data2 data3)
849 \end{verbatim}
850 or ``quoted'':
851 \begin{verbatim}
852 '(functionName data1 data2 data3)
853 \end{verbatim}
854 which is shorthand for
855 \begin{verbatim}
856 (quote (functionName data1 data2 data3))
857 \end{verbatim}
858 The difference is important -- lists of data (the second/third) are
859 not (yet?!) functions applied to (unencapsulated lists of) data (the first).
860 \end{frame}
862 \begin{frame}
863 \frametitle{Features}
864 \begin{itemize}
865 \item Data and Functions semantically the same
866 \item Natural interactive use through functional programming with
867 side effects
868 \item Batch is a simplification of interactive -- not a special mode!
869 \end{itemize}
870 \end{frame}
874 \begin{frame}[fragile]{Representation: XML and Lisp}{executing your data}
875 Many people are familiar with XML:
876 \begin{verbatim}
877 <name phone="+41793674557">Tony Rossini</name>
878 \end{verbatim}
879 which is shorter in Lisp:
880 \begin{verbatim}
881 (name "Tony Rossini" :phone "+41613674557")
882 \end{verbatim}
883 \begin{itemize}
884 \item Lisp ``parens'', universally hated by unbelievers, are
885 wonderful for denoting when a ``concept is complete''.
886 \item Why can't your data self-execute?
887 \end{itemize}
888 \end{frame}
890 \begin{frame}[fragile]{Numerics with Lisp}
891 \begin{itemize}
892 \item addition of rational numbers and arithmetic
893 \item example for mean
894 \begin{verbatim}
895 (defun mean (x)
896 (checktype x 'vector-like)
897 (/ (loop for i from 0 to (- (nelts x) 1)
898 summing (vref x i))
899 (nelts x)))
900 \end{verbatim}
901 \item example for variance
902 \begin{verbatim}
903 (defun variance (x)
904 (let ((meanx (mean x))
905 (nm1 (1- (nelts x))))
906 (/ (loop for i from 0 to nm1
907 summing (power (- (vref x i) meanx) 2))
908 nm1)))
909 \end{verbatim}
910 \item But through macros, \verb+(vref x i)+ could be
911 \verb+#V(X[i])+ or your favorite syntax.
912 \end{itemize}
914 \end{frame}
917 \begin{frame}{Common Lisp Statistics 1}
918 \begin{itemize}
919 \item Originally based on LispStat (reusability)
920 \item Re-factored structure (some numerics worked with a 1990-era code base).
921 \item Current activities:
922 \begin{enumerate}
923 \item numerics redone using CFFI-based BLAS/LAPACK (cl-blapack)
924 \item matrix interface based on MatLisp
925 \item starting design of a user interface system (interfaces,
926 visuals).
927 \item general framework for model specification (regression,
928 likelihood, ODEs)
929 \item general framework for algorithm specification (bootstrap,
930 MLE, algorithmic data analysis methods).
931 \end{enumerate}
932 \end{itemize}
933 \end{frame}
935 \begin{frame}{Common Lisp Statistics 2}
937 \begin{itemize}
938 \item Implemented using SBCL. Contributed fixes for
939 Clozure/OpenMCL. Goal to target CLISP
940 \item Supports LispStat prototype object system
941 \item Package-based design -- only use the components you need, or
942 the components whose API you like.
943 \end{itemize}
944 \end{frame}
946 \section{Discussion}
948 \begin{frame}
949 \frametitle{Outlook}
950 \begin{itemize}
951 \item Semantics and Computability have captured a great deal of
952 attention in the informatics and business computing R\&D worlds
953 \item Statistically-driven Decision Making and Knowledge Discovery
954 is, with high likelihood, the next challenging stage after data
955 integration.
956 \item Statistical practice (theory and application) can be enhanced,
957 made more efficient, providing increased benefit to organizations
958 and groups using appropriate methods.
959 \item Lisp, as a language, shares characteristics of both Latin
960 (difficult dead language useful for classical training) and German
961 (difficult living language useful for general life). Of course,
962 for some people, they are not difficult.
963 \end{itemize}
965 \end{frame}
967 \begin{frame}
968 The research program described in this talk is currently driving the
969 design of CommonLisp Stat, which leverages concepts and approaches
970 from the dead and moribund LispStat project.
972 \begin{itemize}
973 \item \url{http://repo.or.cz/w/CommonLispStat.git/}
974 \item \url{http://www.github.com/blindglobe/}
975 \end{itemize}
977 \end{frame}
978 \begin{frame}{Final Comment}
980 \begin{itemize}
981 \item In the Pharma industry, it is all about getting the right
982 drugs to the patient faster. Data analysis systems seriously
983 impact this process, being potentially an impediment or an
984 accelerator.
986 \begin{itemize}
987 \item \alert{Information technologies can increase the efficiency
988 of statistical practice}, though innovation change management
989 must be taken into account. (i.e. Statistical practice, while
990 considered by some an ``art form'', can benefit from
991 industrialization).
992 \item \alert{Lisp's features match the basic requirements we need}
993 (dichotomy: programs as data, data as programs). Sales pitch,
994 though...
995 \item Outlook: Lots of work and experimentation to do!
996 \end{itemize}
997 \end{itemize}
998 \end{frame}
1001 % % All of the following is optional and typically not needed.
1002 % \appendix
1005 % \section<presentation>*{\appendixname}
1008 % \begin{frame} \frametitle{Complements and Backup}
1009 % No more, stop here. Questions? (now or later).
1010 % \end{frame}
1012 % \begin{frame}{The Industrial Challenge.}{Getting the Consulting Right.}
1013 % % - A title should summarize the slide in an understandable fashion
1014 % % for anyone who does not follow everything on the slide itself.
1016 % \begin{itemize}
1017 % \item Recording assumptions for the next data analyst, reviewer.
1018 % Use \texttt{itemize} a lot.
1019 % \item
1020 % Use very short sentences or short phrases.
1021 % \end{itemize}
1022 % \end{frame}
1025 % \begin{frame}{The Industrial Challenge.}{Getting the Right Research Fast.}
1026 % % - A title should summarize the slide in an understandable fashion
1027 % % for anyone who does not follow everything on the slide itself.
1029 % \begin{itemize}
1030 % \item
1031 % Use \texttt{itemize} a lot.
1032 % \item
1033 % Use very short sentences or short phrases.
1034 % \end{itemize}
1035 % \end{frame}
1038 % \begin{frame}{Explicating the Work-flow}{QA/QC-based improvements.}
1041 % \end{frame}
1043 % \section{Motivation}
1045 % \subsection{IT Can Speed up Deliverables in Statistical Practice}
1047 % \begin{frame}{Our Generic Work-flow and Life-cycle}
1048 % {describing most data analytic activities}
1049 % Workflow:
1050 % \begin{enumerate}
1051 % \item Scope out the problem
1052 % \item Sketch out a potential solution
1053 % \item Implement until road-blocks appear
1054 % \item Deliver results
1055 % \end{enumerate}
1057 % Lifecycle:
1058 % \begin{enumerate}
1059 % \item paper sketch
1060 % \item 1st e-draft of text/code/data (iterate to \#1, discarding)
1061 % \item cycle through work
1062 % \item publish
1063 % \item ``throw-away''
1064 % \end{enumerate}
1065 % but there is valuable information that could enable the next
1066 % generation!
1067 % \end{frame}
1069 % \begin{frame}[fragile]{Paper $\rightarrow$ Computer $\rightarrow$ Article $\rightarrow$ Computer}{Cut and Paste makes for large errors.}
1070 % \begin{itemize}
1071 % \item Problems in a regulatory setting
1072 % \item Regulatory issues are just ``best practices''
1073 % \end{itemize}
1075 % Why do we ``copy/paste'', or analogously, restart our work?
1077 % pro:
1078 % \begin{itemize}
1079 % \item every time we repeat, we reinforce the idea in our brain
1080 % \item review of ideas can help improve them
1081 % \end{itemize}
1082 % con:
1083 % \begin{itemize}
1084 % \item inefficiency
1085 % \item introduction of mistakes
1086 % \item loss of historical context
1087 % \item changes to earlier work (on a different development branch)
1088 % cannot propagate.
1089 % \end{itemize}
1090 % \end{frame}
1092 % \section{Semantics and Statistical Practice}
1095 % \begin{frame}
1096 % \frametitle{Statistical Activity Leads to Reports}
1097 % \framesubtitle{You read what you know, do you understand it?}
1099 % How can we improve the communication of the ideas we have?
1101 % Precision of communication?
1103 % \end{frame}
1107 % \begin{frame} \frametitle{Communication Requires Context}
1108 % \framesubtitle{Intentions imply more than one might like...}
1110 % \begin{itemize}
1111 % \item Consideration of what we might do
1112 % \item Applications with related functionality
1113 % \end{itemize}
1114 % \end{frame}
1118 % \begin{frame}
1119 % \frametitle{Design Patterns}
1120 % \framesubtitle{Supporting Work-flow Transitions}
1122 % (joint work with H Wickham): The point of this research program is
1123 % not to describe what to do at any particular stage of work, but to
1124 % encourage researchers and practitioners to consider how to
1125 % translate and transfer information between stages so that work
1126 % is not lost.
1128 % Examples of stages in a work-flow:
1129 % \begin{itemize}
1130 % \item planning, execution, reporting;
1131 % \item scoping, illustrative examples or counter examples, algorithmic construction,
1132 % article writing.
1133 % \item descriptive statistics, preliminary inferential analysis,
1134 % model/assumption checking, final inferential analysis,
1135 % communication of scientific results
1136 % \end{itemize}
1137 % Description of work-flows is essential to initiating discussions on
1138 % quality/efficiency of approaches to work.
1139 % \end{frame}
1141 % \section{Design Challenges}
1143 % \begin{frame}
1144 % \frametitle{Activities are enhanced by support}
1146 % \begin{itemize}
1147 % \item Mathematical manipulation can be enhanced by symbolic
1148 % computation
1149 % \item Statistical programming can be enabled by examples and related
1150 % algorithm implementation
1151 % \item Datasets, to a limited extent, can self-describe.
1152 % \end{itemize}
1153 % \end{frame}
1155 % \begin{frame}
1156 % \frametitle{Executable and Computable Science}
1158 % Use of algorithms and construction to describe how things work.
1160 % Support for agent-based approaches
1161 % \end{frame}
1164 % \begin{frame}
1165 % \frametitle{What is Data? Metadata?}
1167 % Data: what we've observed
1169 % MetaData: context for observations, enables semantics.
1170 % \end{frame}
1175 % % \begin{frame}[fragile]
1176 % % \frametitle{Defining Variables}
1177 % % \framesubtitle{Setting variables}
1178 % % \begin{verbatim}
1179 % % (setq <variable> <value>)
1180 % % \end{verbatim}
1181 % % Example:
1182 % % \begin{verbatim}
1183 % % (setq ess-source-directory
1184 % % "/home/rossini/R-src")
1185 % % \end{verbatim}
1186 % % \end{frame}
1188 % % \begin{frame}[fragile]
1189 % % \frametitle{Defining on the fly}
1190 % % \begin{verbatim}
1191 % % (setq ess-source-directory
1192 % % (lambda () (file-name-as-directory
1193 % % (expand-file-name
1194 % % (concat (default-directory)
1195 % % ess-suffix "-src")))))
1196 % % \end{verbatim}
1197 % % (Lambda-expressions are anonymous functions, i.e. ``instant-functions'')
1198 % % \end{frame}
1201 % % \begin{frame}[fragile]
1202 % % \frametitle{Function Reuse}
1203 % % By naming the function, we could make the previous example reusable
1204 % % (if possible):
1205 % % \begin{verbatim}
1206 % % (defun my-src-directory ()
1207 % % (file-name-as-directory
1208 % % (expand-file-name
1209 % % (concat (default-directory)
1210 % % ess-suffix "-src"))))
1211 % % \end{verbatim}
1212 % % Example:
1213 % % \begin{verbatim}
1214 % % (setq ess-source-directory (my-src-directory))
1215 % % \end{verbatim}
1216 % % \end{frame}
1219 % % \begin{frame}
1220 % % \frametitle{Equality Among Packages}
1221 % % \begin{itemize}
1222 % % \item more/less equal can be described specifically through
1223 % % overriding imports.
1224 % % \end{itemize}
1225 % % \end{frame}
1228 % \subsection<presentation>*{For Further Reading}
1230 % \begin{frame}[allowframebreaks]
1231 % \frametitle<presentation>{Related Material}
1233 % \begin{thebibliography}{10}
1235 % \beamertemplatebookbibitems
1236 % % Start with overview books.
1238 % \bibitem{LispStat1990}
1239 % L.~Tierney
1240 % \newblock {\em LispStat}.
1242 % \beamertemplatearticlebibitems
1243 % % Followed by interesting articles. Keep the list short.
1245 % \bibitem{Rossini2001}
1246 % AJ.~Rossini
1247 % \newblock Literate Statistical Practice
1248 % \newblock {\em Proceedings of the Conference on Distributed
1249 % Statistical Computing}, 2001.
1251 % \bibitem{RossiniLeisch2003}
1252 % AJ.~Rossini and F.~Leisch
1253 % \newblock Literate Statistical Practice
1254 % \newblock {\em Technical Report Series, University of Washington
1255 % Department of Biostatistics}, 2003.
1257 % \beamertemplatearrowbibitems
1258 % % Followed by interesting articles. Keep the list short.
1260 % \bibitem{CLS}
1261 % Common Lisp Stat, 2008.
1262 % \newblock \url{http://repo.or.cz/CommonLispStat.git/}
1264 % \end{thebibliography}
1265 % \end{frame}