\documentclass{beamer}

\mode<presentation>

\usetheme{classic}
\setbeamercovered{transparent}

\usepackage[english]{babel}
\usepackage[latin1]{inputenc}
\usepackage{times}
\usepackage[T1]{fontenc}

\title[CLS]{Common Lisp Statistics}
\subtitle{Using History to design better data analysis environments}
\author[Rossini]{Anthony~(Tony)~Rossini}

\institute[Novartis and University of Washington]{
  Group Head, Modeling and Simulation Statistics\\
  Novartis Pharma AG, Switzerland
  \and
  Affiliate Assoc Prof, Biomedical and Health Informatics\\
  University of Washington, USA}

\date[DSC2009]{DSC 2009, Copenhagen}
\subject{Statistical Computing Environments}

\begin{document}

\begin{frame}
  \titlepage
\end{frame}

\begin{frame}{Outline}
  \tableofcontents
\end{frame}
% Structuring a talk is a difficult task and the following structure
% may not be suitable. Here are some rules that apply for this
% solution:

% - Exactly two or three sections (other than the summary).
% - At *most* three subsections per section.
% - Talk about 30s to 2min per frame. So there should be between about
%   15 and 30 frames, all told.

% - A conference audience is likely to know very little of what you
%   are going to talk about. So *simplify*!
% - In a 20min talk, getting the main ideas across is hard
%   enough. Leave out details, even if it means being less precise than
%   you think necessary.
% - If you omit details that are vital to the proof/implementation,
%   just say so once. Everybody will be happy with that.
\section{What Works?}
\label{sec:work}

\begin{frame}{Is it Vaporware?}
  Not quite...
\end{frame}

\subsection{Graphics}
\label{sec:work:graphics}

\begin{frame}{Silly Visualization Example}
  \includegraphics[width=3in,height=3in]{/home/tony/test1.png}
\end{frame}
\begin{frame}[fragile]{Graphics Device}
\begin{verbatim}
(defparameter *frame2*
  (as-frame (create-xlib-image-context 200 200)
            :background-color +white+))

(bind ((#2A((f1 f2) (f3 f4))
        (split-frame *frame2*
                     (percent 50) (percent 50))))
  (defparameter *f1* f1)  ; bottom left
  (defparameter *f2* f2)  ; bottom right   f3 f4
  (defparameter *f3* f3)  ; top left       f1 f2
  (defparameter *f4* f4)) ; top right
\end{verbatim}
\end{frame}
\begin{frame}[fragile]{Functions to Plot}
\begin{verbatim}
(plot-function *f1* #'sin
               (interval-of 0 2)
               :x-title "x" :y-title "sin(x)")
(plot-function *f2* #'cos (interval-of 0 2)
               :x-title "x" :y-title "cos(x)")
(plot-function *f3* #'tan (interval-of 0 2)
               :x-title "x" :y-title "tan(x)")
\end{verbatim}
\end{frame}
\begin{frame}[fragile]{Things to Plot}
\begin{verbatim}
(let* ((n 500)
       (xs (num-sequence
            :from 0 :to 10 :length n))
       (ys (map 'vector
                #'(lambda (x) (+ x 8 (random 4.0)))
                xs))
       (weights
        (replicate #'(lambda () (1+ (random 10)))
                   n 'fixnum))
       (da (plot-simple *f4*
            (interval-of 0 10) (interval-of 10 20)
            :x-title "x" :y-title "y")))
  (draw-symbols da xs ys :weights weights))
\end{verbatim}
\end{frame}
\subsection{Statistical Models}
\label{sec:work:statmod}

\begin{frame}[fragile]{Linear Regression}
  Primitive LispStat-style interface, now a wrapper around LAPACK's
  \texttt{dgelsy}:
\begin{verbatim}
(defparameter *result1*
  (regression-model (list->vector-like iron)
                    (list->vector-like absorbtion)))
*result1* =>
\end{verbatim}
\end{frame}
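
%% Added illustration (not the CLS API): a plain Common Lisp
%% least-squares fit for one covariate, sketching what the
%% regression-model wrapper asks LAPACK to compute.  Assumes IRON and
%% ABSORBTION are equal-length lists of numbers, as used above.
\begin{frame}[fragile]{Aside: what the wrapper computes (sketch)}
  For intuition only -- a plain Common Lisp least-squares fit with a
  single covariate; the real work above is done by LAPACK:
\begin{verbatim}
(defun ols-slope-intercept (xs ys)
  "Least-squares fit of y = a + b*x for lists XS, YS."
  (let* ((n (length xs))
         (mx (/ (reduce #'+ xs) n))
         (my (/ (reduce #'+ ys) n))
         (sxy (reduce #'+
               (mapcar (lambda (x y)
                         (* (- x mx) (- y my)))
                       xs ys)))
         (sxx (reduce #'+
               (mapcar (lambda (x) (expt (- x mx) 2))
                       xs)))
         (b (/ sxy sxx)))
    (values (- my (* b mx)) b)))  ; intercept, slope

;; e.g. (ols-slope-intercept iron absorbtion)
\end{verbatim}
\end{frame}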
\subsection{Numerical Descriptions}
\label{sec:work:numdesc}

\begin{frame}[fragile]{Descriptives}
\begin{verbatim}
(mean iron)
\end{verbatim}
\end{frame}
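
%% Added sketch using only functions shown in this talk (MEAN here,
%% VARIANCE defined on a later slide); whether CLS currently ships a
%% separate standard-deviation function is an assumption, so it is
%% computed from VARIANCE instead.
\begin{frame}[fragile]{Descriptives, continued (sketch)}
  A few more calls, using only functions that appear in this talk:
\begin{verbatim}
(mean iron)             ; central tendency
(variance iron)         ; spread (defined later in this talk)
(sqrt (variance iron))  ; standard deviation, by hand
\end{verbatim}
\end{frame}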
\subsection{Data Manip/Mgmt}
\label{sec:work:data}

\begin{frame}[fragile]{DataFrames}
\end{frame}

\begin{frame}[fragile]{Numerical Matrices}
\end{frame}

\begin{frame}{Managing / Manipulating Data}
\end{frame}
\section{Graphics}
\label{sec:practice}

\begin{frame}[fragile]{Copying existing graphics}
  And we generated the figure shown on the earlier slide by:
\begin{verbatim}
(xlib-image-context-to-png
 (context *f1*)
 "/home/tony/test1.png")
\end{verbatim}
\end{frame}
\section{Common Lisp Statistics}

\begin{frame}{Why CLS?}
  \begin{itemize}
  \item a component-based structure for statistical computing
  \item ability to leverage non-statisticians interested in computing
    technologies (compilers, protocols, interfaces)
  \end{itemize}
\end{frame}
\begin{frame}{Current Functionality}
  \begin{itemize}
  \item basic dataframes (similar to R); indexing/slicing API under
    development
  \item basic regression (similar to XLispStat)
  \item matrix storage in both foreign and Lisp-centric memory
  \item LAPACK bindings (a small but growing subset), working with both
    matrix storage types
  \item static graphics (X11), including preliminary grid functionality
    based on Cairo; generation of PNG files from graphics windows
  \item CSV file support
  \item Common Lisp!
  \end{itemize}
\end{frame}
\begin{frame}[fragile]{Computational Environments Supported}
  \begin{itemize}
  \item Should work on Linux, with recent SBCL versions.
  \item Definitely works on bleeding-edge Debian (unstable).
  \item Has worked in 4 different people's environments (not quite,
    but it sort of requires a \verb+/home/tony/+!)
  \end{itemize}
\end{frame}
\begin{frame}{Common Lisp}
  advanced iteration (sketched on the next slide)
\end{frame}
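
%% Added illustration: standard Common Lisp LOOP, nothing CLS-specific,
%% to make "advanced iteration" concrete.
\begin{frame}[fragile]{Advanced iteration: \texttt{loop} (sketch)}
  The standard \texttt{loop} macro covers most iteration patterns used
  in data analysis:
\begin{verbatim}
;; running sum and count in one pass
(loop for x in '(1 2 3 4 5)
      sum x into total
      count x into n
      finally (return (/ total n)))   ; => 3

;; collect transformed values
(loop for i from 0 below 5
      collect (* i i))                ; => (0 1 4 9 16)
\end{verbatim}
\end{frame}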
\begin{frame}[fragile]{Finding out things}
  \begin{itemize}
  \item CL-NUMLIB: \\
    \verb+num-sequence :from LOW :to HIGH :length SEQ-LENGTH+ \\
    (the analogue of R's \verb+seq(from, to, by/length)+)
  \item
\begin{verbatim}
(documentation
 'cl-numlib:num-sequence
 'function)
\end{verbatim}
  \item This
  \end{itemize}
\end{frame}
\section{Computable Statistics}

\begin{frame}{What does NOT work?}
  Primarily, the reason that we are doing this:

  \textbf{Computable and Executable Statistics}
\end{frame}

\begin{frame}{Can we compute with them?}
  3 examples:
  \begin{itemize}
  \item Research
  \item Consulting
  \item Reimplementation
  \end{itemize}
  Consider whether one can ``compute'' with the information given.
\end{frame}
\begin{frame}[fragile]{Example 1: Theory\ldots}
  \label{example1}
  Let $f(x;\theta)$ describe the likelihood of XX under the following
  assumptions.
  \begin{enumerate}
  \item assumption-1
  \item assumption-2
  \end{enumerate}
  Then if we use the following algorithm:
  \begin{enumerate}
  \item step-1
  \item step-2
  \end{enumerate}
  then $\hat{\theta}$ should be $N(0,\hat\sigma^2)$ with the following
  characteristics\ldots
\end{frame}
\begin{frame}
  \frametitle{Can we compute, using this description?}
  Given the information at hand:
  \begin{itemize}
  \item we ought to have a framework for initial coding of the actual
    simulations (test-first!)
  \item the implementation is somewhat clear
  \item we should ask: what theorems have similar assumptions?
  \item we should ask: what theorems have similar conclusions but
    different assumptions?
  \end{itemize}
\end{frame}
\begin{frame}[fragile]{Realizing Theory}
{\small
\begin{verbatim}
(define-theorem my-proposed-theorem
  (:theorem-type '(distribution-properties
                   frequentist
                   likelihood))
  (:assumes '(assumption-1 assumption-2))
  (:likelihood-form
   (defun likelihood (data theta gamma)
     (exponential-family theta gamma)))
  (:compute-by
   '(progn
      (compute-starting-values thetahat gammahat)
      (until (convergence)
        (setf convergence
              (or (step-1 thetahat)
                  (step-2 gammahat))))))
  (:claim (assert
           (and (equal-distribution thetahat 'normal)
                (equal-distribution gammahat 'normal)))))
\end{verbatim}
}
\end{frame}
\begin{frame}[fragile]{It would be nice to have}
\begin{verbatim}
(theorem-veracity 'my-proposed-theorem)
\end{verbatim}
\end{frame}

\begin{frame}[fragile]{and why not...?}
\begin{verbatim}
(when (theorem-veracity
       'my-proposed-theorem)
  (write-paper 'my-proposed-theorem
               :style :JASA
               :output-format
               '(LaTeX MSWord)))
\end{verbatim}
\end{frame}
\begin{frame}{Comments}
  \begin{itemize}
  \item The general problem is very difficult.
  \item Some progress has been made in small areas of basic
    statistics: currently working on linear regression (least-squares
    and Normal-Bayesian) and the $t$-test.
  \item Areas targeted for the medium-term future: resampling methods
    and similar algorithms.
  \end{itemize}
\end{frame}
\begin{frame}
  \frametitle{Example 2: Practice\ldots}
  \label{example2}
  The dataset comes from a series of clinical trials. We model the
  primary endpoint, ``relief'', as a binary random variable. There is
  a random trial effect on relief, as well as on severity, due to
  differences in recruitment and inclusion/exclusion criteria.
\end{frame}

\begin{frame}
  \frametitle{Can we compute, using this description?}
  \begin{itemize}
  \item With a real description of this kind, it is clear what some of
    the potential models for this dataset might be.
  \item It should also be clear how to start thinking of a data
    dictionary for this problem.
  \end{itemize}
\end{frame}
\begin{frame}[fragile]{Can we compute?}
\begin{verbatim}
(dataset-metadata paper-1
  :context 'clinical-trials
  :variables '((relief :model-type dependent
                       :distribution binary)
               (trial :model-type independent
                      :distribution categorical)
               (disease-severity))
  :metadata '(inclusion-criteria
              exclusion-criteria
              recruitment-rate))

(propose-analysis paper-1)
; => '(tables
;      (logistic regression))
\end{verbatim}
\end{frame}
\begin{frame}{Example 3: The Round-trip\ldots}
  \label{example3}
  The first two examples describe ``ideas $\rightarrow$ code''.

  Consider the last time you read someone else's implementation of a
  statistical procedure (e.g., R package code). When you read the
  code, could you see:
  \begin{itemize}
  \item the assumptions used?
  \item the algorithm implemented?
  \item practical guidance for when you might select the algorithm
    over others?
  \item practical guidance for when you might select the
    implementation over others?
  \end{itemize}
  These are usually components of any reasonable journal article.
  \textit{(Q: have you actually read an R package that wasn't yours?)}
\end{frame}
\begin{frame}{Exercise left to the reader!}
  (aside: I have been looking at the \textbf{stats} and \textbf{lme4}
  packages recently -- \textit{for me}, very clear numerically, much
  less so statistically)
\end{frame}
\section{Discussion}

\begin{frame}{Conclusion}
  \begin{itemize}
  \item Numerics: linear algebra basics done -- full development
  \item Static graphics: progress being made; we have a partial grid
    solution
  \end{itemize}
\end{frame}

\end{document}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Preliminaries}

\subsection{Context}

\begin{frame}{Goals for this Talk}{(define, strategic approach, justify)}
  \begin{itemize}
  \item To describe the concept of \alert{computable and executable
      statistics}, placing it in a historical context.

  \item To demonstrate that \alert{a research program}
    implemented through simple steps can increase the efficiency of
    statistical computing approaches by clearly describing both:
    \begin{itemize}
    \item numerical characteristics of procedures,
    \item statistical concepts driving them.
    \end{itemize}

  \item To justify that the \alert{approach is worthwhile} and
    represents a staged effort towards \alert{increased use of best
      practices}.
  \end{itemize}
  (unfortunately, the last is still incomplete)
\end{frame}
\begin{frame}{Historical Computing Languages}
  \begin{itemize}
  \item FORTRAN: FORmula TRANslator. The original numerical computing
    language, designed for clean implementation of numerical
    algorithms.
  \item LISP: LISt Processor. Associated with symbolic manipulation,
    AI, and knowledge-based approaches.
  \end{itemize}

  They represent the two general needs of statistical computing,
  which could be summarized as
  \begin{itemize}
  \item algorithms/numerics,
  \item elicitation, communication, and generation of knowledge (``data
    analysis'').
  \end{itemize}
\end{frame}
\begin{frame}{Statistical Computing Environments}
  Past:
  \begin{itemize}
  \item SPSS / BMDP / SAS
  \item S (S, S-PLUS, R)
  \item LispStat (XLispStat, ViSta, ARC, CommonLispStat); QUAIL
  \item XGobi (Orca / GGobi / Statistical Reality Engine)
  \item Minitab
  \item Stata
  \item DataDesk
  \item Augsburg Impressionist series (MANET, \ldots)
  \item Excel
  \end{itemize}
  many others...
\end{frame}
\begin{frame}{How many are left?}
  \begin{itemize}
  \item R
  \item SAS
  \item SPSS
  \item Stata
  \item Minitab
  \item very few others...
  \end{itemize}
  ``R is the Microsoft of the statistical computing world'' -- anonymous.
\end{frame}
\begin{frame}{Selection Pressure}
  \begin{itemize}
  \item the R user population is growing rapidly, fueled by critical
    mass, quality, and value
  \item R is a great system for applied data analysis
  \item R is not such a great system for research into statistical
    computing (backwards compatibility, inertia due to the user
    population)
  \end{itemize}
  There is a need for alternative experiments in which to develop new
  approaches/ideas/concepts.
\end{frame}
\begin{frame}{Philosophically, why Common Lisp?}
  \begin{itemize}
  \item Lisp can cleanly present computational intentions, both
    symbolically and numerically.
  \item Semantics and context are important: well supported by Lisp
    paradigms.
  \item Lisp's parentheses describe singular, multi-scale,
    \alert{complete thoughts}.
  \end{itemize}
\end{frame}
\begin{frame}{Technically, why Common Lisp?}
  \begin{itemize}
  \item an interactive COMPILED language (``R with a compiler'')
  \item CLOS is R's S4 object system ``done right'' (sketched on the
    next slide)
  \item clean semantics: modality and typing can be expressed the way
    one wants
  \item programs are data, data are programs -- leading to, for
    example, ``executable XML''
  \item most modern computing tools available (XML, WWW technologies)
  \end{itemize}
  Common Lisp is very close in usage to how people currently use R
  (mostly interactive, some batch, and a wish for compilation
  efficiency).
\end{frame}
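
%% Added illustration: generic Common Lisp (CLOS) only, to make the
%% "S4 done right" comparison concrete.  The class and method names
%% are invented for this sketch and are not part of CLS.
\begin{frame}[fragile]{CLOS in one slide (sketch)}
  Generic functions dispatch on argument classes, much as S4 methods do:
\begin{verbatim}
(defclass linear-model ()
  ((coefficients :initarg :coefficients
                 :reader coefficients)))

(defgeneric summarize (model)
  (:documentation "Print a short summary of MODEL."))

(defmethod summarize ((model linear-model))
  (format t "coefficients: ~a~%"
          (coefficients model)))

(summarize (make-instance 'linear-model
                          :coefficients '(1.2 -0.4)))
;; prints: coefficients: (1.2 -0.4)
\end{verbatim}
\end{frame}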
\subsection{Background}

\begin{frame}
  \frametitle{Desire: Semantics and Statistics}
  \begin{itemize}
  \item The semantic web (content which is self-descriptive) is an
    interesting and potentially useful idea.

  \item Biological informatics support (GO, Entrez) has allowed for
    precise definitions of concepts in biology.

  \item It is a shame that a field like statistics, requiring such
    precision, has less such support than an imprecise and temporally
    unstable field such as biology\ldots
  \end{itemize}

  How can we express statistical work (research, applied work) so that
  it is both human- and computer-readable (perhaps subject to
  transformations first)?
\end{frame}
% \subsection{Context}

% \begin{frame}{Context}{(where I'm coming from, my ``priors'')}
%   \begin{itemize}
%   \item Pharmaceutical Industry
%   \item Modeling and Simulation uses mathematical models/constructs to
%     record beliefs (biology, pharmacology, clinical science) for
%     explication, clinical team alignment, decision support, and
%     quality.
%   \item My work at Novartis is at the intersection of biomedical
%     informatics, statistics, and mathematical modeling.
%   \item As manager: I need a mix of applications and novel research
%     development to solve our challenges better, faster, more efficiently.
%   \item Data analysis is a specialized approach to computer
%     programming, \alert{different} than applications programming or
%     systems programming.
%   \end{itemize}
% \end{frame}
\subsection{Literate Programming is insufficient}

\begin{frame}{Literate Statistical Practice}
  \begin{enumerate}
  \item Literate Programming applied to data analysis (Rossini, 1997/2001).
  \item Among the \alert{most annoying} techniques to integrate into a
    work-flow if one is not perfectly methodical.
  \item Some tools:
    \begin{itemize}
    \item ESS: supports interactive creation of literate programs.
    \item Sweave: a tool which exemplifies the reporting context;
      odfWeave primarily simplifies reporting.
    \item Roxygen: primarily supports a literate-programming
      documentation style, not a literate data analysis programming
      style.
    \end{itemize}
  \item ROI demonstrated in specialized cases: Bioconductor.
  \item \alert{Usually done after the fact} (final step of the
    work-flow) as a documentation/computational-reproducibility
    technique, rarely integrated into the work-flow itself.
  \end{enumerate}
  Many contributors:
  Knuth, Claerbout, Carey, de Leeuw, Leisch, Gentleman, Temple-Lang,
  \ldots{}
\end{frame}
\begin{frame}
  \frametitle{Literate Programming}
  \framesubtitle{Why isn't it enough for Data Analysis?}
  Only two contexts: (executable) code and documentation. Fine for
  application programming, but for data analysis we could benefit
  from:
  \begin{itemize}
  \item classification of statistical procedures
  \item descriptions of assumptions
  \item pragmatic recommendations
  \item inheritance of structure through the work-flow of a
    statistical methodology or data analysis project
  \item datasets and metadata
  \end{itemize}
  Concept: ontologies describing mathematical assumptions, applications
  of methods, work-flow, and statistical data structures can enable
  machine communication (i.e.\ an informatics framework \`a la
  biology). A sketch of what this could look like follows on the next
  slide.
\end{frame}
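
%% Added hypothetical sketch: there is no ANALYSIS-STEP form in CLS
%% today.  It only illustrates, as a plain s-expression, the kind of
%% structure listed on the previous slide (in the spirit of the
%% dataset-metadata and define-theorem forms shown earlier).
\begin{frame}[fragile]{What extra structure might look like (sketch)}
  A data-analysis step annotated with the pieces listed above,
  written as an ordinary s-expression (hypothetical notation):
\begin{verbatim}
;; hypothetical form -- not implemented in CLS
(analysis-step two-group-comparison
  :procedure-class '(inference frequentist t-test)
  :assumptions     '(independent-observations
                     approximate-normality)
  :recommendation  "check variance homogeneity first"
  :data            'trial-dataset
  :metadata        '(units inclusion-criteria))
\end{verbatim}
\end{frame}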
\begin{frame}{Communication in Statistical Practice}{\ldots is essential for \ldots}
  \begin{itemize}
  \item finding
  \item explanations
  \item agreement
  \item receiving information
  \end{itemize}
  \alert{``machine-readable'' communication/computation lets the
    computer help} \\
  Semantic Web is about ``machine-enabled computability''.
\end{frame}
\begin{frame}
  \frametitle{Semantics}
  \framesubtitle{One definition: description and context}
  Interoperability is the key, with respect to
  \begin{itemize}
  \item ``Finding things''
  \item Applications and activities with related functionality
    \begin{itemize}
    \item moving information from one state to another (paper, journal
      article, computer program)
    \item computer programs which implement solutions to similar tasks
    \end{itemize}
  \end{itemize}
\end{frame}
\begin{frame}{Statistical Practice is somewhat restricted}
  {...but in a good sense, enabling potential for semantics...}
  There is a restricted set of intended actions for what can be done
  -- the critical goal is to make a difference by accelerating
  activities that should be ``computable'':
  \begin{itemize}
  \item restricted natural language processing
  \item mathematical translation
  \item common description of activities for simpler programming/data
    analysis (the S approach to objects and methods)
  \end{itemize}
  R is a good basic start (model formulation approach, simple
  ``programming with data'' paradigm); we should see if we can do
  better!
\end{frame}
\begin{frame}{Computable and Executable Statistics requires}
  \begin{itemize}
  \item approaches to describe data and metadata (``data'')
    \begin{itemize}
    \item semantic WWW
    \item metadata management and integration, driving data integration
    \end{itemize}
  \item approaches to describe data analysis methods (``models'')
    \begin{itemize}
    \item quantitatively: many ontologies (AMS, etc.), few meeting
      statistical needs
    \item many substantive fields have implementations
      (bioinformatics, etc.), but these are not well focused on
      statistical needs
    \end{itemize}
  \item approaches to describe the specific form of interaction
    (``instances of models'')
    \begin{itemize}
    \item the original idea behind ``Literate Statistical Analysis''
    \item that idea is suboptimal; more structure is needed (not
      necessarily built upon existing tools...)
    \end{itemize}
  \end{itemize}
\end{frame}
\subsection{Common Lisp Statistics}

\begin{frame}
  \frametitle{Interactive Programming}
  \framesubtitle{Everything goes back to being Lisp-like}
  \begin{itemize}
  \item Interactive programming (as originating with Lisp) works
    extremely well for data analysis (Lisp being the original
    ``programming with data'' language).
  \item Theories/methods for how to do this are reflected in styles
    for using R.
  \end{itemize}
\end{frame}
\begin{frame}[fragile]
  \frametitle{Lisp}
  Lisp (LISt Processor) is different from most high-level computing
  languages, and is very old (1958). Lisp is built on lists of things
  which are evaluatable.
\begin{verbatim}
(functionName data1 data2 data3)
\end{verbatim}
  or ``quoted'':
\begin{verbatim}
'(functionName data1 data2 data3)
\end{verbatim}
  which is shorthand for
\begin{verbatim}
(quote (functionName data1 data2 data3))
\end{verbatim}
  The difference is important -- the second and third are lists of
  data, not (yet?!) a function applied to (an unencapsulated list of)
  data, as the first is.
\end{frame}
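
%% Added illustration: plain Common Lisp, showing the
%% evaluation/quotation distinction from the previous slide.
\begin{frame}[fragile]{Evaluation vs.\ quotation (sketch)}
\begin{verbatim}
(+ 1 2 3)          ; => 6          a function applied to data
'(+ 1 2 3)         ; => (+ 1 2 3)  just a list, not applied
(eval '(+ 1 2 3))  ; => 6          data turned back into a program
\end{verbatim}
\end{frame}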
\begin{frame}
  \frametitle{Features}
  \begin{itemize}
  \item Data and Functions semantically the same
  \item Natural interactive use through functional programming with
    side effects
  \item Batch is a simplification of interactive -- not a special mode!
  \end{itemize}
\end{frame}
\begin{frame}[fragile]{Representation: XML and Lisp}{executing your data}
  Many people are familiar with XML:
\begin{verbatim}
<name phone="+41793674557">Tony Rossini</name>
\end{verbatim}
  which is shorter in Lisp:
\begin{verbatim}
(name "Tony Rossini" :phone "+41613674557")
\end{verbatim}
  \begin{itemize}
  \item Lisp ``parens'', universally hated by unbelievers, are
    wonderful for denoting when a ``concept is complete''.
  \item Why can't your data self-execute?
  \end{itemize}
\end{frame}
\begin{frame}[fragile]{Numerics with Lisp}
  \begin{itemize}
  \item rational numbers and exact arithmetic
  \item example for the mean:
\begin{verbatim}
(defun mean (x)
  (check-type x vector-like)
  (/ (loop for i from 0 to (- (nelts x) 1)
           summing (vref x i))
     (nelts x)))
\end{verbatim}
  \item example for the variance:
\begin{verbatim}
(defun variance (x)
  (let ((meanx (mean x))
        (nm1 (1- (nelts x))))
    (/ (loop for i from 0 to nm1
             summing (expt (- (vref x i) meanx) 2))
       nm1)))
\end{verbatim}
  \item But through macros, \verb+(vref x i)+ could be
    \verb+#V(X[i])+ or your favorite syntax.
  \end{itemize}
\end{frame}
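
%% Added usage sketch: LIST->VECTOR-LIKE appears earlier in this talk;
%% the exact constructor behavior assumed here is an assumption.  The
%% expected values follow from the MEAN/VARIANCE definitions above.
\begin{frame}[fragile]{Using \texttt{mean} and \texttt{variance} (sketch)}
\begin{verbatim}
;; assumes list->vector-like accepts a plain list, as used
;; in the regression example earlier
(defparameter *x*
  (list->vector-like '(2 4 4 4 5 5 7 9)))

(mean *x*)      ; => 5
(variance *x*)  ; => 32/7   (exact rational, sample variance)
\end{verbatim}
\end{frame}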
\begin{frame}{Common Lisp Statistics 1}
  \begin{itemize}
  \item Originally based on LispStat (reusability)
  \item Re-factored structure (some numerics worked with a 1990-era
    code base)
  \item Current activities:
    \begin{enumerate}
    \item numerics redone using CFFI-based BLAS/LAPACK (cl-blapack)
    \item matrix interface based on MatLisp
    \item starting design of a user interface system (interfaces,
      visuals)
    \item general framework for model specification (regression,
      likelihood, ODEs)
    \item general framework for algorithm specification (bootstrap,
      MLE, algorithmic data analysis methods)
    \end{enumerate}
  \end{itemize}
\end{frame}
\begin{frame}{Common Lisp Statistics 2}
  \begin{itemize}
  \item Implemented using SBCL; contributed fixes for
    Clozure/OpenMCL; goal to also target CLISP.
  \item Supports the LispStat prototype object system.
  \item Package-based design -- only use the components you need, or
    the components whose API you like.
  \end{itemize}
\end{frame}
\section{Discussion}

\begin{frame}
  \frametitle{Outlook}
  \begin{itemize}
  \item Semantics and computability have captured a great deal of
    attention in the informatics and business computing R\&D worlds.
  \item Statistically-driven decision making and knowledge discovery
    is, with high likelihood, the next challenging stage after data
    integration.
  \item Statistical practice (theory and application) can be enhanced
    and made more efficient, providing increased benefit to
    organizations and groups using appropriate methods.
  \item Lisp, as a language, shares characteristics of both Latin
    (a difficult dead language useful for classical training) and
    German (a difficult living language useful for general life). Of
    course, for some people, they are not difficult.
  \end{itemize}
\end{frame}
\begin{frame}
  The research program described in this talk is currently driving the
  design of Common Lisp Statistics, which leverages concepts and
  approaches from the dead and moribund LispStat project.
  \begin{itemize}
  \item \url{http://repo.or.cz/w/CommonLispStat.git/}
  \item \url{http://www.github.com/blindglobe/}
  \end{itemize}
\end{frame}
\begin{frame}{Final Comment}
  \begin{itemize}
  \item In the pharma industry, it is all about getting the right
    drugs to the patient faster. Data analysis systems seriously
    impact this process, being potentially an impediment or an
    accelerator.
    \begin{itemize}
    \item \alert{Information technologies can increase the efficiency
        of statistical practice}, though innovation and change
      management must be taken into account (i.e.\ statistical
      practice, while considered by some an ``art form'', can benefit
      from industrialization).
    \item \alert{Lisp's features match the basic requirements we need}
      (dichotomy: programs as data, data as programs). A sales pitch,
      though...
    \item Outlook: lots of work and experimentation to do!
    \end{itemize}
  \end{itemize}
\end{frame}
% % All of the following is optional and typically not needed.
% \appendix

% \section<presentation>*{\appendixname}

% \begin{frame} \frametitle{Complements and Backup}
%   No more, stop here. Questions? (now or later).
% \end{frame}
% \begin{frame}{The Industrial Challenge.}{Getting the Consulting Right.}
%   % - A title should summarize the slide in an understandable fashion
%   %   for anyone how does not follow everything on the slide itself.
%   \begin{itemize}
%   \item Recording assumptions for the next data analyst, reviewer.
%     Use \texttt{itemize} a lot.
%   \item Use very short sentences or short phrases.
%   \end{itemize}
% \end{frame}

% \begin{frame}{The Industrial Challenge.}{Getting the Right Research Fast.}
%   % - A title should summarize the slide in an understandable fashion
%   %   for anyone how does not follow everything on the slide itself.
%   \begin{itemize}
%   \item Use \texttt{itemize} a lot.
%   \item Use very short sentences or short phrases.
%   \end{itemize}
% \end{frame}

% \begin{frame}{Explicating the Work-flow}{QA/QC-based improvements.}
% \end{frame}
% \section{Motivation}

% \subsection{IT Can Speed up Deliverables in Statistical Practice}

% \begin{frame}{Our Generic Work-flow and Life-cycle}
%   {describing most data analytic activities}
%   Workflow:
%   \begin{enumerate}
%   \item Scope out the problem
%   \item Sketch out a potential solution
%   \item Implement until road-blocks appear
%   \item Deliver results
%   \end{enumerate}

%   Lifecycle:
%   \begin{enumerate}
%   \item paper sketch
%   \item 1st e-draft of text/code/data (iterate to \#1, discarding)
%   \item cycle through work
%   \item publish
%   \item ``throw-away''
%   \end{enumerate}
%   but there is valuable information that could enable the next
%   generation!
% \end{frame}

% \begin{frame}[fragile]{Paper $\rightarrow$ Computer $\rightarrow$ Article $\rightarrow$ Computer}{Cut and Paste makes for large errors.}
%   \begin{itemize}
%   \item Problems in a regulatory setting
%   \item Regulatory issues are just ``best practices''
%   \end{itemize}

%   Why do we ``copy/paste'', or analogously, restart our work?

%   pro:
%   \begin{itemize}
%   \item every time we repeat, we reinforce the idea in our brain
%   \item review of ideas can help improve them
%   \end{itemize}
%   con:
%   \begin{itemize}
%   \item inefficiency
%   \item introduction of mistakes
%   \item loss of historical context
%   \item changes to earlier work (on a different development branch)
%     cannot propagate.
%   \end{itemize}
% \end{frame}
% \section{Semantics and Statistical Practice}

% \begin{frame}
%   \frametitle{Statistical Activity Leads to Reports}
%   \framesubtitle{You read what you know, do you understand it?}
%   How can we improve the communication of the ideas we have?
%   Precision of communication?
% \end{frame}

% \begin{frame} \frametitle{Communication Requires Context}
%   \framesubtitle{Intentions imply more than one might like...}
%   \begin{itemize}
%   \item Consideration of what we might do
%   \item Applications with related functionality
%   \end{itemize}
% \end{frame}

% \begin{frame}
%   \frametitle{Design Patterns}
%   \framesubtitle{Supporting Work-flow Transitions}
%   (joint work with H Wickham): The point of this research program is
%   not to describe what to do at any particular stage of work, but to
%   encourage researchers and practitioners to consider the translation
%   and transfer of information between stages so that work is not lost.

%   Examples of stages in a work-flow:
%   \begin{itemize}
%   \item planning, execution, reporting;
%   \item scoping, illustrative examples or counter examples,
%     algorithmic construction, article writing.
%   \item descriptive statistics, preliminary inferential analysis,
%     model/assumption checking, final inferential analysis,
%     communication of scientific results
%   \end{itemize}
%   Description of work-flows is essential to initiating discussions on
%   quality/efficiency of approaches to work.
% \end{frame}

% \section{Design Challenges}

% \begin{frame}
%   \frametitle{Activities are enhanced by support}
%   \begin{itemize}
%   \item Mathematical manipulation can be enhanced by symbolic
%     computation
%   \item Statistical programming can be enabled by examples and related
%     algorithm implementation
%   \item Datasets, to a limited extent, can self-describe.
%   \end{itemize}
% \end{frame}

% \begin{frame}
%   \frametitle{Executable and Computable Science}
%   Use of algorithms and construction to describe how things work.
%   Support for agent-based approaches
% \end{frame}

% \begin{frame}
%   \frametitle{What is Data? Metadata?}
%   Data: what we've observed
%   MetaData: context for observations, enables semantics.
% \end{frame}
% % \begin{frame}[fragile]
% %   \frametitle{Defining Variables}
% %   \framesubtitle{Setting variables}
% % \begin{verbatim}
% % (setq <variable> <value>)
% % \end{verbatim}
% %   Example:
% % \begin{verbatim}
% % (setq ess-source-directory
% %       "/home/rossini/R-src")
% % \end{verbatim}
% % \end{frame}

% % \begin{frame}[fragile]
% %   \frametitle{Defining on the fly}
% % \begin{verbatim}
% % (setq ess-source-directory
% %       (lambda () (file-name-as-directory
% %                   (expand-file-name
% %                    (concat (default-directory)
% %                            ess-suffix "-src")))))
% % \end{verbatim}
% %   (Lambda-expressions are anonymous functions, i.e. ``instant-functions'')
% % \end{frame}

% % \begin{frame}[fragile]
% %   \frametitle{Function Reuse}
% %   By naming the function, we could make the previous example reusable
% %   (if possible):
% % \begin{verbatim}
% % (defun my-src-directory ()
% %   (file-name-as-directory
% %    (expand-file-name
% %     (concat (default-directory)
% %             ess-suffix "-src"))))
% % \end{verbatim}
% %   Example:
% % \begin{verbatim}
% % (setq ess-source-directory (my-src-directory))
% % \end{verbatim}
% % \end{frame}

% % \begin{frame}
% %   \frametitle{Equality Among Packages}
% %   \begin{itemize}
% %   \item more/less equal can be described specifically through
% %     overriding imports.
% %   \end{itemize}
% % \end{frame}
% \subsection<presentation>*{For Further Reading}

% \begin{frame}[allowframebreaks]
%   \frametitle<presentation>{Related Material}
%   \begin{thebibliography}{10}

%   \beamertemplatebookbibitems
%   % Start with overview books.
%   \bibitem{LispStat1990}
%     L.~Tierney.
%     \newblock {\em LispStat}.

%   \beamertemplatearticlebibitems
%   % Followed by interesting articles. Keep the list short.
%   \bibitem{Rossini2001}
%     A.J.~Rossini.
%     \newblock Literate Statistical Practice.
%     \newblock {\em Proceedings of the Conference on Distributed
%       Statistical Computing}, 2001.

%   \bibitem{RossiniLeisch2003}
%     A.J.~Rossini and F.~Leisch.
%     \newblock Literate Statistical Practice.
%     \newblock {\em Technical Report Series, University of Washington
%       Department of Biostatistics}, 2003.

%   \beamertemplatearrowbibitems
%   \bibitem{CLS}
%     Common Lisp Stat, 2008.
%     \newblock \url{http://repo.or.cz/CommonLispStat.git/}

%   \end{thebibliography}
% \end{frame}