Tony's DSC talk, initial useless version
[CommonLispStat.git] / Doc / talks / Rossini-DSC-July2009.tex
blobf1e36fe01b9c44cc910d13def5268b67e0a79d7e
1 \documentclass{beamer}
3 \mode<presentation>
5 \usetheme{classic}
6 \setbeamercovered{transparent}
9 \usepackage[english]{babel}
10 \usepackage[latin1]{inputenc}
11 \usepackage{times}
12 \usepackage[T1]{fontenc}
15 \title[CLS]{Common Lisp Statistics}
16 \subtitle{Using History to design better data analysis environments}
17 \author[Rossini]{Anthony~(Tony)~Rossini}
19 \institute[Novartis and University of Washington] % (optional, but mostly needed)
20 {
21 Group Head, Modeling and Simulation\\
22 Novartis Pharma AG, Switzerland
23 \and
24 Affiliate Assoc Prof, Biomedical and Health Informatics\\
25 University of Washington, USA}
27 \date[DSC2009]{DSC 2009, Copenhagen}
28 \subject{Statistical Computing Environments}
30 \begin{document}
32 \begin{frame}
33 \titlepage
34 \end{frame}
36 \begin{frame}{Outline}
37 \tableofcontents
38 \end{frame}
40 % Structuring a talk is a difficult task and the following structure
41 % may not be suitable. Here are some rules that apply for this
42 % solution:
44 % - Exactly two or three sections (other than the summary).
45 % - At *most* three subsections per section.
46 % - Talk about 30s to 2min per frame. So there should be between about
47 % 15 and 30 frames, all told.
49 % - A conference audience is likely to know very little of what you
50 % are going to talk about. So *simplify*!
51 % - In a 20min talk, getting the main ideas across is hard
52 % enough. Leave out details, even if it means being less precise than
53 % you think necessary.
54 % - If you omit details that are vital to the proof/implementation,
55 % just say so once. Everybody will be happy with that.
57 \section{What Works?}
59 \begin{frame}{Silly Visualization Example}
60 \includegraphics[width=3in,height=3in]{/home/tony/test1.png}
61 \end{frame}
64 \begin{frame}{Linear Regression}
66 \end{frame}
69 \begin{frame}{Descriptives}
71 \end{frame}
73 \begin{frame}{Data Management}
75 \end{frame}
78 \begin{frame}{Workflow Management}
80 \end{frame}
83 \section{The Practical}
84 \label{sec:practice}
87 \begin{frame}[fragile]{Graphics Device}
88 \begin{verbatim}
89 (defparameter *frame2*
90 (as-frame
91 (create-xlib-image-context 200 200)
92 :background-color +white+))
93 (bind ((#2A((f1 f2) (f3 f4))
94 (split-frame *frame2*
95 (percent 50) (percent 50))))
96 (defparameter *f1* f1) ; bottom left
97 (defparameter *f2* f2) ; bottom right 3 4
98 (defparameter *f3* f3) ; top left 1 2
99 (defparameter *f4* f4)) ; top right
100 \end{verbatim}
101 \end{frame}
103 \begin{frame}[fragile]{Functions to Plot}
104 \begin{verbatim}
105 (plot-function *f1* #'sin
106 (interval-of 0 2)
107 :x-title "x" :y-title "sin(x)")
108 (plot-function *f2* #'cos (interval-of 0 2)
109 :x-title "x" :y-title "cos(x)")
110 (plot-function *f3* #'tan (interval-of 0 2)
111 :x-title "x" :y-title "tan(x)")
112 \end{verbatim}
113 \end{frame}
115 \begin{frame}[fragile]{Things to Plot}
116 \begin{verbatim}
117 (let* ((n 500)
118 (xs (num-sequence
119 :from 0 :to 10 :length n))
120 (ys (map 'vector
121 #'(lambda (x) (+ x 8 (random 4.0)))
122 xs))
123 (weights
124 (replicate #'(lambda () (1+ (random 10)))
125 n 'fixnum))
126 (da (plot-simple *f4*
127 (interval-of 0 10) (interval-of 10 20)
128 :x-title "x" :y-title "y")))
129 (draw-symbols da xs ys :weights weights))
130 \end{verbatim}
131 \end{frame}
133 \begin{frame}[fragile]{Copying existing graphics}
134 And we generated the figure on the first page by:
135 \begin{verbatim}
136 (xlib-image-context-to-png
137 (context *f1*)
138 "/home/tony/test1.png")
139 \end{verbatim}
140 \end{frame}
143 \section{Common Lisp Statistics}
145 \begin{frame}{Why CLS?}
146 \begin{itemize}
147 \item a component-based structure for statistical computing
148 \item ability to leverage non-statisticians interested in computing
149 technologies (compilers, protocols, interfaces)
150 \end{itemize}
151 \end{frame}
153 \begin{frame}{Current Functionality}
154 \begin{itemize}
155 \item dataframes (similar to R)
156 \item Basic regression (similar to XLispStat)
157 \item matrix storage both in foreign and lisp-centric areas.
158 \item LAPACK (small percentage, increasing), working with both
159 matrix storage types
160 \item static graphics (X11) including preliminary grid functionality based
161 on CAIRO. Generation of PNG files from graphics windows.
162 \item CSV file support
163 \item Common Lisp!
164 \end{itemize}
165 \end{frame}
167 \begin{frame}{Computational Environment Supported}
168 \begin{itemize}
169 \item Should work on Linux, with recent SBCL versions
170 \item Definitely works on bleeding edge Debian (unstable).
171 \item Has worked on 4 different people's environments (not quite,
172 but sort of requires a \texttt{/home/tony/} !)
173 \item
174 \end{itemize}
175 \end{frame}
177 \begin{frame}{Common Lisp}
178 advanced iteration
179 \end{frame}
182 \begin{frame}[fragile]{Finding out things}
183 \begin{itemize}
184 \item CL-NUMLIB
185 num-sequence :from LOW :to HIGH :length SEQ-LENGTH
186 seq(from,to,by/length)
187 \item
188 \begin{verbatim}
189 (documentation
190 'cl-numlib:num-sequence
191 'function)
192 \end{verbatim}
193 \item This
194 \end{itemize}
195 \end{frame}
198 \section{Computable Statistics}
200 \begin{frame}{Why are we doing this?}
201 Computable and Executable Statistics
202 \end{frame}
204 \begin{frame}{Can we compute with them?}
205 3 Examples:
206 \begin{itemize}
207 \item Research
208 \item Consulting
209 \item Reimplementation
210 \end{itemize}
211 Consider whether one can ``compute'' with the information given?
212 \end{frame}
214 \begin{frame}[fragile]{Example 1: Theory\ldots}
215 \label{example1}
216 Let $f(x;\theta)$ describe the likelihood of XX under the following
217 assumptions.
218 \begin{enumerate}
219 \item assumption-1
220 \item assumption-2
221 \end{enumerate}
222 Then if we use the following algorithm:
223 \begin{enumerate}
224 \item step-1
225 \item step-2
226 \end{enumerate}
227 then $\hat{\theta}$ should be $N(0,\hat\sigma^2)$ with the following
228 characteristics\ldots
229 \end{frame}
231 \begin{frame}
232 \frametitle{Can we compute, using this description?}
233 Given the information at hand:
234 \begin{itemize}
235 \item we ought to have a framework for initial coding for the
236 actual simulations (test-first!)
237 \item the implementation is somewhat clear
238 \item We should ask: what theorems have similar assumptions?
239 \item We should ask: what theorems have similar conclusions but
240 different assumptions?
241 \end{itemize}
242 \end{frame}
244 \begin{frame}[fragile]{Realizing Theory}
245 \small{
246 \begin{verbatim}
247 (define-theorem my-proposed-theorem
248 (:theorem-type '(distribution-properties
249 frequentist
250 likelihood))
251 (:assumes '(assumption-1 assumption-2))
252 (:likelihood-form
253 (defun likelihood (data theta gamma)
254 (exponential-family theta gamma)))
255 (:compute-by
256 '(progn
257 (compute-starting-values thetahat gammahat)
258 (until (convergence)
259 (setf convergence
260 (or (step-1 thetahat)
261 (step-2 gammahat))))))
262 (:claim (assert
263 (and (equal-distribution thetahat 'normal)
264 (equal-distribution gammahat 'normal)))))
265 \end{verbatim}
266 }
267 \end{frame}
269 \begin{frame}[fragile]{It would be nice to have}
270 \begin{verbatim}
271 (theorem-veracity 'my-proposed-theorem)
272 \end{verbatim}
273 \end{frame}
275 \begin{frame}[fragile]{and why not...?}
276 \begin{verbatim}
277 (when (theorem-veracity
278 'my-proposed-theorem)
279 (write-paper 'my-proposed-theorem
280 :style :JASA
281 :output-format
282 '(LaTeX MSWord)))
283 \end{verbatim}
284 \end{frame}
286 \begin{frame}{Comments}
287 \begin{itemize}
288 \item The general problem is very difficult
289 \item Some progress has been made in small areas of basic
290 statistics: currently working on linear regression (LS-based,
291 Normal-Bayesian) and the t-test.
292 \item Areas targeted for medium-term future: resampling methods and
293 similar algorithms.
294 \end{itemize}
296 \end{frame}
298 \begin{frame}
299 \frametitle{Example 2: Practice\ldots}
300 \label{example2}
301 The dataset comes from a series of clinical trials. We model the
302 primary endpoint, ``relief'', as a binary random variable. There is
303 a random trial effect on relief as well as severity due to
304 differences in recruitment and inclusion/exclusion criteria.
305 \end{frame}
307 \begin{frame}
308 \frametitle{Can we compute, using this description?}
309 \begin{itemize}
310 \item With a real such description, it is clear what some of the
311 potential models might be for this dataset
312 \item It should be clear how to start thinking of a data dictionary
313 for this problem.
314 \end{itemize}
315 \end{frame}
317 \begin{frame}[fragile]{Can we compute?}
318 \begin{verbatim}
319 (dataset-metadata paper-1
320 :context 'clinical-trials
321 :variables '((relief :model-type dependent
322 :distribution binary)
323 (trial :model-type independent
324 :distribution categorical)
325 (disease-severity))
326 :metadata '(inclusion-criteria
327 exclusion-criteria
328 recruitment-rate))
329 (propose-analysis paper-1)
330 ; => '(tables
331 ; (logistic regression))
332 \end{verbatim}
333 \end{frame}
335 \begin{frame}{Example 3: The Round-trip\ldots}
336 \label{example3}
337 The first examples describe ``ideas $\rightarrow$ code''
339 Consider the last time you read someone else's implementation of a
340 statistical procedure (i.e. R package code). When you read the
341 code, could you see:
342 \begin{itemize}
343 \item the assumptions used?
344 \item the algorithm implemented?
345 \item practical guidance for when you might select the algorithm
346 over others?
347 \item practical guidance for when you might select the
348 implementation over others?
349 \end{itemize}
350 These are usually components of any reasonable journal article.
351 \textit{(Q: have you actually read an R package that wasn't yours?)}
352 \end{frame}
354 \begin{frame}{Exercise left to the reader!}
356 (aside: I have been looking at the \textbf{stats} and \textbf{lme4}
357 packages recently -- \textit{for me}, very clear numerically, much
358 less so statistically)
359 \end{frame}
363 \section{Discussion}
365 \begin{frame}{Conclusion}
366 \begin{itemize}
367 \item Numerics: Linear algebra basics done -- full development
368 \end{itemize}
369 \end{frame}
372 \end{document}
374 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
377 \section{Preliminaries}
379 \subsection{Context}
381 \begin{frame}{Goals for this Talk}{(define, strategic approach,
382 justify)}
384 \begin{itemize}
385 \item To describe the concept of \alert{computable and executable
386 statistics}, placing it in a historical context.
388 \item To demonstrate that \alert{a research program}
389 implemented through simple steps can increase the efficiency of
390 statistical computing approaches by clearly describing both:
391 \begin{itemize}
392 \item numerical characteristics of procedures,
393 \item statistical concepts driving them.
394 \end{itemize}
396 \item To justify that the \alert{approach is worthwhile} and
397 represents a staged effort towards \alert{increased use of best
398 practices}.
399 \end{itemize}
400 (unfortunately, the last is still incomplete)
401 \end{frame}
404 \begin{frame}{Historical Computing Languages}
405 \begin{itemize}
406 \item FORTRAN : FORmula TRANslator. Original numerical computing
407 language, designed for clean implementation of numerical
408 algorithms
409 \item LISP : LISt Processor. Associated with symbolic
410 manipulation, AI, and knowledge approaches
411 \end{itemize}
413 They represent the 2 generalized needs of statistical computing,
414 which could be summarized as
415 \begin{itemize}
416 \item algorithms/numerics,
417 \item elicitation, communication, and generation of knowledge (``data
418 analysis'')
419 \end{itemize}
420 \end{frame}
422 \begin{frame}{Statistical Computing Environments}
424 Past:
425 \begin{itemize}
426 \item SPSS / BMDP / SAS
427 \item S ( S, S-PLUS, R)
428 \item LispStat ( XLispStat, ViSta, ARC , CommonLispStat ) ; QUAIL
429 \item XGobi (Orca / GGobi / Statistical Reality Engine)
430 \item MiniTab
431 \item Stata
432 \item DataDesk
433 \item Augsburg Impressionist series (MANET, \ldots)
434 \item Excel
435 \end{itemize}
436 many others...
438 \end{frame}
440 \begin{frame}{How many are left?}
442 \begin{itemize}
443 \item R
444 \item SAS
445 \item SPSS
446 \item Stata
447 \item Minitab
448 \item very few others...
449 \end{itemize}
450 ``R is the Microsoft of the statistical computing world'' -- anonymous.
451 \end{frame}
453 \begin{frame}{Selection Pressure}
454 \begin{itemize}
455 \item the R user population is growing rapidly, fueled by critical
456 mass, quality, and value
457 \item R is a great system for applied data analysis
458 \item R is not such a great system for research into statistical
459 computing (backwards compatibility, inertia due to user population)
460 \end{itemize}
461 There is a need for alternative experiments for developing new
462 approaches/ideas/concepts.
463 \end{frame}
465 \begin{frame}{Philosophically, why Common Lisp?}
466 Philosophically:
467 \begin{itemize}
468 \item Lisp can cleanly present computational intentions, both
469 symbolically and numerically.
470 \item Semantics and context are important: well supported by Lisp
471 paradigms.
472 \item Lisp's parentheses describe singular, multi-scale,
473 \alert{complete thoughts}.
474 \end{itemize}
476 \end{frame}
478 \begin{frame}{Technically, why Common Lisp?}
479 \begin{itemize}
480 \item interactive COMPILED language (``R with a compiler'')
481 \item CLOS is R's S4 object system ``done right''.
482 \item clean semantics: modality, typing, can be expressed the way
483 one wants it.
484 \item programs are data, data are programs, leading to
485 \item Most modern computing tools available (XML, WWW technologies)
486 \item ``executable XML''
487 \end{itemize}
488 Common Lisp is very close in usage to how people currently use R
489 (mostly interactive, some batch, and a wish for compilation efficiency).
490 \end{frame}
492 \subsection{Background}
494 \begin{frame}
495 \frametitle{Desire: Semantics and Statistics}
496 \begin{itemize}
497 \item The semantic web (content which is self-descriptive) is an
498 interesting and potentially useful idea.
500 \item
501 Biological informatics support (GO, Entrez) has allowed for
502 precise definitions of concepts in biology.
504 \item It is a shame that a field like statistics, requiring such
505 precision, has less such support than an imprecise and temporally
506 unstable field such as biology\ldots
507 \end{itemize}
509 How can we express statistical work (research, applied work) which
510 is both human and computer readable (perhaps subject to
511 transformations first)?
512 \end{frame}
515 % \subsection{Context}
517 % \begin{frame}{Context}{(where I'm coming from, my ``priors'')}
518 % \begin{itemize}
519 % \item Pharmaceutical Industry
520 % \item Modeling and Simulation uses mathematical models/constructs to
521 % record beliefs (biology, pharmacology, clinical science) for
522 % explication, clinical team alignment, decision support, and
523 % quality.
524 % \item My work at Novartis is at the intersection of biomedical
525 % informatics, statistics, and mathematical modeling.
526 % \item As manager: I need a mix of applications and novel research development to
527 % solve our challenges better, faster, more efficiently.
528 % \item Data analysis is a specialized approach to computer
529 % programming, \alert{different} than applications programming or
530 % systems programming.
531 % \end{itemize}
532 % \end{frame}
535 \subsection{Literate Programming is insufficient}
537 \begin{frame}{Literate Statistical Practice.}
538 \begin{enumerate}
539 \item Literate Programming applied to data analysis (Rossini, 1997/2001)
540 \item among the \alert{most annoying} techniques to integrate into
541 work-flow if one is not perfectly methodical.
542 \item Some tools:
543 \begin{itemize}
544 \item ESS: supports interactive creation of literate programs.
545 \item Sweave: tool which exemplifies reporting context; odfWeave
546 primarily simplifies reporting.
547 \item Roxygen: primarily supports a literate programming
548 documentation style, not a literate data analysis programming
549 style.
550 \end{itemize}
551 \item ROI demonstrated in specialized cases: BioConductor.
552 \item \alert{usually done after the fact} (final step of work-flow)
553 as a documentation/computational reproducibility technique, rarely
554 integrated into work-flow.
555 \end{enumerate}
556 Many contributors:
557 Knuth, Claerbout, Carey, de Leeuw, Leisch, Gentleman, Temple-Lang,
558 \ldots{}
559 \end{frame}
561 \begin{frame}
562 \frametitle{Literate Programming}
563 \framesubtitle{Why isn't it enough for Data Analysis?}
565 Only 2 contexts: (executable) code and documentation. Fine for
566 application programming, but for data analysis, we could benefit
567 from:
568 \begin{itemize}
569 \item classification of statistical procedures
570 \item descriptions of assumptions
571 \item pragmatic recommendations
572 \item inheritance of structure through the work-flow of a
573 statistical methodology or data analysis project
574 \item datasets and metadata
575 \end{itemize}
576 Concept: ontologies describing mathematical assumptions, applications
577 of methods, work-flow, and statistical data structures can enable
578 machine communication.
580 (i.e.\ an informatics framework \`a la biology)
581 \end{frame}
584 \begin{frame}{Communication in Statistical Practice}{\ldots is essential for \ldots}
585 \begin{itemize}
586 \item finding
587 \item explanations
588 \item agreement
589 \item receiving information
590 \end{itemize}
591 \alert{``machine-readable'' communication/computation lets the
592 computer help} \\
593 Semantic Web is about ``machine-enabled computability''.
594 \end{frame}
596 \begin{frame} \frametitle{Semantics}
597 \framesubtitle{One definition: description and context}
599 Interoperability is the key, with respect to
600 \begin{itemize}
601 \item ``Finding things''
602 \item Applications and activities with related functionality
603 \begin{itemize}
604 \item moving information from one state to another (paper, journal
605 article, computer program)
606 \item computer programs which implement solutions to similar tasks
607 \end{itemize}
608 \end{itemize}
609 \end{frame}
612 \begin{frame}{Statistical Practice is somewhat restricted}
613 {...but in a good sense, enabling potential for semantics...}
615 There is a restrictable set of intended actions for what can be done
616 -- the critical goal is to be able to make a difference by
617 accelerating activities that should be ``computable'':
618 \begin{itemize}
619 \item restricted natural language processing
620 \item mathematical translation
621 \item common description of activities for simpler programming/data
622 analysis (S approach to objects and methods)
623 \end{itemize}
624 R is a good basic start (model formulation approach, simple
625 ``programming with data'' paradigm); we should see if we can do
626 better!
627 \end{frame}
629 \begin{frame}{Computable and Executable Statistics requires}
631 \begin{itemize}
632 \item approaches to describe data and metadata (``data'')
633 \begin{itemize}
634 \item semantic WWW
635 \item metadata management and integration, driving
636 \item data integration
637 \end{itemize}
638 \item approaches to describe data analysis methods (``models'')
639 \begin{itemize}
640 \item quantitatively: many ontologies (AMS, etc), few meeting
641 statistical needs.
642 \item many substantive fields have implementations
643 (bioinformatics, etc) but not well focused.
644 \end{itemize}
645 \item approaches to describe the specific form of interaction
646 (``instances of models'')
647 \begin{itemize}
648 \item Original idea behind ``Literate Statistical Analysis''.
649 \item That idea is suboptimal, more structure needed (not
650 necessarily built upon existing...).
651 \end{itemize}
652 \end{itemize}
653 \end{frame}
655 \subsection{Common Lisp Statistics}
657 \begin{frame}
658 \frametitle{Interactive Programming}
659 \framesubtitle{Everything goes back to being Lisp-like}
660 \begin{itemize}
661 \item Interactive programming (as originating with Lisp): works
662 extremely well for data analysis (Lisp being the original
663 ``programming with data'' language).
664 \item Theories/methods for how to do this are reflected in styles
665 for using R.
666 \end{itemize}
667 \end{frame}
669 \begin{frame}[fragile]
670 \frametitle{Lisp}
672 Lisp (LISt Processor) is different than most high-level computing
673 languages, and is very old (1958). Lisp is built on lists of things
674 which are evaluatable.
675 \begin{verbatim}
676 (functionName data1 data2 data3)
677 \end{verbatim}
678 or ``quoted'':
679 \begin{verbatim}
680 '(functionName data1 data2 data3)
681 \end{verbatim}
682 which is shorthand for
683 \begin{verbatim}
684 (quote (functionName data1 data2 data3))
685 \end{verbatim}
686 The difference is important -- lists of data (the second/third) are
687 not (yet?!) functions applied to (unencapsulated lists of) data (the first).
688 \end{frame}
690 \begin{frame}
691 \frametitle{Features}
692 \begin{itemize}
693 \item Data and Functions semantically the same
694 \item Natural interactive use through functional programming with
695 side effects
696 \item Batch is a simplification of interactive -- not a special mode!
697 \end{itemize}
698 \end{frame}
702 \begin{frame}[fragile]{Representation: XML and Lisp}{executing your data}
703 Many people are familiar with XML:
704 \begin{verbatim}
705 <name phone="+41793674557">Tony Rossini</name>
706 \end{verbatim}
707 which is shorter in Lisp:
708 \begin{verbatim}
709 (name "Tony Rossini" :phone "+41793674557")
710 \end{verbatim}
711 \begin{itemize}
712 \item Lisp ``parens'', universally hated by unbelievers, are
713 wonderful for denoting when a ``concept is complete''.
714 \item Why can't your data self-execute?
715 \end{itemize}
716 \end{frame}
718 \begin{frame}[fragile]{Numerics with Lisp}
719 \begin{itemize}
720 \item addition of rational numbers and arithmetic
721 \item example for mean
722 \begin{verbatim}
723 (defun mean (x)
724 (check-type x vector-like)
725 (/ (loop for i from 0 to (- (nelts x) 1)
726 summing (vref x i))
727 (nelts x)))
728 \end{verbatim}
729 \item example for variance
730 \begin{verbatim}
731 (defun variance (x)
732 (let ((meanx (mean x))
733 (nm1 (1- (nelts x))))
734 (/ (loop for i from 0 to nm1
735 summing (power (- (vref x i) meanx) 2))
736 nm1)))
737 \end{verbatim}
738 \item But through macros, \verb+(vref x i)+ could be
739 \verb+#V(X[i])+ or your favorite syntax.
740 \end{itemize}
742 \end{frame}
745 \begin{frame}{Common Lisp Statistics 1}
746 \begin{itemize}
747 \item Originally based on LispStat (reusability)
748 \item Re-factored structure (some numerics worked with a 1990-era code base).
749 \item Current activities:
750 \begin{enumerate}
751 \item numerics redone using CFFI-based BLAS/LAPACK (cl-blapack)
752 \item matrix interface based on MatLisp
753 \item starting design of a user interface system (interfaces,
754 visuals).
755 \item general framework for model specification (regression,
756 likelihood, ODEs)
757 \item general framework for algorithm specification (bootstrap,
758 MLE, algorithmic data analysis methods).
759 \end{enumerate}
760 \end{itemize}
761 \end{frame}
763 \begin{frame}{Common Lisp Statistics 2}
765 \begin{itemize}
766 \item Implemented using SBCL. Contributed fixes for
767 Clozure/OpenMCL. Goal to target CLISP
768 \item Supports LispStat prototype object system
769 \item Package-based design -- only use the components you need, or
770 the components whose API you like.
771 \end{itemize}
772 \end{frame}
774 \section{Discussion}
776 \begin{frame}
777 \frametitle{Outlook}
778 \begin{itemize}
779 \item Semantics and Computability have captured a great deal of
780 attention in the informatics and business computing R\&D worlds
781 \item Statistically-driven Decision Making and Knowledge Discovery
782 is, with high likelihood, the next challenging stage after data
783 integration.
784 \item Statistical practice (theory and application) can be enhanced,
785 made more efficient, providing increased benefit to organizations
786 and groups using appropriate methods.
787 \item Lisp, as a language, shares characteristics of both Latin
788 (difficult dead language useful for classical training) and German
789 (difficult living language useful for general life). Of course,
790 for some people, they are not difficult.
791 \end{itemize}
793 \end{frame}
795 \begin{frame}
796 The research program described in this talk is currently driving the
797 design of CommonLisp Stat, which leverages concepts and approaches
798 from the dead and moribund LispStat project.
800 \begin{itemize}
801 \item \url{http://repo.or.cz/w/CommonLispStat.git/}
802 \item \url{http://www.github.com/blindglobe/}
803 \end{itemize}
805 \end{frame}
806 \begin{frame}{Final Comment}
808 \begin{itemize}
809 \item In the Pharma industry, it is all about getting the right
810 drugs to the patient faster. Data analysis systems seriously
811 impact this process, being potentially an impediment or an
812 accelerator.
814 \begin{itemize}
815 \item \alert{Information technologies can increase the efficiency
816 of statistical practice}, though innovation change management
817 must be taken into account. (i.e. Statistical practice, while
818 considered by some an ``art form'', can benefit from
819 industrialization).
820 \item \alert{Lisp's features match the basic requirements we need}
821 (dichotomy: programs as data, data as programs). Sales pitch,
822 though...
823 \item Outlook: Lots of work and experimentation to do!
824 \end{itemize}
825 \item {\tiny Gratuitous Advert: We are hiring, have student
826 internships (undergrad, grad students), and a visiting faculty
827 program. Talk with me if possibly interested.}
828 \end{itemize}
829 \end{frame}
832 % % All of the following is optional and typically not needed.
833 % \appendix
836 % \section<presentation>*{\appendixname}
839 % \begin{frame} \frametitle{Complements and Backup}
840 % No more, stop here. Questions? (now or later).
841 % \end{frame}
843 % \begin{frame}{The Industrial Challenge.}{Getting the Consulting Right.}
844 % % - A title should summarize the slide in an understandable fashion
845 % % for anyone who does not follow everything on the slide itself.
847 % \begin{itemize}
848 % \item Recording assumptions for the next data analyst, reviewer.
849 % Use \texttt{itemize} a lot.
850 % \item
851 % Use very short sentences or short phrases.
852 % \end{itemize}
853 % \end{frame}
856 % \begin{frame}{The Industrial Challenge.}{Getting the Right Research Fast.}
857 % % - A title should summarize the slide in an understandable fashion
858 % % for anyone who does not follow everything on the slide itself.
860 % \begin{itemize}
861 % \item
862 % Use \texttt{itemize} a lot.
863 % \item
864 % Use very short sentences or short phrases.
865 % \end{itemize}
866 % \end{frame}
869 % \begin{frame}{Explicating the Work-flow}{QA/QC-based improvements.}
872 % \end{frame}
874 % \section{Motivation}
876 % \subsection{IT Can Speed up Deliverables in Statistical Practice}
878 % \begin{frame}{Our Generic Work-flow and Life-cycle}
879 % {describing most data analytic activities}
880 % Workflow:
881 % \begin{enumerate}
882 % \item Scope out the problem
883 % \item Sketch out a potential solution
884 % \item Implement until road-blocks appear
885 % \item Deliver results
886 % \end{enumerate}
888 % Lifecycle:
889 % \begin{enumerate}
890 % \item paper sketch
891 % \item 1st e-draft of text/code/date (iterate to \#1, discarding)
892 % \item cycle through work
893 % \item publish
894 % \item ``throw-away''
895 % \end{enumerate}
896 % but there is valuable information that could enable the next
897 % generation!
898 % \end{frame}
900 % \begin{frame}[fragile]{Paper $\rightarrow$ Computer $\rightarrow$ Article $\rightarrow$ Computer}{Cut and Paste makes for large errors.}
901 % \begin{itemize}
902 % \item Problems in a regulatory setting
903 % \item Regulatory issues are just ``best practices''
904 % \end{itemize}
906 % Why do we ``copy/paste'', or analogously, restart our work?
908 % pro:
909 % \begin{itemize}
910 % \item every time we repeat, we reinforce the idea in our brain
911 % \item review of ideas can help improve them
912 % \end{itemize}
913 % con:
914 % \begin{itemize}
915 % \item inefficiency
916 % \item introduction of mistakes
917 % \item loss of historical context
918 % \item changes to earlier work (on a different development branch)
919 % can not propagate.
920 % \end{itemize}
921 % \end{frame}
923 % \section{Semantics and Statistical Practice}
926 % \begin{frame}
927 % \frametitle{Statistical Activity Leads to Reports}
928 % \framesubtitle{You read what you know, do you understand it?}
930 % How can we improve the communication of the ideas we have?
932 % Precision of communication?
934 % \end{frame}
938 % \begin{frame} \frametitle{Communication Requires Context}
939 % \framesubtitle{Intentions imply more than one might like...}
941 % \begin{itemize}
942 % \item Consideration of what we might do
943 % \item Applications with related functionality
944 % \end{itemize}
945 % \end{frame}
949 % \begin{frame}
950 % \frametitle{Design Patterns}
951 % \framesubtitle{Supporting Work-flow Transitions}
953 % (joint work with H Wickham): The point of this research program is
954 % not to describe what to do at any particular stage of work, but to
955 % encourage researchers and practitioners to consider how the
956 % translation and transfer of information between stages so that work
957 % is not lost.
959 % Examples of stages in a work-flow:
960 % \begin{itemize}
961 % \item planning, execution, reporting;
962 % \item scoping, illustrative examples or counter examples, algorithmic construction,
963 % article writing.
964 % \item descriptive statistics, preliminary inferential analysis,
965 % model/assumption checking, final inferential analysis,
966 % communication of scientific results
967 % \end{itemize}
968 % Description of work-flows is essential to initiating discussions on
969 % quality/efficiency of approaches to work.
970 % \end{frame}
972 % \section{Design Challenges}
974 % \begin{frame}
975 % \frametitle{Activities are enhanced by support}
977 % \begin{itemize}
978 % \item Mathematical manipulation can be enhanced by symbolic
979 % computation
980 % \item Statistical programming can be enabled by examples and related
981 % algorithm implementation
982 % \item Datasets, to a limited extent, can self-describe.
983 % \end{itemize}
984 % \end{frame}
986 % \begin{frame}
987 % \frametitle{Executable and Computable Science}
989 % Use of algorithms and construction to describe how things work.
991 % Support for agent-based approaches
992 % \end{frame}
995 % \begin{frame}
996 % \frametitle{What is Data? Metadata?}
998 % Data: what we've observed
1000 % MetaData: context for observations, enables semantics.
1001 % \end{frame}
1006 % % \begin{frame}[fragile]
1007 % % \frametitle{Defining Variables}
1008 % % \framesubtitle{Setting variables}
1009 % % \begin{verbatim}
1010 % % (setq <variable> <value>)
1011 % % \end{verbatim}
1012 % % Example:
1013 % % \begin{verbatim}
1014 % % (setq ess-source-directory
1015 % % "/home/rossini/R-src")
1016 % % \end{verbatim}
1017 % % \end{frame}
1019 % % \begin{frame}[fragile]
1020 % % \frametitle{Defining on the fly}
1021 % % \begin{verbatim}
1022 % % (setq ess-source-directory
1023 % % (lambda () (file-name-as-directory
1024 % % (expand-file-name
1025 % % (concat (default-directory)
1026 % % ess-suffix "-src")))))
1027 % % \end{verbatim}
1028 % % (Lambda-expressions are anonymous functions, i.e. ``instant-functions'')
1029 % % \end{frame}
1032 % % \begin{frame}[fragile]
1033 % % \frametitle{Function Reuse}
1034 % % By naming the function, we could make the previous example reusable
1035 % % (if possible):
1036 % % \begin{verbatim}
1037 % % (defun my-src-directory ()
1038 % % (file-name-as-directory
1039 % % (expand-file-name
1040 % % (concat (default-directory)
1041 % % ess-suffix "-src"))))
1042 % % \end{verbatim}
1043 % % Example:
1044 % % \begin{verbatim}
1045 % % (setq ess-source-directory (my-src-directory))
1046 % % \end{verbatim}
1047 % % \end{frame}
1050 % % \begin{frame}
1051 % % \frametitle{Equality Among Packages}
1052 % % \begin{itemize}
1053 % % \item more/less equal can be described specifically through
1054 % % overriding imports.
1055 % % \end{itemize}
1056 % % \end{frame}
1059 % \subsection<presentation>*{For Further Reading}
1061 % \begin{frame}[allowframebreaks]
1062 % \frametitle<presentation>{Related Material}
1064 % \begin{thebibliography}{10}
1066 % \beamertemplatebookbibitems
1067 % % Start with overview books.
1069 % \bibitem{LispStat1990}
1070 % L.~Tierney
1071 % \newblock {\em LispStat}.
1073 % \beamertemplatearticlebibitems
1074 % % Followed by interesting articles. Keep the list short.
1076 % \bibitem{Rossini2001}
1077 % AJ.~Rossini
1078 % \newblock Literate Statistical Practice
1079 % \newblock {\em Proceedings of the Conference on Distributed
1080 % Statistical Computing}, 2001.
1082 % \bibitem{RossiniLeisch2003}
1083 % AJ.~Rossini and F.~Leisch
1084 % \newblock Literate Statistical Practice
1085 % \newblock {\em Technical Report Series, University of Washington
1086 % Department of Biostatistics}, 2003.
1088 % \beamertemplatearrowbibitems
1089 % % Followed by interesting articles. Keep the list short.
1091 % \bibitem{CLS}
1092 % Common Lisp Stat, 2008.
1093 % \newblock \url{http://repo.or.cz/CommonLispStat.git/}
1095 % \end{thebibliography}
1096 % \end{frame}