From fccdec5f26ee73f10f5f52161f9f0cbcfe0d159f Mon Sep 17 00:00:00 2001 From: AJ Rossini Date: Wed, 11 Mar 2009 16:45:22 +0100 Subject: [PATCH] Draft of Rice talk, initially, but it's wrong. --- Doc/Rossini-RiceU-2009Mar.tex | 824 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 824 insertions(+) create mode 100644 Doc/Rossini-RiceU-2009Mar.tex diff --git a/Doc/Rossini-RiceU-2009Mar.tex b/Doc/Rossini-RiceU-2009Mar.tex new file mode 100644 index 0000000..5df72de --- /dev/null +++ b/Doc/Rossini-RiceU-2009Mar.tex @@ -0,0 +1,824 @@ +\documentclass{beamer} + +\mode<presentation> +{ + \usetheme{classic} + \setbeamercovered{transparent} +} + +\usepackage[english]{babel} +\usepackage[latin1]{inputenc} +\usepackage{times} +\usepackage[T1]{fontenc} + +\title[CLS]{Common Lisp Statistics} +\subtitle{Using History to design better data analysis environments} +\author[Rossini]{Anthony~(Tony)~Rossini} + +\institute[Novartis and University of Washington] % (optional, but mostly needed) +{ + Group Head, Modeling and Simulation\\ + Novartis Pharma AG, Switzerland + \and + Affiliate Assoc Prof, Biomedical and Health Informatics\\ + University of Washington, USA} + +\date[Rice 09]{Rice, Mar 2009} +\subject{Statistical Computing Environments} + +\begin{document} + +\begin{frame} + \titlepage +\end{frame} + +\begin{frame}{Outline} + \tableofcontents +\end{frame} + +% Structuring a talk is a difficult task and the following structure +% may not be suitable. Here are some rules that apply for this +% solution: + +% - Exactly two or three sections (other than the summary). +% - At *most* three subsections per section. +% - Talk about 30s to 2min per frame. So there should be between about +% 15 and 30 frames, all told. + +% - A conference audience is likely to know very little of what you +% are going to talk about. So *simplify*! +% - In a 20min talk, getting the main ideas across is hard +% enough. Leave out details, even if it means being less precise than +% you think necessary. 
+% - If you omit details that are vital to the proof/implementation, +% just say so once. Everybody will be happy with that. + +\section{Preliminaries} + +\begin{frame}{Goals}{(define, strategic approach, justify)} + + \begin{itemize} + \item To describe the concept of \alert{computable and executable + statistics}. + + \item To demonstrate that \alert{there exists a research program} + consisting of simple steps which is adaptable to a practitioner's + work habits, which is feasible and introduces relatively minimal + disruptive changes. + + \item To justify that the \alert{approach is worthwhile} and + represents a staged effort towards \alert{increased use of best + practices}. + \end{itemize} + (unfortunately, the last is still incomplete) +\end{frame} + +\subsection{Background} + +\begin{frame}{Many systems existed concurrently for statistical + computing} + + \begin{itemize} + \item LispStat (ViSta, ARC) + \item SPSS (BMDP) + \item MiniTab + \item Stata + \item SAS + \item Quail + \item XGobi (Orca, GGobi, Statistical Reality Engine) + \item DataDesk + \item R + \item Excel + \end{itemize} +\end{frame} + +\begin{frame}{Why Lisp?} + \begin{itemize} + \item Lisp as an ancient ``AI'' language; Statistics as ``artificial + intelligence'' (not real intelligence, \alert{humans are too + flawed and inconsistent} for Bayesian work to be anything but + AI). + \item Semantics and context are important: well supported by Lisp + paradigms. + \item Lisp's parentheses describe single, multi-scale, + \alert{complete thought}. See \#1 for why that could make it + difficult. + \end{itemize} + Aside: Common Lisp is the building block for all my current research + projects. 
+\end{frame} + +\begin{frame} + \frametitle{Semantics and Statistics} + \begin{itemize} + \item + There have been many wonderful talks about the semantic web which \\ + \alert{demonstrated its coolness} \\ + while completely \\ + \alert{failed to demonstrate its usefulness}.\\ + This talk follows in the tradition of such giants\ldots{} + + \item + Biological informatics support (GO, Entrez) has allowed for + precise definitions of concepts in biology. + + \item It is a shame that a field like statistics, requiring such + precision, has less than an imprecise and temporally instable + field such as biology\ldots + \end{itemize} +\end{frame} + + +\subsection{Context} + +\begin{frame}{Context}{(where I'm coming from, my ``priors'')} + \begin{itemize} + \item Pharmaceutical Industry + \item Modeling and Simulation uses mathematical models/constructs to + record beliefs for explication, clinical team alignment, decision + support, and quality management. + \item My major role at Novartis is to work at the intersection of + biomedical informatics, statistics, and mathematical modeling. + \item I need a mix of applications and novel research development to + solve challenges better, faster, more efficiently. + \item Data analysis is a specialized approach to computer + programming, \alert{different} than applications programming or + systems programming. + \item \alert{Nearly all of the research challenges I face today + existed for me in academia, and vice-versa.} + \end{itemize} +\end{frame} + +\subsection{Illustrating Computable / Executable Statistics} + +\begin{frame}{Can we compute?} + + For the following examples, the critical question becomes: + \centerline{\alert{Can we compute with it?}} +\end{frame} + + +\begin{frame}[fragile]{Example 1: Theory\ldots} + \label{example1} + Let $f(x;\theta)$ describe the likelihood of XX under the following + assumptions. 
+ \begin{enumerate} + \item assumption 1 + \item assumption 2 + \end{enumerate} + Then if we use the following algorithm: + \begin{enumerate} + \item step 1 + \item step 2 + \end{enumerate} + then $\hat{\theta}$ should be $N(0,\hat\sigma^2)$ with the following + characteristics\ldots +\end{frame} + +\begin{frame} + \frametitle{Can we compute, using this description?} + Given the information at hand: + \begin{itemize} + \item we ought to have a framework for initial coding for the + actual simulations (test-first!) + \item the implementation is somewhat clear + \end{itemize} +\end{frame} + + +\begin{frame} + \frametitle{Example 2: Practice\ldots} + \label{example2} + The dataset comes from a series of clinical trials. We model the + primary endpoint, ``relief'', as a binary random variable. There is a random + trial effect on relief as well as severity due to differences in + recruitment and inclusion/exclusion criteria. +\end{frame} + +\begin{frame} + \frametitle{Can we compute, using this description?} + \begin{itemize} + \item With a real such description, it is clear what some of the + potential models might be for this dataset + \item It should be clear how to start thinking of a data dictionary + for this problem. + \end{itemize} +\end{frame} + +\begin{frame}{Example 3: The Round-trip\ldots} + \label{example3} + The first examples describe ``ideas $\rightarrow$ code'' + + Consider the last time you read someone else's implementation of a + statistical procedure (i.e. R package code). When you read the + code, could you see: + \begin{itemize} + \item the assumptions used? + \item the algorithm implemented? + \item practical guidance for when you might select the algorithm + over others? + \item practical guidance for when you might select the + implementation over others? + \end{itemize} + These are usually components of any reasonable journal article. 
+ \textit{(Q: have you actually read an R package that wasn't yours?)} +\end{frame} + +\section{Motivation} + +\subsection{IT Can Speed up Deliverables in Statistical Practice} + +\begin{frame}{Our Generic Work-flow and Life-cycle} + {describing most data analytic activities} + Workflow: + \begin{enumerate} + \item Scope out the problem + \item Sketch out a potential solution + \item Implement until road-blocks appear + \item Deliver results + \end{enumerate} + + Lifecycle: + \begin{enumerate} + \item paper sketch + \item 1st e-draft of text/code/data (iterate to \#1, discarding) + \item cycle through work + \item publish + \item ``throw-away'' + \end{enumerate} + but there is valuable information that could enable the next + generation! +\end{frame} + +\begin{frame}[fragile]{Paper $\rightarrow$ Computer $\rightarrow$ Article $\rightarrow$ Computer}{Cut and Paste makes for large errors.} + \begin{itemize} + \item Problems in a regulatory setting + \item Regulatory issues are just ``best practices'' + \end{itemize} + + Why do we ``copy/paste'', or analogously, restart our work? + + pro: + \begin{itemize} + \item every time we repeat, we reinforce the idea in our brain + \item review of ideas can help improve them + \end{itemize} + con: + \begin{itemize} + \item inefficiency + \item introduction of mistakes + \item loss of historical context + \item changes to earlier work (on a different development branch) + can not propagate. + \end{itemize} +\end{frame} + +\subsection{Literate Programming is insufficient} + +\begin{frame}{Literate Statistical Practice.} + \begin{enumerate} + \item Literate Programming applied to data analysis + \item among the \alert{most annoying} techniques to integrate into + work-flow if one is not perfectly methodological. + \item Some tools: + \begin{itemize} + \item ESS: supports interactive creation of literate programs. + \item Sweave: tool which exemplifies reporting context; odfWeave + primarily simplifies reporting. 
+ \item Roxygen: primarily supports a literate programming + documentation style, not a literate data analysis programming + style. + \end{itemize} + \item ROI demonstrated in specialized cases: BioConductor. + \item \alert{usually done after the fact} (final step of work-flow) + as a documentation/computational reproducibility technique, rarely + integrated into work-flow. + \end{enumerate} + Many contributors to this general theory/approach: + Knuth, Claerbout, de Leeuw, Leisch, Gentleman, Temple-Lang, + \ldots{} +\end{frame} + +% \begin{frame} +% \frametitle{Literate Programming} +% \framesubtitle{Why is it not enough?} + +% Claim: it isn't +% \begin{enumerate} +% \item used for statistics since mid 90s (Emacs/ESS support in 1997) +% \item active popular use with R (Leisch, 2001) +% \end{enumerate} + +% but it provides a work-flow which is difficult and unnatural for many +% people (no perceived ROI). +% \end{frame} + +\begin{frame}{Related work} + + Mathematica Workbooks for mathematics concepts + \begin{itemize} + \item Mathematical storage and reproducibility, what about Statistical + Concepts? + \item Not open, but freely reproducible. + \item Some semantics, hopefully this will improve. + \end{itemize} + + Electronic Lab Notebooks for data and the data/data analytics + interaction (but not quantitative methodological development). +\end{frame} + +\section{Results/Contribution} + +\subsection{Claims} + +% \begin{frame}{Semantic Web}{How do we communicate "things"?} +% Recall Monday evening talk: What kinds of communication problems can we have? +% \begin{itemize} +% \item I say "reinigung", you say "waschen" +% \item I say "clean", you say "sauber" +% \end{itemize} +% In the context of our work, how do we communicate what we've done? 
+% \end{frame} + +\begin{frame}{Communication in Statistical Practice}{\ldots is essential for \ldots} + \begin{itemize} + \item finding + \item explanations + \item agreement + \item receiving information + \end{itemize} + \alert{``machine-readable'' communication/computation lets the + computer help} \\ + Semantic Web is about ``machine-enabled computability''. +\end{frame} + +\begin{frame} + \frametitle{Literate Programming} + \framesubtitle{Why isn't it enough for Data Analysis?} + + Only 2 contexts: (executable) code and documentation. Fine for + application programming, but for data analysis, we could benefit + from: + \begin{itemize} + \item classification of statistical procedures + \item descriptions of assumptions + \item pragmatic recommendations + \item inheritance of structure through the work-flow of a + statistical methodology or data analysis project + \item datasets and metadata + \end{itemize} + Concept: ontologies describing mathematical assumptions, applications + of methods, work-flow, and statistical data structures can enable + machine communication. + + (i.e. 
informatics framework ala biology) +\end{frame} + +\begin{frame} \frametitle{Semantics} + \framesubtitle{One definition: description and context} + + Interoperability is the key, with respect to + \begin{itemize} + \item ``Finding things'' + \item Applications and activities with related functionality + \begin{itemize} + \item moving information from one state to another (paper, journal + article, computer program) + \item computer programs which implement solutions to similar tasks + \end{itemize} + \end{itemize} +\end{frame} + +\begin{frame}{Statistical Practice is somewhat restricted} + {...but in a good sense, enabling potential for semantics...} + + There is a restrictable set of intended actions for what can be done + -- the critical goal is to be able to make a difference by + accelerating activities that should be ``computable'': + \begin{itemize} + \item restricted natural language processing + \item mathematical translation + \item common description of activities for simpler programming/data + analysis (S approach to objects and methods) + \end{itemize} + R is a good primitive start (model formulation approach, simple + ``programming with data'' paradigm); we should see if we can do + better! +\end{frame} + + +% \begin{frame}{Semantics}{Capturing Ideas, Concepts, Proposals.} +% \begin{itemize} +% \item Capturing the historical state and corresponding decisions is +% essential for developing improved approaches. A common problem in +% ``product development'' (stat research, drug development) is +% cycling through the same issues repeatedly. +% \item These should be captured semantically +% \item Conversion of concepts to computable semantics is sensible +% when you need it, difficult without a compelling reasons +% \end{itemize} +% \end{frame} + + +% \begin{frame}{Lowering the bounds to interactive work.} +% \begin{enumerate} +% \item Limitations of object-orientation and information-hiding +% routines: require context in order to keep the context. 
+% \item Statistical and Data analysis: context is central and obvious. +% \end{enumerate} +% \end{frame} + +\subsection{Current Approach / Implementation} + +\begin{frame}{Computable and Executable Statistics requires} + + \begin{itemize} + \item approaches to describe data and metadata (``data'') + \begin{itemize} + \item semantic WWW + \item metadata management and integration, driving + \item data integration + \end{itemize} + \item approaches to describe data analysis methods (``models'') + \begin{itemize} + \item quantitatively: many ontologies (AMS, etc), few meeting + statistical needs. + \item many substantive fields have implementations + (bioinformatics, etc) but not well focused. + \end{itemize} + \item approaches to describe the specific form of interaction + (``instances of models'') + \begin{itemize} + \item Original idea behind ``Literate Statistical Analysis''. + \item That idea is suboptimal, more structure needed (not + necessarily built upon existing...). + \end{itemize} + \end{itemize} +\end{frame} + +\begin{frame}[fragile]{Representation: XML and Lisp}{executing your data} + Many people are familiar with XML: +\begin{verbatim} +<name phone="+41613674557">Tony Rossini</name> +\end{verbatim} + which is shorter in Lisp: +\begin{verbatim} +(name "Tony Rossini" :phone "+41613674557") +\end{verbatim} + \begin{itemize} + \item Lisp ``parens'', universally hated by unbelievers, are + wonderful for denoting when a ``concept is complete''. + \item Why can't your data self-execute? + \end{itemize} +\end{frame} + +\begin{frame}{Common Lisp Stat.} + Ross talked about Lisp. I generally agree. My current + research program dates back over 3 years, and: + \begin{itemize} + \item Originally based on LispStat (reusability) + \item Re-factored structure (some numerics worked with a 1990-era code base). 
+ \item Current activities: + \begin{enumerate} + \item numerics redone using CFFI-based BLAS/LAPACK (cl-blapack) + \item matrix interface based on MatLisp + \item design of graphics system on-going; constraint system + (Cells) supporting interactivity. + \item general framework for model specification (regression, + likelihood, ODEs) + \end{enumerate} + \end{itemize} +\end{frame} + +\begin{frame}{Common Lisp Stat} + + Source code available! + + (but it is ugly, works only in 10 cases, and changes with my moods). + +\end{frame} +\section*{Summary} + +% \begin{frame}{Delivering Better Data Analyses Faster} +% Industrial settings: +% \begin{enumerate} +% \item Pharmaceutical companies +% \item Academic departments +% \item Review-centric organizations (Health Authorities, Regulators) +% \end{enumerate} +% \end{frame} + +\begin{frame}{Summary} + + \begin{itemize} + \item In the Pharma industry, it is all about getting the right + drugs to the patient faster. Data analysis systems seriously + impact this process, being potentially an impediment or an + accelerator. + + \begin{itemize} + \item \alert{Information technologies can increase the efficiency + of statistical practice}, though innovation change management + must be taken into account. (i.e. Statistical practice, while + considered by some an ``art form'', can benefit from + industrialization). + \item \alert{Lisp's features match the basic requirements we need} + (dichotomy: programs as data, data as programs). Sales pitch, + though... + \item Outlook: Lots of work and experimentation to do! + \end{itemize} + \item {\tiny Gratuitous Advert: We are hiring, have student + internships (undergrad, grad students), and a visiting faculty + program. Talk with me if possibly interested.} + \end{itemize} +\end{frame} + +% All of the following is optional and typically not needed. +\appendix + + +\section*{\appendixname} + + +\begin{frame} \frametitle{Complements and Backup} + No more, stop here. Questions? 
(now or later). +\end{frame} + +\begin{frame}{The Industrial Challenge.}{Getting the Consulting Right.} + % - A title should summarize the slide in an understandable fashion + % for anyone how does not follow everything on the slide itself. + + \begin{itemize} + \item Recording assumptions for the next data analyst, reviewer. + Use \texttt{itemize} a lot. + \item + Use very short sentences or short phrases. + \end{itemize} +\end{frame} + + +\begin{frame}{The Industrial Challenge.}{Getting the Right Research Fast.} + % - A title should summarize the slide in an understandable fashion + % for anyone how does not follow everything on the slide itself. + + \begin{itemize} + \item + Use \texttt{itemize} a lot. + \item + Use very short sentences or short phrases. + \end{itemize} +\end{frame} + + +\begin{frame}{Explicating the Work-flow}{QA/QC-based improvements.} + + +\end{frame} + +\section{Semantics and Statistical Practice} + + +\begin{frame} + \frametitle{Statistical Activity Leads to Reports} + \framesubtitle{You read what you know, do you understand it?} + + How can we improve the communication of the ideas we have? + + Precision of communication? + +\end{frame} + + + +\begin{frame} \frametitle{Communication Requires Context} + \framesubtitle{Intentions imply more than one might like...} + + \begin{itemize} + \item Consideration of what we might do + \item Applications with related functionality + \end{itemize} +\end{frame} + + + +\begin{frame} + \frametitle{Design Patterns} + \framesubtitle{Supporting Work-flow Transitions} + + (joint work with H Wickham): The point of this research program is + not to describe what to do at any particular stage of work, but to + encourage researchers and practitioners to consider how the + translation and transfer of information between stages so that work + is not lost. 
+ + Examples of stages in a work-flow: + \begin{itemize} + \item planning, execution, reporting; + \item scoping, illustrative examples or counter examples, algorithmic construction, + article writing. + \item descriptive statistics, preliminary inferential analysis, + model/assumption checking, final inferential analysis, + communication of scientific results + \end{itemize} + Description of work-flows is essential to initiating discussions on + quality/efficiency of approaches to work. +\end{frame} + +\section{Design Challenges} + +\begin{frame} + \frametitle{Activities are enhanced by support} + + \begin{itemize} + \item Mathematical manipulation can be enhanced by symbolic + computation + \item Statistical programming can be enabled by examples and related + algorithm implementation + \item Datasets, to a limited extent, can self-describe. + \end{itemize} +\end{frame} + +\begin{frame} + \frametitle{Executable and Computable Science} + + Use of algorithms and construction to describe how things work. + + Support for agent-based approaches +\end{frame} + + +\begin{frame} + \frametitle{What is Data? Metadata?} + + Data: what we've observed + + MetaData: context for observations, enables semantics. +\end{frame} + + +\section{Discussion} + +\begin{frame} + \frametitle{Outlook} + \begin{itemize} + \item Semantics and Computability have captured a great deal of + attention in the informatics and business computing R\&D worlds + \item Statistically-driven Decision Making and Knowledge Discovery + is, with high likelihood, the next challenging stage after data + integration. + \item Statistical practice (theory and application) can be enhanced, + made more efficient, providing increased benefit to organizations + and groups using appropriate methods. +% \item Lisp as a language, shares characteristics of both Latin +% (difficult dead language useful for classical training) and German +% (difficult living language useful for general life). 
+% Of course, for some people, they are not difficult. + \end{itemize} + + The research program described in this talk is currently driving the + design of CommonLisp Stat, which leverages concepts and approaches + from the dead and moribund XLisp-Stat project. + + \url{http://repo.or.cz/w/CommonLispStat.git/} +\end{frame} + +\section{Common Lisp Statistics} + +\begin{frame} + \frametitle{Interactive Programming} + \framesubtitle{Everything goes back to being Lisp-like} + \begin{itemize} + \item Interactive programming (as originating with Lisp): works + extremely well for data analysis (Lisp being the original + ``programming with data'' language). + \item Theories/methods for how to do this are reflected in styles + for using R. + \end{itemize} +\end{frame} + +\begin{frame}[fragile] + \frametitle{Lisp} + + Lisp (LISt Processor) is different than most high-level computing + languages, and is very old (1956). Lisp is built on lists of things + which are evaluatable. +\begin{verbatim} +(functionName data1 data2 data3) +\end{verbatim} + or ``quoted'': +\begin{verbatim} +'(functionName data1 data2 data3) +\end{verbatim} + which is shorthand for +\begin{verbatim} +(list functionName data1 data2 data3) +\end{verbatim} + The difference is important -- lists of data (the second/third) are + not (yet?!) functions applied to (unencapsulated lists of) data (the first). +\end{frame} + +\begin{frame} + \frametitle{Features} + \begin{itemize} + \item Data and Functions semantically the same + \item Natural interactive use through functional programming with + side effects + \item Batch is a simplification of interactive -- not a special mode! 
+ \end{itemize} +\end{frame} + + + +% \begin{frame}[fragile] +% \frametitle{Defining Variables} +% \framesubtitle{Setting variables} +% \begin{verbatim} +% (setq ) +% \end{verbatim} +% Example: +% \begin{verbatim} +% (setq ess-source-directory +% "/home/rossini/R-src") +% \end{verbatim} +% \end{frame} + +% \begin{frame}[fragile] +% \frametitle{Defining on the fly} +% \begin{verbatim} +% (setq ess-source-directory +% (lambda () (file-name-as-directory +% (expand-file-name +% (concat (default-directory) +% ess-suffix "-src"))))) +% \end{verbatim} +% (Lambda-expressions are anonymous functions, i.e. ``instant-functions'') +% \end{frame} + + +% \begin{frame}[fragile] +% \frametitle{Function Reuse} +% By naming the function, we could make the previous example reusable +% (if possible): +% \begin{verbatim} +% (defun my-src-directory () +% (file-name-as-directory +% (expand-file-name +% (concat (default-directory) +% ess-suffix "-src")))) +% \end{verbatim} +% Example: +% \begin{verbatim} +% (setq ess-source-directory (my-src-directory)) +% \end{verbatim} +% \end{frame} + + +% \begin{frame} +% \frametitle{Equality Among Packages} +% \begin{itemize} +% \item more/less equal can be described specifically through +% overriding imports. +% \end{itemize} +% \end{frame} + + +\subsection*{For Further Reading} + +\begin{frame}[allowframebreaks] + \frametitle{Related Material} + + \begin{thebibliography}{10} + + \beamertemplatebookbibitems + % Start with overview books. + + \bibitem{LispStat1990} + L.~Tierney + \newblock {\em LispStat}. + + \beamertemplatearticlebibitems + % Followed by interesting articles. Keep the list short. + + \bibitem{Rossini2001} + AJ.~Rossini + \newblock Literate Statistical Practice + \newblock {\em Proceedings of the Conference on Distributed + Statistical Computing}, 2001. 
+ + \bibitem{RossiniLeisch2003} + AJ.~Rossini and F.~Leisch + \newblock Literate Statistical Practice + \newblock {\em Technical Report Series, University of Washington + Department of Biostatistics}, 2003. + + \beamertemplatearrowbibitems + % Followed by interesting articles. Keep the list short. + + \bibitem{CLS} + Common Lisp Stat, 2008. + \newblock \url{http://repo.or.cz/CommonLispStat.git/} + + \end{thebibliography} +\end{frame} + +\end{document} -- 2.11.4.GIT