From 50b22d8521835101c99346534e629217430364c2 Mon Sep 17 00:00:00 2001 From: AJ Rossini Date: Fri, 13 Mar 2009 19:48:52 +0100 Subject: [PATCH] Final version for talk --- Doc/Rossini-RiceU-2009Mar.tex | 409 +++++++++++++++++++++--------------------- 1 file changed, 208 insertions(+), 201 deletions(-) diff --git a/Doc/Rossini-RiceU-2009Mar.tex b/Doc/Rossini-RiceU-2009Mar.tex index 6ee64b5..5154230 100644 --- a/Doc/Rossini-RiceU-2009Mar.tex +++ b/Doc/Rossini-RiceU-2009Mar.tex @@ -55,18 +55,22 @@ \section{Preliminaries} +\subsection{Context} + \begin{frame}{Goals for this Talk}{(define, strategic approach, justify)} \begin{itemize} \item To describe the concept of \alert{computable and executable - statistics}. + statistics}, placing it in a historical context. - \item To demonstrate that \alert{there exists a research program} - consisting of simple steps which can increase the use of - statistical computing approaches to allow for clear description - not only of the numerical characteristics of procedures, but the - statistical concepts behind them. + \item To demonstrate that \alert{a research program} + implemented through simple steps can increase the efficiency of + statistical computing approaches by clearly describing both: + \begin{itemize} + \item numerical characteristics of procedures, + \item statistical concepts driving them. + \end{itemize} \item To justify that the \alert{approach is worthwhile} and represents a staged effort towards \alert{increased use of best @@ -75,18 +79,77 @@ (unfortunately, the last is still incomplete) \end{frame} + +\begin{frame}{Historical Computing Languages} + \begin{itemize} + \item FORTRAN : FORmula TRANslator. Original numerical computing + language, designed for clean implementation of numerical + algorithms + \item LISP : LISt Processor. 
Associated with symbolic + manipulation, AI, and knowledge approaches + \end{itemize} + + They represent the 2 generalized needs of statistical computing, + which could be summarized as + \begin{itemize} + \item algorithms/numerics, + \item elicitation, communication, and generation of knowledge (``data + analysis'') + \end{itemize} +\end{frame} + +\begin{frame}{Statistical Computing Environments} + + Past: + \begin{itemize} + \item SPSS / BMDP / SAS + \item S ( S, S-PLUS, R) + \item LispStat ( XLispStat, ViSta, ARC , CommonLispStat ) ; QUAIL + \item XGobi (Orca / GGobi / Statistical Reality Engine) + \item MiniTab + \item Stata + \item DataDesk + \item Augsburg Impressionist series (MANET, \ldots) + \item Excel + \end{itemize} + many others... + +\end{frame} + +\begin{frame}{How many are left?} + + \begin{itemize} + \item R + \item SAS + \item SPSS + \item Stata + \item Minitab + \item very few others... + \end{itemize} + ``R is the Microsoft of the statistical computing world'' -- anonymous. +\end{frame} + +\begin{frame}{Selection Pressure} + \begin{itemize} + \item the R user population is growing rapidly, fueled by critical + mass, quality, and value + \item R is a great system for applied data analysis + \item R is not such a great system for research into statistical + computing (backwards compatibility, inertia due to user population) + \end{itemize} + There is a need for alternative experiments for developing new + approaches/ideas/concepts. +\end{frame} + \begin{frame}{Philosophically, why Common Lisp?} Philosophically: \begin{itemize} - \item Lisp as an ancient ``AI'' language; Statistics as ``artificial - intelligence'' (not real intelligence, \alert{humans are too - flawed and inconsistent} for Bayesian work to be anything but - AI). + \item Lisp can cleanly present computational intentions, both + symbolically and numerically. \item Semantics and context are important: well supported by Lisp paradigms.
- \item Lisp's parentheses describe single, multi-scale, - \alert{complete thought}. See \#1 for why that could make it - difficult. + \item Lisp's parentheses describe singular, multi-scale, + \alert{complete thoughts}. \end{itemize} \end{frame} @@ -95,42 +158,23 @@ \begin{itemize} \item interactive COMPILED language (``R with a compiler'') \item CLOS is R's S4 object system ``done right''. - \item clean semantics + \item clean semantics: modality, typing, can be expressed the way + one wants it. \item programs are data, data are programs, leading to \item Most modern computing tools available (XML, WWW technologies) \item ``executable XML'' \end{itemize} + Common Lisp is very close in usage to how people currently use R + (mostly interactive, some batch, and a wish for compilation efficiency). \end{frame} \subsection{Background} -\begin{frame}{Many systems existed concurrently for statistical - computing} - - \begin{itemize} - \item LispStat (ViSta, ARC) - \item SPSS (BMDP) - \item MiniTab - \item Stata - \item SAS - \item Quail - \item XGobi (Orca, GGobi, Statistical Reality Engine) - \item DataDesk - \item R - \item Excel - \end{itemize} -\end{frame} - - \begin{frame} - \frametitle{Semantics and Statistics} + \frametitle{Desire: Semantics and Statistics} \begin{itemize} - \item - There have been many wonderful talks about the semantic web which \\ - \alert{demonstrated its coolness} \\ - while completely \\ - \alert{failed to demonstrate its usefulness}.\\ - This talk follows in the tradition of such giants\ldots{} + \item The semantic web (content which is self-descriptive) is an + interesting and potentially useful idea. 
\item Biological informatics support (GO, Entrez) has allowed for @@ -140,38 +184,44 @@ precision, has less than an imprecise and temporally instable field such as biology\ldots \end{itemize} + + How can we express statistical work (research, applied work) which + is both human and computer readable (perhaps subject to + transformations first)? \end{frame} -\subsection{Context} +% \subsection{Context} -\begin{frame}{Context}{(where I'm coming from, my ``priors'')} - \begin{itemize} - \item Pharmaceutical Industry - \item Modeling and Simulation uses mathematical models/constructs to - record beliefs for explication, clinical team alignment, decision - support, and quality management. - \item My major role at Novartis is to work at the intersection of - biomedical informatics, statistics, and mathematical modeling. - \item I need a mix of applications and novel research development to - solve challenges better, faster, more efficiently. - \item Data analysis is a specialized approach to computer - programming, \alert{different} than applications programming or - systems programming. - \item \alert{Nearly all of the research challenges I face today - existed for me in academia, and vice-versa.} - \end{itemize} -\end{frame} +% \begin{frame}{Context}{(where I'm coming from, my ``priors'')} +% \begin{itemize} +% \item Pharmaceutical Industry +% \item Modeling and Simulation uses mathematical models/constructs to +% record beliefs (biology, pharmacology, clinical science) for +% explication, clinical team alignment, decision support, and +% quality. +% \item My work at Novartis is at the intersection of biomedical +% informatics, statistics, and mathematical modeling. +% \item As manager: I need a mix of applications and novel research development to +% solve our challenges better, faster, more efficiently. +% \item Data analysis is a specialized approach to computer +% programming, \alert{different} than applications programming or +% systems programming. 
+% \end{itemize} +% \end{frame} \section{Computable and Executable Statistics} \begin{frame}{Can we compute with them?} - - For the following examples, consider whether one can ``compute'' - with the information given. + 3 Examples: + \begin{itemize} + \item Research + \item Consulting + \item Reimplementation + \end{itemize} + Consider whether one can ``compute'' with the information given. \end{frame} - \begin{frame}[fragile]{Example 1: Theory\ldots} \label{example1} Let $f(x;\theta)$ describe the likelihood of XX under the following @@ -196,6 +246,9 @@ \item we ought to have a framework for initial coding for the actual simulations (test-first!) \item the implementation is somewhat clear + \item We should ask: what theorems have similar assumptions? + \item We should ask: what theorems have similar conclusions but + different assumptions? \end{itemize} \end{frame} @@ -212,14 +265,14 @@ (exponential-family theta gamma))) (:compute-by '(progn - (compute-starting-values thetahat gammahat + (compute-starting-values thetahat gammahat) (until (convergence) (setf convergence (or (step-1 thetahat) - (step-2 gammahat))))))) + (step-2 gammahat)))))) (:claim (assert - (and (equal-distribution thetahat normal) - (equal-distribution gammahat normal))))) + (and (equal-distribution thetahat 'normal) + (equal-distribution gammahat 'normal))))) \end{verbatim} } \end{frame} @@ -244,9 +297,11 @@ \begin{frame}{Comments} \begin{itemize} \item The general problem is very difficult - \item Some progress has been made in small areas of basic statistics - \item Areas targetted for medium-term future: resampling methods, - likelihood theory and algorithms. + \item Some progress has been made in small areas of basic + statistics: currently working on linear regression (LS-based, + Normal-Bayesian) and the T-test. + \item Areas targeted for medium-term future: resampling methods and + similar algorithms.
\end{itemize} \end{frame} @@ -272,7 +327,7 @@ \begin{frame}[fragile]{Can we compute?} \begin{verbatim} - (dataset paper-1 + (dataset-metadata paper-1 :context 'clinical-trials :variables '((relief :model-type dependent :distribution binary) @@ -282,8 +337,9 @@ :metadata '(inclusion-criteria exclusion-criteria recruitment-rate)) - (propose-analysis paper-1) ; => '(tables - ; (logistic regression)) + (propose-analysis paper-1) + ; => '(tables + ; (logistic regression)) \end{verbatim} \end{frame} @@ -319,7 +375,7 @@ \begin{frame}{Literate Statistical Practice.} \begin{enumerate} - \item Literate Programming applied to data analysis + \item Literate Programming applied to data analysis (Rossini, 1997/2001) \item among the \alert{most annoying} techniques to integrate into work-flow if one is not perfectly methodological. \item Some tools: @@ -336,64 +392,11 @@ as a documentation/computational reproducibility technique, rarely integrated into work-flow. \end{enumerate} - Many contributors to this general theory/approach: - Knuth, Claerbout, de Leeuw, Leisch, Gentleman, Temple-Lang, + Many contributors: + Knuth, Claerbout, Carey, de Leeuw, Leisch, Gentleman, Temple-Lang, \ldots{} \end{frame} -% \begin{frame} -% \frametitle{Literate Programming} -% \framesubtitle{Why is it not enough?} - -% Claim: it isn't -% \begin{enumerate} -% \item used for statistics since mid 90s (Emacs/ESS support in 1997) -% \item active popular use with R (Leisch, 2001) -% \end{enumerate} - -% but it provides a work-flow which is difficult and unnatural for many -% people (no perceived ROI). -% \end{frame} - -\begin{frame}{Related work} - - Mathematica Workbooks for mathematics concepts - \begin{itemize} - \item Mathematical storage and reproducibility, what bout Statistical - Concepts? - \item Not open, but freely reproducible. - \item Some semantics, hopefully this will improve. 
- \end{itemize} - - Electronic Lab Notebooks for data and the data/data analytics - interaction (but not quantitative methodological development). -\end{frame} - -\section{Results/Contribution} - -\subsection{Claims} - -% \begin{frame}{Semantic Web}{How do we communicate "things"?} -% Recall Monday evening talk: What kinds of communication problems can we have? -% \begin{itemize} -% \item I say "reinigung", you say "waschen" -% \item I say "clean", you say "sauber" -% \end{itemize} -% In the context of our work, how do we communicate what we've done? -% \end{frame} - -\begin{frame}{Communication in Statistical Practice}{\ldots is essential for \ldots} - \begin{itemize} - \item finding - \item explanations - \item agreement - \item receiving information - \end{itemize} - \alert{``machine-readable'' communication/computation lets the - computer help} \\ - Semantic Web is about ``machine-enabled computability''. -\end{frame} - \begin{frame} \frametitle{Literate Programming} \framesubtitle{Why isn't it enough for Data Analysis?} @@ -416,6 +419,19 @@ (i.e. informatics framework ala biology) \end{frame} + +\begin{frame}{Communication in Statistical Practice}{\ldots is essential for \ldots} + \begin{itemize} + \item finding + \item explanations + \item agreement + \item receiving information + \end{itemize} + \alert{``machine-readable'' communication/computation lets the + computer help} \\ + Semantic Web is about ``machine-enabled computability''. 
+\end{frame} + \begin{frame} \frametitle{Semantics} \framesubtitle{One definition: description and context} @@ -431,6 +447,7 @@ \end{itemize} \end{frame} + \begin{frame}{Statistical Practice is somewhat restricted} {...but in a good sense, enabling potential for semantics...} @@ -443,33 +460,36 @@ \item common description of activities for simpler programming/data analysis (S approach to objects and methods) \end{itemize} - R is a good primitive start (model formulation approach, simple + R is a good basic start (model formulation approach, simple ``programming with data'' paradigm); we should see if we can do better! \end{frame} +\begin{frame}{Computable and Executable Statistics requires} -% \begin{frame}{Semantics}{Capturing Ideas, Concepts, Proposals.} -% \begin{itemize} -% \item Capturing the historical state and corresponding decisions is -% essential for developing improved approaches. A common problem in -% ``product development'' (stat research, drug development) is -% cycling through the same issues repeatedly. -% \item These should be captured semantically -% \item Conversion of concepts to computable semantics is sensible -% when you need it, difficult without a compelling reasons -% \end{itemize} -% \end{frame} - - -% \begin{frame}{Lowering the bounds to interactive work.} -% \begin{enumerate} -% \item Limitations of object-orientation and information-hiding -% routines: require context in order to keep the context. -% \item Statistical and Data analysis: context is central and obvious. -% \end{enumerate} -% \end{frame} - + \begin{itemize} + \item approaches to describe data and metadata (``data'') + \begin{itemize} + \item semantic WWW + \item metadata management and integration, driving + \item data integration + \end{itemize} + \item approaches to describe data analysis methods (``models'') + \begin{itemize} + \item quantitatively: many ontologies (AMS, etc), few meeting + statistical needs. 
+ \item many substantive fields have implementations + (bioinformatics, etc) but not well focused. + \end{itemize} + \item approaches to describe the specific form of interaction + (``instances of models'') + \begin{itemize} + \item Original idea behind ``Literate Statistical Analysis''. + \item That idea is suboptimal, more structure needed (not + necessarily built upon existing...). + \end{itemize} + \end{itemize} +\end{frame} \subsection{Common Lisp Statistics} @@ -484,7 +504,7 @@ for using R. \end{itemize} \end{frame} - + \begin{frame}[fragile] \frametitle{Lisp} @@ -517,36 +537,6 @@ \end{frame} -\subsection{Current Approach / Implementation} - - - - -\begin{frame}{Computable and Executable Statistics requires} - - \begin{itemize} - \item approaches to describe data and metadata (``data'') - \begin{itemize} - \item semantic WWW - \item metadata management and integration, driving - \item data integration - \end{itemize} - \item approaches to describe data analysis methods (``models'') - \begin{itemize} - \item quantitatively: many ontologies (AMS, etc), few meeting - statistical needs. - \item many substantive fields have implementations - (bioinformatics, etc) but not well focused. - \end{itemize} - \item approaches to describe the specific form of interaction - (``instances of models'') - \begin{itemize} - \item Original idea behind ``Literate Statistical Analysis''. - \item That idea is suboptimal, more structure needed (not - necessarily built upon existing...). - \end{itemize} - \end{itemize} -\end{frame} \begin{frame}[fragile]{Representation: XML and Lisp}{executing your data} Many people are familiar with XML: @@ -564,9 +554,34 @@ \end{itemize} \end{frame} -\begin{frame}{Common Lisp Statistics} - Ross talked about Lisp. I generally agree. 
My current - research program dates back over 3 years, and: +\begin{frame}[fragile]{Numerics with Lisp} + \begin{itemize} + \item addition of rational numbers and arithmetic + \item example for mean +\begin{verbatim} + (defun mean (x) + (checktype x 'vector-like) + (/ (loop for i from 0 to (- (nelts *x*) 1) + summing (vref *x* i)) + (nelts *x*))) +\end{verbatim} + \item example for variance +\begin{verbatim} +(defun variance (x) + (let ((meanx (mean x)) + (nm1 (1- (nelts x)))) + (/ (loop for i from 0 to nm1 + summing (power (- (vref *x* i) meanx) 2)) + nm1))) +\end{verbatim} + \item But through macros, \verb+(vref *x* i)+ could be + \verb+#V(X[i])+ or your favorite syntax. + \end{itemize} + +\end{frame} + + +\begin{frame}{Common Lisp Statistics 1} \begin{itemize} \item Originally based on LispStat (reusability) \item Re-factored structure (some numerics worked with a 1990-era code base). @@ -584,7 +599,7 @@ \end{itemize} \end{frame} -\begin{frame}{Common Lisp Statistics} +\begin{frame}{Common Lisp Statistics 2} \begin{itemize} \item Implemented using SBCL. Contributed fixes for @@ -594,17 +609,6 @@ the components whose API you like. \end{itemize} \end{frame} -\section*{Summary} - -% \begin{frame}{Delivering Better Data Analyses Faster} -% Industrial settings: -% \begin{enumerate} -% \item Pharmaceutical companies -% \item Academic departments -% \item Review-centric organizations (Health Authorities, Regulators) -% \end{enumerate} -% \end{frame} - \section{Discussion} @@ -625,17 +629,20 @@ for some people, they are not difficult. \end{itemize} +\end{frame} + +\begin{frame} The research program described in this talk is currently driving the design of CommonLisp Stat, which leverages concepts and approaches - from the dead and moribund XLisp-Stat project. - - \url{http://repo.or.cz/w/CommonLispStat.git/} + from the dead and moribund LispStat project.
- \url{http://www.github.com/blindglobe/} + \begin{itemize} + \item \url{http://repo.or.cz/w/CommonLispStat.git/} + \item \url{http://www.github.com/blindglobe/} + \end{itemize} \end{frame} - -\begin{frame}{Summary} +\begin{frame}{Final Comment} \begin{itemize} \item In the Pharma industry, it is all about getting the right -- 2.11.4.GIT