From ffcd2027c4deb29a4570836ab27e565ee0e7a629 Mon Sep 17 00:00:00 2001 From: AJ Rossini Date: Mon, 14 Sep 2009 07:55:16 +0200 Subject: [PATCH] file for Vanderbilt talk --- Doc/talks/Rossini-Vandy-Oct2009.tex | 1265 +++++++++++++++++++++++++++++++++++ 1 file changed, 1265 insertions(+) create mode 100644 Doc/talks/Rossini-Vandy-Oct2009.tex diff --git a/Doc/talks/Rossini-Vandy-Oct2009.tex b/Doc/talks/Rossini-Vandy-Oct2009.tex new file mode 100644 index 0000000..20b0fe9 --- /dev/null +++ b/Doc/talks/Rossini-Vandy-Oct2009.tex @@ -0,0 +1,1265 @@ +\documentclass{beamer} + +\mode +{ + \usetheme{classic} + \setbeamercovered{transparent} +} + +\usepackage[english]{babel} +\usepackage[latin1]{inputenc} +\usepackage{times} +\usepackage[T1]{fontenc} +\usepackage{url} + +\title[CLS]{Common Lisp Statistics} +\subtitle{Using History to design better data analysis environments} +\author[Rossini]{Anthony~(Tony)~Rossini} + +\institute[Novartis and University of Washington]{ + Group Head, Modeling and Simulation Statistics\\ + Novartis Pharma AG, Switzerland + \and + Affiliate Assoc Prof, Biomedical and Health Informatics\\ + University of Washington, USA} + +\date[Vanderbilt]{Vanderbilt, Sept 2009, Nashville USA} +\subject{Statistical Computing Environments} + +\begin{document} + +\begin{frame} + \titlepage +\end{frame} + +\section{What Works?} +\label{sec:work} + +\begin{frame}{Is it Vaporware? Not quite} + The follow is possible with the help of the open source Common Lisp + community, who provided most of the packages, tools, and glue. + (Tamas Papp, Raymond Toy, Mark Hoemmomem, and many, many others). + Most of the underlying code was written by others, and ``composed'' + by me. +\end{frame} + +\subsection{Graphics} +\label{sec:work:graphics} + +\begin{frame}{Silly Visualization Example} +\includegraphics[width=3in,height=3in]{/home/tony/test1.png} +\end{frame} + +\begin{frame}[fragile]{How?} +\begin{verbatim} +(defparameter *frame2* + (as-frame (create-xlib-image-context 200 200) + :background-color +white+)) +(bind ((#2A((f1 f2) (f3 f4)) + (split-frame *frame2* + (percent 50) + (percent 50)))) + (defparameter *f1* f1) ; lower left + (defparameter *f2* f2) ; lower right f3 f4 + (defparameter *f3* f3) ; top left f1 f2 + (defparameter *f4* f4)); top right +\end{verbatim} +\end{frame} + +\begin{frame}[fragile]{Functions to Plot} +\begin{verbatim} +(plot-function *f1* #'sin + (interval-of 0 2) + :x-title "x" :y-title "sin(x)") +(plot-function *f2* #'cos (interval-of 0 2) + :x-title "x" :y-title "cos(x)") +(plot-function *f3* #'tan (interval-of 0 2) + :x-title "x" :y-title "tan(x)") +\end{verbatim} +\end{frame} + +\begin{frame}[fragile]{Things to Plot} +\small{ +\begin{verbatim} +(let* ((n 500) + (xs (num-sequence + :from 0 :to 10 :length n)) + (ys (map 'vector + #'(lambda (x) (+ x 8 (random 4.0))) + xs)) + (weights + (replicate #'(lambda () (1+ (random 10))) + n 'fixnum)) + (da (plot-simple *f4* + (interval-of 0 10) + (interval-of 10 20) + :x-title "x" :y-title "y"))) + (draw-symbols da xs ys :weights weights)) +\end{verbatim} +} +\end{frame} + +\begin{frame}[fragile]{Copying existing graphics} + And we generated the figure on the first page by: +\begin{verbatim} +(xlib-image-context-to-png + (context *f1*) + "/home/tony/test1.png") +\end{verbatim} +\end{frame} + +\subsection{Statistical Models} +\label{sec:work:statmod} + +\begin{frame}[fragile]{Linear Regression} +\small{ +\begin{verbatim} +;; Worse than LispStat, wrapping LAPACK's dgelsy: +(defparameter *result1* + (lm (list->vector-like iron) + (list->vector-like absorbtion))) +*result*1 => +((# + 2) + + # + + 13 2) +\end{verbatim} +} +\end{frame} + +\subsection{Data Manip/Mgmt} +\label{sec:work:data} + +\begin{frame}[fragile]{DataFrames} +\small{ +\begin{verbatim} +(defparameter *my-df-1* + (make-instance 'dataframe-array + :storage #2A((1 2 3 4 5) (10 20 30 40 50)) + :doc "This is a boring dataframe-array" + :case-labels (list "x" "y") + :var-labels (list "a" "b" "c" "d" "e"))) + +(xref *my-df-1* 0 0) ; API change in progress + +(setf (xref *my-df-1* 0 0) -1d0) +\end{verbatim} +} +\end{frame} + +\begin{frame}[fragile]{Numerical Matrices} +\small{ +\begin{verbatim} +(defparameter *mat-1* + (make-matrix 3 3 + :initial-contents #2A((2d0 3d0 -4d0) + (3d0 2d0 -4d0) + (4d0 4d0 -5d0)))) + +(xref *mat-1* 2 0) ; => 4d0 ; API change +(setf (xref *mat-1* 2 0) -4d0) + +(defparameter *xv* + (make-vector 4 :type :row + :initial-contents '((1d0 3d0 2d0 4d0)))) +\end{verbatim} +} +\end{frame} + +\begin{frame}[fragile]{Macros make the above tolerable} +\begin{verbatim} +(defparameter *xv* + (make-vector 4 :type :row + :initial-contents '((1d0 3d0 2d0 4d0)))) + +; can use defmacro for the following syntax => + +(make-row-vector *xv* '((1d0 3d0 2d0 4d0))) + +; or reader macros for the following: +#mrv(*xv* '((1d0 3d0 2d0 4d0))) +\end{verbatim} +\end{frame} + +\begin{frame}{Outline} + \tableofcontents +\end{frame} + +\section{Common Lisp Statistics} +\label{sec:CLS} + +\begin{frame}{Why CLS?} + \begin{itemize} + \item a component-based structure for statistical computing + \item Common Lisp provides a simple, \emph{primitive}, syntax + \item Common Lisp provides an amazing number of advanced features + that keep getting reinvented in other languages. + \item Common Lisp has linkages to many amazing features developed in + other languages. + \item ability to leverage non-statisticians interested in computing + technologies (compilers, protocols, interfaces, libraries, + functionality which can be reused for statistical purposes) + \item This is a ``customization'' through packages to support + statistical computing, not a independent language. ``Ala Carte'', + not ``Menu''. + \end{itemize} +\end{frame} + +\subsection{Implementation Plans} +\label{sec:CLS:impl} + + +\begin{frame}{Current Functionality} + \begin{itemize} + \item basic dataframes (similar to R); indexing/slicing API under + development. + \item Basic regression (similar to XLispStat) + \item matrix storage both in foreign and lisp-centric areas. + \item LAPACK (small percentage, increasing), working with both + matrix storage types + \item static graphics (X11) including preliminary grid functionality based + on CAIRO. Generation of PNG files from graphics windows. + \item CSV file support + \item Common Lisp! + \end{itemize} +\end{frame} + +\begin{frame}[fragile]{Computational Environment Supported} + \begin{itemize} + \item Should work on Linux, with recent SBCL versions + \item Definitely works on bleeding edge Debian (unstable). + \item Has worked for weak definitions of ``work'' on 4 different + people's environments (not quite, but sort of requires a + \verb+/home/tony/+ !) + \item Threaded support on threaded lisps (SBCL/CCL, soon CLISP). + But not yet integrated. + \end{itemize} +\end{frame} + +\begin{frame}{Goals} + Short Term + \begin{itemize} + \item Better integration of data structures with statistical routines + (auto-handling with dataframes, rather than manual parsing). + \end{itemize} + Medium/Long Term + \begin{itemize} + \item Support for CLISP (byte-compiled interpreted lisp) and Clozure + Common Lisp (formerly OpenMCL) + \item high-level Front-end API to a number of matrix and numerical + packages and numerical structures ( + \item constraint system for interactive GUIs and graphics + \item full LispStat compatibility (object system partially works; + GUI support coming). + \item Integrated threading via Bordeaux threads (portable CL API package). + \end{itemize} +\end{frame} + +\subsection{Common Lisp} +\label{sec:CLS:lisp} + +\begin{frame}{Common Lisp} + \begin{itemize} + \item Parens provide clear delineation of a \textbf{Complete + Thought} (functional programming with side effects). + \item Lisp-2 (symbols can denote both a separate function and a value) + \item ANSI standard (built by committee, but the committee was + reasonably smart) + \item Many implementations + \item Most implementations are interactive \textbf{compiled} + languages (few are interpreted, and those are usually + byte-compiled). + \item The Original \emph{Programming with Data} Language + (\emph{Programs are Data} and \emph{Data are Executable} also + apply). + \item advanced, powerful, first-class macros (macros functionally + re-write code, allowing for structural clarity and complete + destruction of syntax, should that be reasonable) + \end{itemize} +\end{frame} + +\begin{frame}{Common Lisp Packages} + (They are packages and called packages, not libraries. Some people + can rejoice!) + \begin{itemize} + \item infrastructure enhancement: infix-notation, data structures, + control and flow structures + \item numerics, graphics, GUIs, + \item primitive R to CL compiler (which could also be considered an + object-code compiler for R); 3 interfaces which embed R within CL. + \item Web 2.0 support and reporting facilities (similar to TeX) for PDF. + \end{itemize} + See \url{http://www.common-lisp.net/} and + \url{http://www.cliki.org/}. CLS sources can be found on + \url{http://github.com/blindglobe/} +\end{frame} + +\section{What else about CLS is still Vaporware?} + +\begin{frame}[fragile]{What does NOT work?} + Primarily, the reason that we doing this: + + \textbf{Computable and Executable Statistics} + + (which is the subject of another talk, slides in the backup). But + consider XML: +\begin{verbatim} +accord +\end{verbatim} +becomes +\begin{verbatim} +; data follows keywords... +(car :brand 'honda :engine "4cyl" accord) +\end{verbatim} +\end{frame} + +\section{Discussion} + +\begin{frame}{Conclusion} + Active but slow development, spanning the range of needs: + \begin{itemize} + \item Numerics: Linear algebra basics done -- full development + \item Static graphics: progress being made, have a partial + grid-solution, need interactive graphics + \item LispStat emulation needs to be finished + \item Model specification and unification + \end{itemize} + Related numerical/statistical projects: + \begin{itemize} + \item Incanter : R/LispStat/Omegahat-like system for Clojure (Lisp + on the JVM) + \item FEMLisp : system/workshop for finite-element analysis modeling + using Lisp + \item matlisp/LispLab : LAPACK-based numerical linear algebra packages + \item GSLL : GNU Scientific Library, Lisp interface. + \end{itemize} +\end{frame} + +\begin{frame}{Followup} + I'd be happy to talk with anyone on the following topics: + \begin{itemize} + \item Introduction to Common Lisp + \item support for new statistical programming environment modalities + (subject for another talk). + \item computable and executable statistics (code that explains + itself and can be parsed to generate knowledge about its claims; + ``XML's promise'') + \end{itemize} + and if you are interested in getting involved, or trying it out. +\end{frame} + +\end{document} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + + + +\section{BACKUPS} + + +\section{Common Lisp} + +\begin{frame}[fragile]{Finding out things} + \begin{itemize} + \item CL-NUMLIB + num-sequence :from LOW to: HIGH :length SEQ-LENGTH + seq(from,to,by/length) + \item +\begin{verbatim} +(documentation + 'cl-numlib:num-sequence + 'function) +\end{verbatim} + \item This + \end{itemize} +\end{frame} + +\section{Computable Statistics} + +\begin{frame}{Can we compute with them?} + 3 Examples, of which we only present the first + \begin{itemize} + \item Research. + \item Consulting, Applied Statistics, Scientific Honesty. + \item Reimplementation. + \end{itemize} + Consider whether one can ``compute'' with the information given? + (that is: + \begin{itemize} + \item do we have sufficient information to communicate enough + for the right person to recreate the analysis? + \item have we sufficient clarity to prevent misunderstandings about + intentions and claims? + \end{itemize} + ) +\end{frame} + +\begin{frame}[fragile]{Example 1: Theory\ldots} + \label{example1} + Let $f(x;\theta)$ describe the likelihood of XX under the following + assumptions. + \begin{enumerate} + \item assumption-1 + \item assumption-2 + \end{enumerate} + Then if we use the following algorithm: + \begin{enumerate} + \item step-1 + \item step-2 + \end{enumerate} + then $\hat{\theta}$ should be $N(0,\hat\sigma^2)$ with the following + characteristics\ldots +\end{frame} + +\begin{frame} + \frametitle{Can we compute, using this description?} + Given the information at hand: + \begin{itemize} + \item we ought to have a framework for initial coding for the + actual simulations (test-first!) + \item the implementation is somewhat clear + \item We should ask: what theorems have similar assumptions? + \item We should ask: what theorems have similar conclusions but + different assumptions? + \end{itemize} +\end{frame} + +\begin{frame}[fragile]{Realizing Theory} +\small{ +\begin{verbatim} +(define-theorem my-proposed-theorem + (:theorem-type '(distribution-properties + frequentist + likelihood)) + (:assumes '(assumption-1 assumption-2)) + (:likelihood-form + (defun likelihood (data theta gamma) + (exponential-family theta gamma))) + (:compute-by + '(progn + (compute-starting-values thetahat gammahat) + (until (convergence) + (setf convergence + (or (step-1 thetahat) + (step-2 gammahat)))))) + (:claim (assert + (and (equal-distribution thetahat 'normal) + (equal-distribution gammahat 'normal))))) +\end{verbatim} +} +\end{frame} + +\begin{frame}[fragile]{It would be nice to have} +\begin{verbatim} + (theorem-veracity 'my-proposed-theorem) +\end{verbatim} +\end{frame} + +\begin{frame}[fragile]{and why not...?} +\begin{verbatim} + (when (theorem-veracity + 'my-proposed-theorem) + (write-paper 'my-proposed-theorem + :style :JASA + :output-format + '(LaTeX MSWord))) +\end{verbatim} +\end{frame} + +\begin{frame}{Comments} + \begin{itemize} + \item The general problem is very difficult + \item I'm working on some basic statistical proof of concepts (not + finished): linear regression (LS-based, Normal-bayesian) and the + T-test. + \item Areas targetted for medium-term future: resampling methods and + similar algorithms. + \end{itemize} +\end{frame} + +\begin{frame} + \frametitle{Example 2: Practice\ldots} + \label{example2} + The dataset comes from a series of clinical trials. We model the + primary endpoint, ``relief'', as a binary random variable. There is + a random trial effect on relief as well as severity due to + differences in recruitment and inclusion/exclusion criteria. +\end{frame} + +\begin{frame} + \frametitle{Can we compute, using this description?} + \begin{itemize} + \item With a real such description, it is clear what some of the + potential models might be for this dataset + \item It should be clear how to start thinking of a data dictionary + for this problem. + \end{itemize} +\end{frame} + +\begin{frame}[fragile]{Can we compute?} +\begin{verbatim} + (dataset-metadata paper-1 + :context 'clinical-trials + :variables '((relief :model-type dependent + :distribution binary) + (trial :model-type independent + :distribution categorical) + (disease-severity)) + :metadata '(inclusion-criteria + exclusion-criteria + recruitment-rate)) + (propose-analysis paper-1) + ; => '(tables + ; (logistic regression)) +\end{verbatim} +\end{frame} + +\begin{frame}{Example 3: The Round-trip\ldots} + \label{example3} + The first examples describe ``ideas $\rightarrow$ code'' + + Consider the last time you read someone else's implementation of a + statistical procedure (i.e. R package code). When you read the + code, could you see: + \begin{itemize} + \item the assumptions used? + \item the algorithm implemented? + \item practical guidance for when you might select the algorithm + over others? + \item practical guidance for when you might select the + implementation over others? + \end{itemize} + These are usually components of any reasonable journal article. + \textit{(Q: have you actually read an R package that wasn't yours?)} +\end{frame} + +\begin{frame}{Exercise left to the reader!} + +% (aside: I have been looking at the \textbf{stats} and \textbf{lme4} +% packages recently -- \textit{for me}, very clear numerically, much +% less so statistically) +\end{frame} + + + +\section{Context} + +\begin{frame}{Goals for this Talk}{(define, strategic approach, + justify)} + + \begin{itemize} + \item To describe the concept of \alert{computable and executable + statistics}, placing it in a historical context. + + \item To demonstrate that \alert{a research program} + implemented through simple steps can increase the efficiency of + statistical computing approaches by clearly describing both: + \begin{itemize} + \item numerical characteristics of procedures, + \item statistical concepts driving them. + \end{itemize} + + \item To justify that the \alert{approach is worthwhile} and + represents a staged effort towards \alert{increased use of best + practices}. + \end{itemize} + (unfortunately, the last is still incomplete) +\end{frame} + + +\begin{frame}{Historical Computing Languages} + \begin{itemize} + \item FORTRAN : FORmula TRANslator. Original numerical computing + language, designed for clean implementation of numerical + algorithms + \item LISP : LISt Processor. Associated with symbolic + manipulation, AI, and knowledge approaches + \end{itemize} + + They represent the 2 generalized needs of statistical computing, + which could be summarized as + \begin{itemize} + \item algorithms/numerics, + \item elicitation, communication, and generation of knowledge (``data + analysis'') + \end{itemize} +\end{frame} + +\begin{frame}{Statistical Computing Environments} + + Past: + \begin{itemize} + \item SPSS / BMDP / SAS + \item S ( S, S-PLUS, R) + \item LispStat ( XLispStat, ViSta, ARC , CommonLispStat ) ; QUAIL + \item XGobi (Orca / GGobi / Statistical Reality Engine) + \item MiniTab + \item Stata + \item DataDesk + \item Augsburg Impressionist series (MANET, + \item Excel + \end{itemize} + many others... + +\end{frame} + +\begin{frame}{How many are left?} + + \begin{itemize} + \item R + \item SAS + \item SPSS + \item Stata + \item Minitab + \item very few others... + \end{itemize} + ``R is the Microsoft of the statistical computing world'' -- anonymous. +\end{frame} + +\begin{frame}{Selection Pressure} + \begin{itemize} + \item the R user population is growing rapidly, fueled by critical + mass, quality, and value + \item R is a great system for applied data analysis + \item R is not such a great system for research into statistical + computing (backwards compatibility, inertia due to user population) + \end{itemize} + There is a need for alternative experiments for developing new + approaches/ideas/concepts. +\end{frame} + +\begin{frame}{Philosophically, why Common Lisp?} + Philosophically: + \begin{itemize} + \item Lisp can cleanly present computational intentions, both + symbolically and numerically. + \item Semantics and context are important: well supported by Lisp + paradigms. + \item Lisp's parentheses describe singular, multi-scale, + \alert{complete thoughts}. + \end{itemize} + +\end{frame} + +\begin{frame}{Technically, why Common Lisp?} + \begin{itemize} + \item interactive COMPILED language (``R with a compiler'') + \item CLOS is R's S4 object system ``done right''. + \item clean semantics: modality, typing, can be expressed the way + one wants it. + \item programs are data, data are programs, leading to + \item Most modern computing tools available (XML, WWW technologies) + \item ``executable XML'' + \end{itemize} + Common Lisp is very close in usage to how people currently use R + (mostly interactive, some batch, and a wish for compilation efficiency). +\end{frame} + +\subsection{Background} + +\begin{frame} + \frametitle{Desire: Semantics and Statistics} + \begin{itemize} + \item The semantic web (content which is self-descriptive) is an + interesting and potentially useful idea. + + \item + Biological informatics support (GO, Entrez) has allowed for + precise definitions of concepts in biology. + + \item It is a shame that a field like statistics, requiring such + precision, has less than an imprecise and temporally instable + field such as biology\ldots + \end{itemize} + + How can we express statistical work (research, applied work) which + is both human and computer readable (perhaps subject to + transformations first)? +\end{frame} + + +% \subsection{Context} + +% \begin{frame}{Context}{(where I'm coming from, my ``priors'')} +% \begin{itemize} +% \item Pharmaceutical Industry +% \item Modeling and Simulation uses mathematical models/constructs to +% record beliefs (biology, pharmacology, clinical science) for +% explication, clinical team alignment, decision support, and +% quality. +% \item My work at Novartis is at the intersection of biomedical +% informatics, statistics, and mathematical modeling. +% \item As manager: I need a mix of applications and novel research development to +% solve our challenges better, faster, more efficiently. +% \item Data analysis is a specialized approach to computer +% programming, \alert{different} than applications programming or +% systems programming. +% \end{itemize} +% \end{frame} + + +\subsection{Literate Programming is insufficient} + +\begin{frame}{Literate Statistical Practice.} + \begin{enumerate} + \item Literate Programming applied to data analysis (Rossini, 1997/2001) + \item among the \alert{most annoying} techniques to integrate into + work-flow if one is not perfectly methodological. + \item Some tools: + \begin{itemize} + \item ESS: supports interactive creation of literate programs. + \item Sweave: tool which exemplifies reporting context; odfWeave + primarily simplifies reporting. + \item Roxygen: primarily supports a literate programming + documentation style, not a literate data analysis programming + style. + \end{itemize} + \item ROI demonstrated in specialized cases: BioConductor. + \item \alert{usually done after the fact} (final step of work-flow) + as a documentation/computational reproducibility technique, rarely + integrated into work-flow. + \end{enumerate} + Many contributors: + Knuth, Claerbout, Carey, de Leeuw, Leisch, Gentleman, Temple-Lang, + \ldots{} +\end{frame} + +\begin{frame} + \frametitle{Literate Programming} + \framesubtitle{Why isn't it enough for Data Analysis?} + + Only 2 contexts: (executable) code and documentation. Fine for + application programming, but for data analysis, we could benefit + from: + \begin{itemize} + \item classification of statistical procedures + \item descriptions of assumptions + \item pragmatic recommendations + \item inheritance of structure through the work-flow of a + statistical methodology or data analysis project + \item datasets and metadata + \end{itemize} + Concept: ontologies describing mathematical assumptions, applications + of methods, work-flow, and statistical data structures can enable + machine communication. + + (i.e. informatics framework ala biology) +\end{frame} + + +\begin{frame}{Communication in Statistical Practice}{\ldots is essential for \ldots} + \begin{itemize} + \item finding + \item explanations + \item agreement + \item receiving information + \end{itemize} + \alert{``machine-readable'' communication/computation lets the + computer help} \\ + Semantic Web is about ``machine-enabled computability''. +\end{frame} + +\begin{frame} \frametitle{Semantics} + \framesubtitle{One definition: description and context} + + Interoperability is the key, with respect to + \begin{itemize} + \item ``Finding things'' + \item Applications and activities with related functionality + \begin{itemize} + \item moving information from one state to another (paper, journal + article, computer program) + \item computer programs which implement solutions to similar tasks + \end{itemize} + \end{itemize} +\end{frame} + + +\begin{frame}{Statistical Practice is somewhat restricted} + {...but in a good sense, enabling potential for semantics...} + + There is a restrictable set of intended actions for what can be done + -- the critical goal is to be able to make a difference by + accelerating activities that should be ``computable'': + \begin{itemize} + \item restricted natural language processing + \item mathematical translation + \item common description of activities for simpler programming/data + analysis (S approach to objects and methods) + \end{itemize} + R is a good basic start (model formulation approach, simple + ``programming with data'' paradigm); we should see if we can do + better! +\end{frame} + +\begin{frame}{Computable and Executable Statistics requires} + + \begin{itemize} + \item approaches to describe data and metadata (``data'') + \begin{itemize} + \item semantic WWW + \item metadata management and integration, driving + \item data integration + \end{itemize} + \item approaches to describe data analysis methods (``models'') + \begin{itemize} + \item quantitatively: many ontologies (AMS, etc), few meeting + statistical needs. + \item many substantive fields have implementations + (bioinformatics, etc) but not well focused. + \end{itemize} + \item approaches to describe the specific form of interaction + (``instances of models'') + \begin{itemize} + \item Original idea behind ``Literate Statistical Analysis''. + \item That idea is suboptimal, more structure needed (not + necessarily built upon existing...). + \end{itemize} + \end{itemize} +\end{frame} + +\subsection{Common Lisp Statistics} + +\begin{frame} + \frametitle{Interactive Programming} + \framesubtitle{Everything goes back to being Lisp-like} + \begin{itemize} + \item Interactive programming (as originating with Lisp): works + extremely well for data analysis (Lisp being the original + ``programming with data'' language). + \item Theories/methods for how to do this are reflected in styles + for using R. + \end{itemize} +\end{frame} + +\begin{frame}[fragile] + \frametitle{Lisp} + + Lisp (LISt Processor) is different than most high-level computing + languages, and is very old (1956). Lisp is built on lists of things + which are evaluatable. +\begin{verbatim} +(functionName data1 data2 data3) +\end{verbatim} + or ``quoted'': +\begin{verbatim} +'(functionName data1 data2 data3) +\end{verbatim} + which is shorthand for +\begin{verbatim} +(list functionName data1 data2 data3) +\end{verbatim} + The difference is important -- lists of data (the second/third) are + not (yet?!) functions applied to (unencapsulated lists of) data (the first). +\end{frame} + +\begin{frame} + \frametitle{Features} + \begin{itemize} + \item Data and Functions semantically the same + \item Natural interactive use through functional programming with + side effects + \item Batch is a simplification of interactive -- not a special mode! + \end{itemize} +\end{frame} + + + +\begin{frame}[fragile]{Representation: XML and Lisp}{executing your data} + Many people are familiar with XML: +\begin{verbatim} +Tony Rossini +\end{verbatim} + which is shorter in Lisp: +\begin{verbatim} +(name "Tony Rossini" :phone "+41613674557") +\end{verbatim} + \begin{itemize} + \item Lisp ``parens'', universally hated by unbelievers, are + wonderful for denoting when a ``concept is complete''. + \item Why can't your data self-execute? + \end{itemize} +\end{frame} + +\begin{frame}[fragile]{Numerics with Lisp} + \begin{itemize} + \item addition of rational numbers and arithmetic + \item example for mean +\begin{verbatim} + (defun mean (x) + (checktype x 'vector-like) + (/ (loop for i from 0 to (- (nelts *x*) 1) + summing (vref *x* i)) + (nelts *x*))) +\end{verbatim} + \item example for variance +\begin{verbatim} +(defun variance (x) + (let ((meanx (mean x)) + (nm1 (1- (nelts x)))) + (/ (loop for i from 0 to nm1 + summing (power (- (vref *x* i) meanx) 2) + nm1)))) +\end{verbatim} + \item But through macros, \verb+(vref *x* i)+ could be + \verb+#V(X[i])+ or your favorite syntax. + \end{itemize} + +\end{frame} + + +\begin{frame}{Common Lisp Statistics 1} + \begin{itemize} + \item Originally based on LispStat (reusability) + \item Re-factored structure (some numerics worked with a 1990-era code base). + \item Current activities: + \begin{enumerate} + \item numerics redone using CFFI-based BLAS/LAPLACK (cl-blapack) + \item matrix interface based on MatLisp + \item starting design of a user interface system (interfaces, + visuals). + \item general framework for model specification (regression, + likelihood, ODEs) + \item general framework for algorithm specification (bootstrap, + MLE, algorithmic data anaylsis methods). + \end{enumerate} + \end{itemize} +\end{frame} + +\begin{frame}{Common Lisp Statistics 2} + + \begin{itemize} + \item Implemented using SBCL. Contributed fixes for + Clozure/OpenMCL. Goal to target CLISP + \item Supports LispStat prototype object system + \item Package-based design -- only use the components you need, or + the components whose API you like. + \end{itemize} +\end{frame} + +\section{Discussion} + +\begin{frame} + \frametitle{Outlook} + \begin{itemize} + \item Semantics and Computability have captured a great deal of + attention in the informatics and business computing R\&D worlds + \item Statistically-driven Decision Making and Knowledge Discovery + is, with high likelihood, the next challenging stage after data + integration. + \item Statistical practice (theory and application) can be enhanced, + made more efficient, providing increased benefit to organizations + and groups using appropriate methods. + \item Lisp as a language, shares characteristics of both Latin + (difficult dead language useful for classical training) and German + (difficult living language useful for general life). Of course, + for some people, they are not difficult. + \end{itemize} + +\end{frame} + +\begin{frame} + The research program described in this talk is currently driving the + design of CommonLisp Stat, which leverages concepts and approaches + from the dead and moribund LispStat project. + + \begin{itemize} + \item \url{http://repo.or.cz/w/CommonLispStat.git/} + \item \url{http://www.github.com/blindglobe/} + \end{itemize} + +\end{frame} +\begin{frame}{Final Comment} + + \begin{itemize} + \item In the Pharma industry, it is all about getting the right + drugs to the patient faster. Data analysis systems seriously + impact this process, being potentially an impediment or an + accelerator. + + \begin{itemize} + \item \alert{Information technologies can increase the efficiency + of statistical practice}, though innovation change management + must be taking into account. (i.e. Statistical practice, while + considered by some an ``art form'', can benefit from + industrialization). + \item \alert{Lisp's features match the basic requirements we need} + (dichotomy: programs as data, data as programs). Sales pitch, + though... + \item Outlook: Lots of work and experimentation to do! + \end{itemize} + \end{itemize} +\end{frame} + + +% % All of the following is optional and typically not needed. +% \appendix + + +% \section*{\appendixname} + + +% \begin{frame} \frametitle{Complements and Backup} +% No more, stop here. Questions? (now or later). +% \end{frame} + +% \begin{frame}{The Industrial Challenge.}{Getting the Consulting Right.} +% % - A title should summarize the slide in an understandable fashion +% % for anyone how does not follow everything on the slide itself. + +% \begin{itemize} +% \item Recording assumptions for the next data analyst, reviewer. +% Use \texttt{itemize} a lot. +% \item +% Use very short sentences or short phrases. +% \end{itemize} +% \end{frame} + + +% \begin{frame}{The Industrial Challenge.}{Getting the Right Research Fast.} +% % - A title should summarize the slide in an understandable fashion +% % for anyone how does not follow everything on the slide itself. + +% \begin{itemize} +% \item +% Use \texttt{itemize} a lot. +% \item +% Use very short sentences or short phrases. +% \end{itemize} +% \end{frame} + + +% \begin{frame}{Explicating the Work-flow}{QA/QC-based improvements.} + + +% \end{frame} + +% \section{Motivation} + +% \subsection{IT Can Speed up Deliverables in Statistical Practice} + +% \begin{frame}{Our Generic Work-flow and Life-cycle} +% {describing most data analytic activities} +% Workflow: +% \begin{enumerate} +% \item Scope out the problem +% \item Sketch out a potential solution +% \item Implement until road-blocks appear +% \item Deliver results +% \end{enumerate} + +% Lifecycle: +% \begin{enumerate} +% \item paper sketch +% \item 1st e-draft of text/code/date (iterate to \#1, discarding) +% \item cycle through work +% \item publish +% \item ``throw-away'' +% \end{enumerate} +% but there is valuble information that could enable the next +% generation! +% \end{frame} + +% \begin{frame}[fragile]{Paper $\rightarrow$ Computer $\rightarrow$ Article $\rightarrow$ Computer}{Cut and Paste makes for large errors.} +% \begin{itemize} +% \item Problems in a regulatory setting +% \item Regulatory issues are just ``best practices'' +% \end{itemize} + +% Why do we ``copy/paste'', or analogously, restart our work? + +% pro: +% \begin{itemize} +% \item every time we repeat, we reinforce the idea in our brain +% \item review of ideas can help improve them +% \end{itemize} +% con: +% \begin{itemize} +% \item inefficiency +% \item introduction of mistakes +% \item loss of historical context +% \item changes to earlier work (on a different development branch) +% can not propagate. +% \end{itemize} +% \end{frame} + +% \section{Semantics and Statistical Practice} + + +% \begin{frame} +% \frametitle{Statistical Activity Leads to Reports} +% \framesubtitle{You read what you know, do you understand it?} + +% How can we improve the communication of the ideas we have? + +% Precision of communication? + +% \end{frame} + + + +% \begin{frame} \frametitle{Communication Requires Context} +% \framesubtitle{Intentions imply more than one might like...} + +% \begin{itemize} +% \item Consideration of what we might do +% \item Applications with related functionality +% \end{itemize} +% \end{frame} + + + +% \begin{frame} +% \frametitle{Design Patterns} +% \framesubtitle{Supporting Work-flow Transitions} + +% (joint work with H Wickham): The point of this research program is +% not to describe what to do at any particular stage of work, but to +% encourage researchers and practitioners to consider how the +% translation and transfer of information between stages so that work +% is not lost. + +% Examples of stages in a work-flow: +% \begin{itemize} +% \item planning, execution, reporting; +% \item scoping, illustrative examples or counter examples, algorithmic construction, +% article writing. +% \item descriptive statistics, preliminary inferential analysis, +% model/assumption checking, final inferential analysis, +% communication of scientific results +% \end{itemize} +% Description of work-flows is essential to initiating discussions on +% quality/efficiency of approaches to work. +% \end{frame} + +% \section{Design Challenges} + +% \begin{frame} +% \frametitle{Activities are enhanced by support} + +% \begin{itemize} +% \item Mathematical manipulation can be enhanced by symbolic +% computation +% \item Statistical programming can be enabled by examples and related +% algorithm implementation +% \item Datasets, to a limited extent, can self-describe. +% \end{itemize} +% \end{frame} + +% \begin{frame} +% \frametitle{Executable and Computable Science} + +% Use of algorithms and construction to describe how things work. + +% Support for agent-based approaches +% \end{frame} + + +% \begin{frame} +% \frametitle{What is Data? Metadata?} + +% Data: what we've observed + +% MetaData: context for observations, enables semantics. +% \end{frame} + + + + +% % \begin{frame}[fragile] +% % \frametitle{Defining Variables} +% % \framesubtitle{Setting variables} +% % \begin{verbatim} +% % (setq ) +% % \end{verbatim} +% % Example: +% % \begin{verbatim} +% % (setq ess-source-directory +% % "/home/rossini/R-src") +% % \end{verbatim} +% % \end{frame} + +% % \begin{frame}[fragile] +% % \frametitle{Defining on the fly} +% % \begin{verbatim} +% % (setq ess-source-directory +% % (lambda () (file-name-as-directory +% % (expand-file-name +% % (concat (default-directory) +% % ess-suffix "-src"))))) +% % \end{verbatim} +% % (Lambda-expressions are anonymous functions, i.e. ``instant-functions'') +% % \end{frame} + + +% % \begin{frame}[fragile] +% % \frametitle{Function Reuse} +% % By naming the function, we could make the previous example reusable +% % (if possible): +% % \begin{verbatim} +% % (defun my-src-directory () +% % (file-name-as-directory +% % (expand-file-name +% % (concat (default-directory) +% % ess-suffix "-src")))) +% % \end{verbatim} +% % Example: +% % \begin{verbatim} +% % (setq ess-source-directory (my-src-directory)) +% % \end{verbatim} +% % \end{frame} + + +% % \begin{frame} +% % \frametitle{Equality Among Packages} +% % \begin{itemize} +% % \item more/less equal can be described specifically through +% % overriding imports. +% % \end{itemize} +% % \end{frame} + + +% \subsection*{For Further Reading} + +% \begin{frame}[allowframebreaks] +% \frametitle{Related Material} + +% \begin{thebibliography}{10} + +% \beamertemplatebookbibitems +% % Start with overview books. + +% \bibitem{LispStat1990} +% L.~Tierney +% \newblock {\em LispStat}. + +% \beamertemplatearticlebibitems +% % Followed by interesting articles. Keep the list short. + +% \bibitem{Rossini2001} +% AJ.~Rossini +% \newblock Literate Statistical Practice +% \newblock {\em Proceedings of the Conference on Distributed +% Statistical Computing}, 2001. + +% \bibitem{RossiniLeisch2003} +% AJ.~Rossini and F.~Leisch +% \newblock Literate Statistical Practice +% \newblock {\em Technical Report Series, University of Washington +% Department of Biostatistics}, 2003. + +% \beamertemplatearrowbibitems +% % Followed by interesting articles. Keep the list short. + +% \bibitem{CLS} +% Common Lisp Stat, 2008. +% \newblock \url{http://repo.or.cz/CommonLispStat.git/} + +% \end{thebibliography} +% \end{frame} -- 2.11.4.GIT