docs for dataframes and a need for a dataframe-window class
[CommonLispStat.git] / src / data / data-clos.lisp
blobe3f13a5c11022638a6b8803fe6dc7d8a0833a235
1 ;;; -*- mode: lisp -*-
3 ;;; Time-stamp: <2009-04-02 09:33:51 tony>
4 ;;; Creation: <2008-03-12 17:18:42 blindglobe@gmail.com>
5 ;;; File: data-clos.lisp
6 ;;; Author: AJ Rossini <blindglobe@gmail.com>
7 ;;; Copyright: (c)2008, AJ Rossini. BSD, LLGPL, or GPLv2, depending
8 ;;; on how it arrives.
10 ;;; Purpose: Data packaging and access for Common Lisp Statistics.
11 ;;; This redoes dataframe structures in a CLOS based
12 ;;; framework. Currently contains the virtual class
13 ;;; DATAFRAME-LIKE as well as the actual classes
14 ;;; DATAFRAME-ARRAY and DATAFRAME-MATRIXLIKE
16 ;;; What is this talk of 'release'? Klingons do not make software
17 ;;; 'releases'. Our software 'escapes', leaving a bloody trail of
18 ;;; designers and quality assurance people in its wake.
20 (in-package :lisp-stat-data-clos)
22 ;;; No real basis for work, there is a bit of new-ness and R-ness to
23 ;;; this work. In particular, the notion of relation is key and
24 ;;; integral to the analysis. Tables are related and matched vectors,
25 ;;; for example. "column" vectors are related observations (by
26 ;;; measure/recording) while "row" vectors are related readings (by
27 ;;; case)
29 ;;; Relational structure -- can we capture a completely unnormalized
30 ;;; data strucutre to propose possible modeling approaches, and
31 ;;; propose appropriate models and inferential strategies?
33 ;;; So we want a verb-driven API for data collection construction. We
34 ;;; should encode independence or lack of, as possible.
36 ;;; Need to figure out typed vectors. We then map a series of typed
37 ;;; vectors over to tables where columns are equal typed. In a sense,
38 ;;; this is a relation (1-1) of equal-typed arrays. For the most
39 ;;; part, this ends up making the R data.frame into a relational
40 ;;; building block (considering 1-1 mappings using row ID as a
41 ;;; relation). Is this a worthwhile generalization?
43 ;;; verbs vs semantics for DS conversion -- consider the possibily of
44 ;;; how adverbs and verbs relate, where to put which semantically to
45 ;;; allow for general approach.
47 ;;; eg. Kasper's talk on the FUSION collection of parsers.
49 ;;;
50 ;;; Need to consider modification APIs
51 ;;; actions are:
52 ;;; - import
53 ;;; - get/set row names (case names)
54 ;;; - column names (variable names)
55 ;;; - dataset values
56 ;;; - annotation/metadata
57 ;;; - make sure that we do coherency checking in the exported
58 ;;; - functions.
59 ;;; - ...
60 ;;; - reshapeData/reformat/reshapr a reformed version of the dataset (no
61 ;;; additional input).
62 ;;; - either overwriting or not, i.e. with or without copy.
63 ;;; - check consistency of resulting data with metadata and related
64 ;;; data information.
65 ;;; -
68 ;;; Misc Functions (to move into a lisp data manipulation support package)
70 ;; the next two should be merged into a general replicator pattern.
71 (defun gen-seq (n &optional (start 1))
72 "Generates an integer sequence of length N starting at START. Used
73 for indexing."
74 (if (>= n start)
75 (append (gen-seq (- n 1) start) (list n))))
77 (defun repeat-seq (n item)
78 "FIXME: There has to be a better way -- I'm sure of it!
79 (repeat-seq 3 \"d\") ; => (\"d\" \"d\" \"d\")
80 (repeat-seq 3 'd) ; => ('d 'd 'd)"
81 (if (>= n 1)
82 (append (repeat-seq (1- n) item) (list item))))
84 (defun strsym->indexnum (df strsym)
85 "Returns a number indicating the DF column labelled by STRSYM.
86 Probably should be a method dispatching on DATAFRAME-LIKE type."
87 (position strsym (varlabels df)))
89 (defun string->number (str)
90 "Convert a string <str> representing a number to a number. A second value is
91 returned indicating the success of the conversion.
92 Examples:
93 (string->number \"123\") ; => 123 t
94 (string->number \"1.23\") ; => 1.23 t"
95 (let ((*read-eval* nil))
96 (let ((num (read-from-string str)))
97 (values num (numberp num)))))
102 (equal 'testme 'testme)
103 (defparameter *test-pos* 'testme)
104 (position *test-pos* (list 'a 'b 'testme 'c))
105 (position #'(lambda (x) (equal x "testme")) (list "a" "b" "testme" "c"))
106 (position #'(lambda (x) (equal x 1)) (list 2 1 3 4))
110 ;;; abstract dataframe class
112 (defclass dataframe-like (matrix-like)
114 ;; Matrix-like (from lisp-matrix) is basically a rectangular table
115 ;; without storage. We emulate that, and add storage, row/column
116 ;; labels, and within-column-typing.
118 ;; STORE is the storage component. We ignore this in the DATAFRAME-LIKE
119 ;; class, as it is the primary differentiator, driving how access
120 ;; (getting/setting) is done. We create methods depending on the
121 ;; storage component, which access data as appropriate. See
122 ;; DATAFRAME-ARRAY for an example implementation.
123 ;; the rest of this is metadata. In particular, we should find a
124 ;; more flexible, compact way to store this.
125 (case-labels :initform nil
126 :initarg :case-labels
127 :type list
128 :accessor case-labels
129 :documentation "labels used for describing cases (doc
130 metadata), possibly used for merging.")
131 (var-labels :initform nil
132 :initarg :var-labels
133 :type list
134 :accessor var-labels
135 :documentation "Variable names.")
136 (var-types :initform nil
137 :initarg :var-types
138 :type list
139 :accessor var-types
140 :documentation "variable types to ensure fit")
141 (documentation-string :initform nil
142 :initarg :doc
143 :accessor doc-string
144 :documentation "additional information,
145 potentially uncomputable, possibly metadata, about dataframe-like
146 instance."))
147 (:documentation "Abstract class for standard statistical analysis
148 dataset for independent data. Rows are considered
149 to be independent, matching observations. Columns
150 are considered to be type-consistent, match a
151 variable with distribution. inherits from
152 lisp-matrix base MATRIX-LIKE class.
154 DATAFRAME-LIKE is the basic cases by variables
155 framework. Need to embed this within other
156 structures which allow for generalized relations.
157 Goal is to ensure that relations imply and drive
158 the potential for statistical relativeness such as
159 correlation, interference, and similar concepts."))
162 ;;; Generics specialized above matrix-like, particularly for
163 ;;; dataframe-like objects. Need methods for any storage
164 ;;; implementation.
166 (defgeneric dataframe-dimensions (df)
167 (:documentation "")
168 (:method ((df dataframe-like))
169 (error "dispatch on virtual class.")))
171 (defgeneric dataframe-dimension (df index)
172 (:documentation "")
173 (:method ((df dataframe-like) index)
174 (elt (dataframe-dimensions df) index)))
176 (defgeneric dfref (df index1 index2)
177 (:documentation "scalar access with selection of possible return object types.")
178 (:method ((df dataframe-like) index1 index2)
179 (error "Need real class with real storage to reference elements.")))
181 (defgeneric set-dfref (df index1 index2 val)
182 (:documentation "setter for dfref")
183 (:method ((df dataframe-like) index1 index2 val)
184 (error "Need real class with real storage to reference elements.")))
186 (defsetf dfref set-dfref)
188 (defgeneric dfselect (df &key cases vars indices)
189 (:documentation "access to sub-dataframes. Always returns a dataframe.")
190 (:method ((df dataframe-like) &key cases vars indices)
191 (error "Need real class with real storage to reference elements.")))
193 ;;; Specializing on superclasses...
194 ;;; Access and Extraction: implementations needed for any storage
195 ;;; type. But here, just to point out that we've got a specializing
196 ;;; virtual subclass (DATAFRAME-LIKE specializing MATRIX-LIKE).
198 (defmethod nrows ((df dataframe-like))
199 "specializes on inheritance from matrix-like in lisp-matrix."
200 (error "Need implementation; can't dispatch on virtual class DATAFRAME-LIKE."))
202 (defmethod ncols ((df dataframe-like))
203 "specializes on inheritance from matrix-like in lisp-matrix."
204 (error "Need implementation; can't dispatch on virtual class DATAFRAME-LIKE."))
206 ;; Testing consistency/coherency.
208 (defgeneric consistent-dataframe-p (df)
209 (:documentation "methods to check for consistency.")
210 (:method ((df dataframe-like))
211 (and
212 ;; ensure dimensionality
213 (= (length (var-labels df)) (ncols df)) ; array-dimensions (dataset df))
214 (= (length (case-labels df)) (nrows df))
215 ;; when dims sane, check-type for each variable
216 (progn
217 (dotimes (i (nrows df))
218 (dotimes (j (ncols df))
219 ;; below, dfref bombs if not a df-like subclass...
220 (typep (dfref df i j) (nth j (var-types df)))))
221 t))))
225 (defun ensure-consistent-datatable-type (dt lot)
226 "given a datatable and a listoftypes, ensure that the datatble
227 variables are consistent."
228 (destructuring-bind (n p) ;; why use let when we can be cool? Sigh.
229 (array-dimensions dt)
230 (dotimes (i n)
231 (dotimes (j p)
232 (check-type (aref dt i j) (elt lot j))))))
236 ;;; GENERAL FUNCTIONS WHICH DISPATCH ON INTERNAL METHODS OR ARGS
238 ;;; Q: change the following to generic functions and dispatch on
239 ;;; array, matrix, and dataframe? Others?
240 (defun make-labels (initstr num)
241 "generate a list of strings which can be used as labels, i.e. something like
242 '(\"a1\" \"a2\" \"a3\")."
243 (check-type initstr string)
244 (mapcar #'(lambda (x y) (concatenate 'string x y))
245 (repeat-seq num initstr)
246 (mapcar #'(lambda (x) (format nil "~A" x)) (gen-seq num))))
249 (make-labels 'c 2)
250 (make-labels "c" 4)
253 (defun ncase-store (store)
254 (etypecase store
255 (array (array-dimension store 0))
256 (matrix-like (nrows store))))
258 (defun nvars-store (store)
259 (etypecase store
260 (array (array-dimension store 1))
261 (matrix-like (ncols store))))
264 (defun make-dataframe (newdata
265 &key (vartypes nil)
266 (caselabels nil) (varlabels nil)
267 (doc "no docs"))
268 "Helper function to use instead of make-instance to assure
269 construction of proper DF-array."
270 (check-type newdata (or matrix-like array))
271 (check-type caselabels sequence)
272 (check-type varlabels sequence)
273 (check-type doc string)
274 (let ((ncases (ncase-store newdata))
275 (nvars (nvars-store newdata)))
276 (if caselabels (assert (= ncases (length caselabels))))
277 (if varlabels (assert (= nvars (length varlabels))))
278 (let ((newcaselabels (if caselabels
279 caselabels
280 (make-labels "C" ncases)))
281 (newvarlabels (if varlabels
282 varlabels
283 (make-labels "V" nvars))))
284 (etypecase newdata
285 (array
286 (make-instance 'dataframe-array
287 :storage newdata
288 :nrows (length newcaselabels)
289 :ncols (length newvarlabels)
290 :case-labels newcaselabels
291 :var-labels newvarlabels
292 :var-types vartypes))
293 (matrix-like
294 (make-instance 'dataframe-matrixlike
295 :storage newdata
296 :nrows (length newcaselabels)
297 :ncols (length newvarlabels)
298 :case-labels newcaselabels
299 :var-labels newvarlabels
300 :var-types vartypes))))))
303 (make-dataframe #2A((1.2d0 1.3d0) (2.0d0 4.0d0)))
304 (make-dataframe #2A(('a 1) ('b 2)))
305 (dfref (make-dataframe #2A(('a 1) ('b 2))) 0 1)
306 (dfref (make-dataframe #2A(('a 1) ('b 2))) 1 0)
307 (make-dataframe 4) ; ERROR, should we allow?
308 (make-dataframe #2A((4)))
309 (make-dataframe (rand 10 5)) ;; ERROR, but should work!
313 (defun row-order-as-list (ary)
314 "Pull out data in row order into a list."
315 (let ((result (list))
316 (nrows (nth 0 (array-dimensions ary)))
317 (ncols (nth 1 (array-dimensions ary))))
318 (dotimes (i ncols)
319 (dotimes (j nrows)
320 (append result (aref ary i j))))))
322 (defun col-order-as-list (ary)
323 "Pull out data in row order into a list."
324 (let ((result (list))
325 (nrows (nth 0 (array-dimensions ary)))
326 (ncols (nth 1 (array-dimensions ary))))
327 (dotimes (i nrows)
328 (dotimes (j ncols)
329 (append result (aref ary i j))))))
331 (defun transpose-array (ary)
332 "map NxM to MxN."
333 (make-array (reverse (array-dimensions ary))
334 :initial-contents (col-order-as-list ary)))
336 ;;; THE FOLLOWING 2 dual-sets done to provide error checking
337 ;;; possibilities on top of the generic function structure. Not
338 ;;; intended as make-work!
340 (defun varlabels (df)
341 "Variable-name handling for DATAFRAME-LIKE. Needs error checking."
342 (var-labels df))
344 (defun set-varlabels (df vl)
345 "Variable-name handling for DATAFRAME-LIKE. Needs error checking."
346 (if (= (length (var-labels df))
347 (length vl))
348 (setf (var-labels df) vl)
349 (error "wrong size.")))
351 (defsetf varlabels set-varlabels)
353 ;;; Case-name handling for Tables. Needs error checking.
354 (defun caselabels (df)
355 "Case-name handling for DATAFRAME-LIKE. Needs error checking."
356 (case-labels df))
358 (defun set-caselabels (df cl)
359 "Case-name handling for DATAFRAME-LIKE. Needs error checking."
360 (if (= (length (case-labels df))
361 (length cl))
362 (setf (case-labels df) cl)
363 (error "wrong size.")))
365 (defsetf caselabels set-caselabels)
367 ;;;;;;;;;;;; IMPLEMENTATIONS, with appropriate methods.
368 ;; See also:
369 ;; (documentation 'dataframe-like 'type)
371 ;;;;; DATAFRAME-ARRAY
373 (defclass dataframe-array (dataframe-like)
374 ((store :initform nil
375 :initarg :storage
376 :type (array * *)
377 :accessor dataset
378 :documentation "Data storage: typed as array."))
379 (:documentation "example implementation of dataframe-like using storage
380 based on lisp arrays. An obvious alternative could be a
381 dataframe-matrix-like which uses the lisp-matrix classes."))
383 (defmethod nrows ((df dataframe-array))
384 "specializes on inheritance from matrix-like in lisp-matrix."
385 (array-dimension (dataset df) 0))
387 (defmethod ncols ((df dataframe-array))
388 "specializes on inheritance from matrix-like in lisp-matrix."
389 (array-dimension (dataset df) 1))
391 (defmethod dfref ((df dataframe-array)
392 (index1 number) (index2 number))
393 "Returns a scalar in array, in the same vein as aref, mref, vref, etc.
394 idx1/2 is row/col or case/var."
395 (aref (dataset df) index1 index2))
397 (defmethod set-dfref ((df dataframe-array) (index1 number) (index2 number) val)
398 "set value for df-ar."
399 ;; (check-type val (elt (var-type df) index2))
400 (setf (aref (dataset df) index1 index2) val))
402 (defparameter *default-dataframe-class* 'dataframe-array)
404 (defmethod dfselect ((df dataframe-array)
405 &key cases vars indices)
406 "Extract the OR of cases, vars, or have a list of indices to extract"
407 (declare (ignore indices))
408 (let ((newdf (make-instance *default-dataframe-class*
409 :storage (make-array (list (length cases) (length vars)))
410 :nrows (length cases)
411 :ncols (length vars)
413 :case-labels (select-list caselist (case-labels df))
414 :var-labels (select-list varlist (var-labels df))
415 :var-types (select-list varlist (vartypes df))
418 (dotimes (i (length cases))
419 (dotimes (j (length vars))
420 (setf (dfref newdf i j)
421 (dfref df
422 (position (elt cases i) (case-labels df))
423 (position (elt vars j) (var-labels df))))))))
425 ;;;;; DATAFRAME-MATRIXLIKE
427 (defclass dataframe-matrixlike (dataframe-like)
428 ((store :initform nil
429 :initarg :storage
430 :type matrix-like
431 :accessor dataset
432 :documentation "Data storage: typed as matrix-like
433 (numerical only)."))
434 (:documentation "example implementation of dataframe-like using storage
435 based on lisp-matrix structures."))
437 (defmethod nrows ((df dataframe-matrixlike))
438 "specializes on inheritance from matrix-like in lisp-matrix."
439 (matrix-dimension (dataset df) 0))
441 (defmethod ncols ((df dataframe-matrixlike))
442 "specializes on inheritance from matrix-like in lisp-matrix."
443 (matrix-dimension (dataset df) 1))
445 (defmethod dfref ((df dataframe-matrixlike)
446 (index1 number) (index2 number))
447 "Returns a scalar in array, in the same vein as aref, mref, vref, etc.
448 idx1/2 is row/col or case/var."
449 (mref (dataset df) index1 index2))
451 (defmethod set-dfref ((df dataframe-matrixlike)
452 (index1 number) (index2 number) val)
453 "Sets a value for df-ml."
454 ;; NEED TO CHECK TYPE!
455 ;; (check-type val (elt (vartype df) index2))
456 (setf (mref (dataset df) index1 index2) val))
460 ;;;;;; IMPLEMENTATION INDEPENDENT FUNCTIONS AND METHODS
461 ;;;;;; (use only dfref, nrows, ncols and similar dataframe-like
462 ;;;;;; components as core).
464 (defun dfref-var (df index return-type)
465 "Returns the data in a single variable as type.
466 type = sequence, vector, vector-like (if valid numeric type) or dataframe."
467 (ecase return-type
468 (('list)
469 (map 'list
470 #'(lambda (x) (dfref df index x))
471 (gen-seq (nth 2 (array-dimensions (dataset df))))))
472 (('vector) t)
473 (:vector-like t)
474 (:matrix-like t)
475 (:dataframe t)))
477 (defun dfref-case (df index return-type)
478 "Returns row as sequence."
479 (ecase return-type
480 (:list
481 (map 'list
482 #'(lambda (x) (dfref df x index))
483 (gen-seq (nth 1 (array-dimensions (dataset df))))))
484 (:vector t)
485 (:vector-like t)
486 (:matrix-like t)
487 (:dataframe t)))
489 ;; FIXME
490 (defun dfref-2indexlist (df indexlist1 indexlist2 &key (return-type :array))
491 "return an array, row X col dims. FIXME TESTME"
492 (case return-type
493 (:array
494 (let ((my-pre-array (list)))
495 (dolist (x indexlist1)
496 (dolist (y indexlist2)
497 (append my-pre-array (dfref df x y))))
498 (make-array (list (length indexlist1)
499 (length indexlist2))
500 :initial-contents my-pre-array)))
501 (:dataframe
502 (make-instance 'dataframe-array
503 :storage (make-array
504 (list (length indexlist1)
505 (length indexlist2))
506 :initial-contents (dataset df))
507 ;; ensure copy for this and following
508 :doc (doc-string df)
509 ;; the following 2 need to be subseted based on
510 ;; the values of indexlist1 and indexlist2
511 :case-labels (case-labels df)
512 :var-labels (var-labels df)))))
514 ;;; Do we establish methods for dataframe-like, which specialize to
515 ;;; particular instances of storage?
517 (defmethod print-object ((object dataframe-like) stream)
518 (print-unreadable-object (object stream :type t)
519 (format stream " ~d x ~d" (nrows object) (ncols object))
520 (terpri stream)
521 ;; (format stream "~T ~{~S ~T~}" (var-labels object))
522 (dotimes (j (ncols object)) ; print labels
523 (write-char #\tab stream)
524 (write-char #\tab stream)
525 (format stream "~T~A~T" (nth j (var-labels object))))
526 (dotimes (i (nrows object)) ; print obs row
527 (terpri stream)
528 (format stream "~A:~T" (nth i (case-labels object)))
529 (dotimes (j (ncols object))
530 (write-char #\tab stream) ; (write-char #\space stream)
531 ;; (write (dfref object i j) :stream stream)
532 (format stream "~7,3E" (dfref object i j)) ; if works, need to include a general output mechanism control
533 ))))
536 (defun print-structure-relational (ds)
537 "example of what we want the methods to look like. Should be sort
538 of like a graph of spreadsheets if the storage is a relational
539 structure."
540 (dolist (k (relations ds))
541 (let ((currentRelationSet (getRelation ds k)))
542 (print-as-row (var-labels currentRelationSet))
543 (let ((j -1))
544 (dolist (i (case-labels currentRelationSet))
545 (print-as-row
546 (append (list i)
547 (dfref-obsn (dataset currentRelationSet)
548 (incf j)))))))))
550 (defun testecase (s)
551 (ecase s
552 ((scalar) 1)
553 ((asd asdf) 2)))
555 (testecase 'scalar)
556 (testecase 'asd)
557 (testecase 'asdf)
558 (testecase 'as)