moving from older dfref to the generic xref approach. One less key stroke to use .
[CommonLispStat.git] / src / data / dataframe.lisp
blobb247bff43e6365c864f1504d5a7700aba41d37ea
1 ;;; -*- mode: lisp -*-
3 ;;; Time-stamp: <2009-08-18 08:07:01 tony>
4 ;;; Creation: <2008-03-12 17:18:42 blindglobe@gmail.com>
5 ;;; File: dataframe.lisp
6 ;;; Author: AJ Rossini <blindglobe@gmail.com>
7 ;;; Copyright: (c)2008, AJ Rossini. BSD, LLGPL, or GPLv2, depending
8 ;;; on how it arrives.
10 ;;; Purpose: Data packaging and access for Common Lisp Statistics.
11 ;;; This redoes dataframe structures in a CLOS based
12 ;;; framework. Currently contains the virtual class
13 ;;; DATAFRAME-LIKE as well as the actual classes
14 ;;; DATAFRAME-ARRAY and DATAFRAME-MATRIXLIKE
16 ;;; What is this talk of 'release'? Klingons do not make software
17 ;;; 'releases'. Our software 'escapes', leaving a bloody trail of
18 ;;; designers and quality assurance people in its wake.
20 (in-package :cls-dataframe)
22 ;;; No real basis for work, there is a bit of new-ness and R-ness to
23 ;;; this work. In particular, the notion of relation is key and
24 ;;; integral to the analysis. Tables are related and matched vectors,
25 ;;; for example. "column" vectors are related observations (by
26 ;;; measure/recording) while "row" vectors are related readings (by
27 ;;; case, independence). This does mean that we are placing
28 ;;; statistical semantics into the computational data object -- and
29 ;;; that it is a violation of use to consider rows which are not at
30 ;;; the least conditionally independent (though the conditioning
31 ;;; should be outside the data set, not internally specified).
33 ;;; So we want a verb-driven API for data collection construction. We
34 ;;; should encode independence or lack of, as possible.
36 ;;; Need to figure out statistically-typed vectors. We then map a
37 ;;; series of typed vectors over to tables where columns are equal
38 ;;; typed. In a sense, this is a relation (1-1) of equal-typed
39 ;;; arrays. For the most part, this ends up making the R data.frame
40 ;;; into a relational building block (considering 1-1 mappings using
41 ;;; row ID as a relation). Is this a worthwhile generalization or
42 ;;; communicable analogy?
44 ;;; verbs vs semantics for DF construction -- consider the possibility
45 ;;; of how adverbs and verbs relate, where to put which semantically
46 ;;; to allow for general approach.
48 ;;; Need to consider modification APIs
49 ;;; actions are:
50 ;;; - import
51 ;;; - get/set row names (case names)
52 ;;; - column names (variable names)
53 ;;; - dataset values
54 ;;; - annotation/metadata
55 ;;; - make sure that we do coherency checking in the exported
56 ;;;   functions.
57 ;;; - ...
58 ;;; - reshapeData/reformat/reshapr a reformed version of the dataset (no
59 ;;; additional input).
60 ;;; - either overwriting or not, i.e. with or without copy.
61 ;;; - check consistency of resulting data with metadata and related
62 ;;; data information.
65 ;;; Misc Functions (to move into a lisp data manipulation support package)
67 ;; the next two should be merged into a general replicator pattern.
(defun gen-seq (n &optional (start 1))
  "Return the list of integers START, START+1, ..., N (inclusive).
With the default START of 1 the result has length N.  Returns NIL when
N is less than START.  Used for indexing."
  ;; Iterative collection: the earlier recursive APPEND formulation was
  ;; quadratic in the length of the result and used one stack frame per
  ;; element.
  (loop for i from start to n
        collect i))
(defun repeat-seq (n item)
  "Return a fresh list holding ITEM repeated N times; NIL when N < 1.
Every element is the very same object (EQ), not a copy.
  (repeat-seq 3 \"d\")        ; => (\"d\" \"d\" \"d\")
  (repeat-seq 3 'd)         ; => (d d d)
  (repeat-seq 3 (list 1 2)) ; => ((1 2) (1 2) (1 2))"
  ;; MAKE-LIST does this in one pass; the earlier recursive APPEND was
  ;; quadratic in N.  The guard keeps the NIL result for N < 1.
  (when (>= n 1)
    (make-list n :initial-element item)))
(defun strsym->indexnum (df strsym)
  "Return the zero-based position of the DF column labelled STRSYM, or
NIL when no variable label matches.  STRSYM may be a symbol or a string.
Probably should be a method dispatching on DATAFRAME-LIKE type."
  ;; POSITION compares with EQL by default, which never matches two
  ;; distinct string objects with the same characters.  EQUAL handles
  ;; both symbol labels and (case-sensitive) string labels.
  (position strsym (varlabels df) :test #'equal))
(defun string->number (str)
  "Read one Lisp object from the string STR and return two values: the
object read, and T if that object is a number (NIL otherwise).  When
STR cannot be read at all (empty, truncated or malformed input) the
result is (VALUES NIL NIL) instead of a signalled error.
Examples:
  (string->number \"123\")  ; => 123, T
  (string->number \"1.23\") ; => 1.23, T
  (string->number \"\")     ; => NIL, NIL"
  ;; *READ-EVAL* is bound to NIL so that #. in STR cannot run code.
  (let ((*read-eval* nil))
    (handler-case
        (let ((num (read-from-string str)))
          (values num (numberp num)))
      ;; END-OF-FILE: nothing (or only part of an object) to read.
      ;; READER-ERROR: malformed syntax, or #. rejected by *READ-EVAL*.
      ((or end-of-file reader-error) ()
        (values nil nil)))))
;;; REPL experiments on locating an item in a list.
(equal 'testme 'testme)                         ; => T
(defparameter *test-pos* 'testme)
(position *test-pos* (list 'a 'b 'testme 'c))   ; => 2
;; POSITION treats its first argument as the item to look for, so
;; handing it a predicate searches for the function object itself and
;; yields NIL.  POSITION-IF is the predicate-driven variant.
(position-if #'(lambda (x) (equal x "testme")) (list "a" "b" "testme" "c")) ; => 2
(position-if #'(lambda (x) (equal x 1)) (list 2 1 3 4))                     ; => 1
106 ;;; abstract dataframe class
(defclass dataframe-like (matrix-like)
  ((case-labels
    :accessor case-labels
    :initarg :case-labels
    :initform nil
    :type list
    :documentation "Labels describing the cases (documentation
    metadata); possibly used for merging.")
   (var-labels
    :accessor var-labels
    :initarg :var-labels
    :initform nil
    :type list
    :documentation "Variable names.")
   (var-types
    :accessor var-types
    :initarg :var-types
    :initform nil
    :type list
    :documentation "Variable types, used to ensure that the data fit.")
   (doc-string
    :accessor doc-string
    :initarg :doc
    :initform nil
    :documentation "Additional information about the dataframe-like
    instance; potentially uncomputable, possibly metadata."))
  (:documentation "Abstract class for the standard statistical
  analysis dataset of independent data: a cases-by-variables table.
  Rows are considered to be independent, matching observations;
  columns are considered to be type-consistent, each matching a
  variable with a distribution.

  Inherits from the lisp-matrix base class MATRIX-LIKE, which is
  basically a rectangular table without storage.  This class emulates
  that and adds row/column labels and within-column typing; concrete
  subclasses add the storage.

  DATAFRAME-LIKE needs to be embedded within other structures which
  allow for generalized relations.  The goal is that relations imply
  and drive the potential for statistical relationships such as
  correlation and similar concepts.

  STORE, the storage component, is deliberately absent here: it is the
  primary differentiator between subclasses, specifying the structure
  used for holding the actual data, and access methods depend on it.
  See DATAFRAME-ARRAY and DATAFRAME-MATRIXLIKE for examples.  The
  slots of this class are all metadata."))
157 ;;; Generics specialized above matrix-like, particularly for
158 ;;; dataframe-like objects. Need implementation of methods which
159 ;;; depend on storage form.
(defgeneric dataframe-dimensions (df)
  (:documentation "Return the dimensions of the dataframe DF.
Storage-specific subclasses must supply their own method; the default
method on the virtual class DATAFRAME-LIKE only signals an error.")
  (:method ((df dataframe-like))
    ;; CLASS-OF, not FIND-CLASS: FIND-CLASS expects a class *name*
    ;; (a symbol), so calling it on the instance DF would itself fail
    ;; before the intended message could be reported.
    (error "Dispatch on virtual class, Method needed for
DATAFRAME-DIMENSIONS with class ~A." (class-of df))))
(defgeneric dataframe-dimension (df index)
  (:documentation "Return element INDEX of the sequence produced by
DATAFRAME-DIMENSIONS for DF."))

(defmethod dataframe-dimension ((df dataframe-like) index)
  "Default: look INDEX up in the result of DATAFRAME-DIMENSIONS."
  (let ((dims (dataframe-dimensions df)))
    (elt dims index)))
(defgeneric dfselect (df &optional cases vars indices)
  (:documentation "Access to sub-dataframes.  Always returns a dataframe.
CASES and VARS name the rows and columns to extract; INDICES is
reserved and not yet supported.")
  (:method ((df dataframe-like) &optional cases vars indices)
    (declare (ignorable cases vars))
    (when indices
      (error "Indices not used yet"))
    ;; CLASS-OF, not FIND-CLASS: FIND-CLASS expects a class name (a
    ;; symbol) and would fail when handed the instance DF.
    (error "Dispatch on virtual class, Method needed for DFSELECT with
class ~A." (class-of df))))
180 ;;; Specializing on superclasses...
182 ;;; Access and Extraction: implementations needed for any storage
183 ;;; type. But here, just to point out that we've got a specializing
184 ;;; virtual subclass (DATAFRAME-LIKE specializing MATRIX-LIKE).
(defmethod nrows ((df dataframe-like))
  "Placeholder on the virtual class DATAFRAME-LIKE: always signals an
error, because the row count depends on the storage of a concrete
subclass."
  (error 'simple-error
         :format-control "Need implementation; can't dispatch on virtual class DATAFRAME-LIKE."
         :format-arguments '()))
(defmethod ncols ((df dataframe-like))
  "Placeholder on the virtual class DATAFRAME-LIKE: always signals an
error, because the column count depends on the storage of a concrete
subclass."
  (error 'simple-error
         :format-control "Need implementation; can't dispatch on virtual class DATAFRAME-LIKE."
         :format-arguments '()))
194 ;; Testing consistency/coherency.
(defgeneric consistent-dataframe-p (df)
  (:documentation "Check DF for internal consistency.  Returns T when
the number of variable labels equals (NCOLS DF), the number of case
labels equals (NROWS DF), and -- if variable types are recorded --
there is one type per column and every value satisfies the type of its
column.  Returns NIL otherwise.")
  (:method ((df dataframe-like))
    (and
     ;; ensure dimensionality
     (= (length (var-labels df)) (ncols df))
     (= (length (case-labels df)) (nrows df))
     ;; When dims are sane, check each value against its column type.
     ;; The earlier version evaluated TYPEP but threw the answer away,
     ;; so a type mismatch could never be reported.
     (let ((types (var-types df)))
       (or (null types)                 ; no types recorded: nothing to check
           (and (= (length types) (ncols df))
                ;; xref bombs if DF is not a df-like subclass, so we
                ;; don't worry about specialization here.
                (loop for i below (nrows df)
                      always (loop for j below (ncols df)
                                   for type in types
                                   always (typep (xref df i j) type)))))))))
214 ;;; FUNCTIONS WHICH DISPATCH ON INTERNAL METHODS OR ARGS
216 ;;; Q: change the following to generic functions and dispatch on
217 ;;; array, matrix, and dataframe? Others?
(defun make-labels (initstr num)
  "Generate a list of NUM strings usable as labels: INITSTR followed by
the numbers 1, 2, ..., NUM.  Returns NIL when NUM is less than 1.
  (make-labels \"a\" 3) ; => (\"a1\" \"a2\" \"a3\")
Signals a correctable TYPE-ERROR if INITSTR is not a string."
  (check-type initstr string)
  ;; One pass, one string per label; no intermediate lists of prefixes
  ;; and printed numbers.
  (loop for i from 1 to num
        collect (format nil "~A~A" initstr i)))
(defun ncase-store (store)
  "Number of cases (rows) held by the dataframe storage STORE, which
may be an array, a MATRIX-LIKE or a list.  For a list this is simply
its length; no check is made that it is a valid list-of-lists
dataframe structure.  Any other kind of STORE signals a TYPE-ERROR."
  (typecase store
    (array (array-dimension store 0))
    (matrix-like (nrows store))
    (list (length store))
    (otherwise
     (error 'type-error
            :datum store
            :expected-type '(or array matrix-like list)))))
(defun nvars-store (store)
  "Return the number of variables (columns) in the dataframe storage
STORE, which may be an array, a list of rows or a MATRIX-LIKE.  For a
list the count is the length of its first row (0 for an empty list);
no check is made that the list is a valid list-of-lists dataframe
structure.  Any other kind of STORE signals a TYPE-ERROR."
  ;; The three clauses are mutually exclusive, so their order does not
  ;; affect the result.
  (etypecase store
    (array (array-dimension store 1))
    ;; An empty list has no first row to measure: report zero columns
    ;; instead of failing on (ELT NIL 0).
    (list (if (null store)
              0
              (length (first store))))
    (matrix-like (ncols store))))
(defun make-dataframe (newdata
                       &key (vartypes nil)
                         (caselabels nil) (varlabels nil)
                         (doc "no docs"))
  "Helper function to use instead of MAKE-INSTANCE to assure
construction of a proper dataframe.

NEWDATA is the storage: a list (of rows), an array or a MATRIX-LIKE;
it selects the class of the result (DATAFRAME-LISTOFLIST,
DATAFRAME-ARRAY or DATAFRAME-MATRIXLIKE respectively).
CASELABELS and VARLABELS are optional sequences of row and column
labels; when supplied their lengths must match the data, when omitted
labels \"C1\", \"C2\", ... and \"V1\", \"V2\", ... are generated.
VARTYPES is stored as the variable types, and DOC (a string) as the
documentation of the new instance."
  (check-type newdata (or matrix-like array list))
  (check-type caselabels sequence)
  (check-type varlabels sequence)
  (check-type doc string)
  (let ((ncases (ncase-store newdata))
        (nvars (nvars-store newdata)))
    (when caselabels (assert (= ncases (length caselabels))))
    (when varlabels (assert (= nvars (length varlabels))))
    (let ((newcaselabels (or caselabels (make-labels "C" ncases)))
          (newvarlabels (or varlabels (make-labels "V" nvars))))
      ;; Only the class depends on the kind of storage; every initarg
      ;; is shared, so the class is chosen first and the instance is
      ;; built in one place.
      (make-instance (etypecase newdata
                       (list 'dataframe-listoflist)
                       (array 'dataframe-array)
                       (matrix-like 'dataframe-matrixlike))
                     :storage newdata
                     :nrows (length newcaselabels)
                     :ncols (length newvarlabels)
                     :case-labels newcaselabels
                     :var-labels newvarlabels
                     :var-types vartypes
                     ;; DOC used to be validated but never stored.
                     :doc doc))))
#|
;;; REPL examples for MAKE-DATAFRAME.  They are kept inside a block
;;; comment so that loading the file does not evaluate them: two of
;;; them signal errors.  Elements of a #2A literal are not evaluated,
;;; so symbols are written without a quote.
(make-dataframe #2A((1.2d0 1.3d0) (2.0d0 4.0d0)))
(make-dataframe #2A((a 1) (b 2)))
(xref (make-dataframe #2A((a 1) (b 2))) 0 1)
(xref (make-dataframe #2A((a 1) (b 2))) 1 0)
(make-dataframe 4) ; ERROR, should we allow?
(make-dataframe #2A((4)))
(make-dataframe (rand 10 5)) ;; ERROR, but should work!
|#
(defun row-order-as-list (ary)
  "Return the elements of the two-dimensional array ARY as a flat list
in row-major order: all of row 0, then all of row 1, and so on."
  ;; APPEND is non-destructive, so the earlier loop discarded every
  ;; element and returned NIL; it also ran the row index up to the
  ;; number of columns.  Collect explicitly instead.
  (loop for i below (array-dimension ary 0)
        nconc (loop for j below (array-dimension ary 1)
                    collect (aref ary i j))))
(defun col-order-as-list (ary)
  "Return the elements of the two-dimensional array ARY as a flat list
in column-major order: all of column 0, then all of column 1, and so on."
  ;; APPEND is non-destructive, so the earlier loop discarded every
  ;; element and returned NIL; it also walked the array row by row.
  (loop for j below (array-dimension ary 1)
        nconc (loop for i below (array-dimension ary 0)
                    collect (aref ary i j))))
(defun transpose-array (ary)
  "Map the NxM array ARY to a fresh MxN array whose element (j, i) is
element (i, j) of ARY.  The result has the same element type as ARY."
  ;; The elements are copied directly.  :INITIAL-CONTENTS for a 2-D
  ;; array needs a nested (row-of-rows) structure, so a flat element
  ;; list cannot be used to fill the result.
  (destructuring-bind (nrows ncols) (array-dimensions ary)
    (let ((result (make-array (list ncols nrows)
                              :element-type (array-element-type ary))))
      (dotimes (i nrows result)
        (dotimes (j ncols)
          (setf (aref result j i) (aref ary i j)))))))
325 ;;; THE FOLLOWING 2 dual-sets done to provide error checking
326 ;;; possibilities on top of the generic function structure. Not
327 ;;; intended as make-work!
(defun varlabels (df)
  "Reader for the variable (column) labels of the DATAFRAME-LIKE DF.
A thin wrapper around the VAR-LABELS accessor, kept as a place to hang
error checking (none is done yet)."
  (let ((labels (var-labels df)))
    labels))
(defun set-varlabels (df vl)
  "Replace the variable labels of the DATAFRAME-LIKE DF with VL and
return VL.  VL must have as many elements as the current label
sequence; otherwise an error is signalled and DF is left untouched."
  (unless (= (length (var-labels df))
             (length vl))
    (error "wrong size."))
  (setf (var-labels df) vl))
;; Make (setf (varlabels df) vl) expand into (set-varlabels df vl), so
;; that assignments go through the length check in SET-VARLABELS.
(defsetf varlabels (df) (vl)
  `(set-varlabels ,df ,vl))
342 ;;; Case-name handling for Tables. Needs error checking.
(defun caselabels (df)
  "Reader for the case (row) labels of the DATAFRAME-LIKE DF.  A thin
wrapper around the CASE-LABELS accessor, kept as a place to hang error
checking (none is done yet)."
  (let ((labels (case-labels df)))
    labels))
(defun set-caselabels (df cl)
  "Replace the case labels of the DATAFRAME-LIKE DF with CL and return
CL.  CL must have as many elements as the current label sequence;
otherwise an error is signalled and DF is left untouched."
  (unless (= (length (case-labels df))
             (length cl))
    (error "wrong size."))
  (setf (case-labels df) cl))
;; Make (setf (caselabels df) cl) expand into (set-caselabels df cl), so
;; that assignments go through the length check in SET-CASELABELS.
(defsetf caselabels (df) (cl)
  `(set-caselabels ,df ,cl))
356 ;;;;;;;;;;;; IMPLEMENTATIONS, with appropriate methods.
357 ;; See also:
358 ;; (documentation 'dataframe-like 'type)
360 ;;;;; DATAFRAME-ARRAY
(defclass dataframe-array (dataframe-like)
  ((store
    :accessor dataset
    :initarg :storage
    :initform nil
    :type (array * *)
    :documentation "The data themselves, held in a Lisp array."))
  (:documentation "Example implementation of DATAFRAME-LIKE whose
  storage is a Lisp array.  An obvious alternative is a
  dataframe-matrix-like built on the lisp-matrix classes."))
(defmethod nrows ((df dataframe-array))
  "Number of rows (cases): the extent of axis 0 of the storage array."
  (let ((storage (dataset df)))
    (array-dimension storage 0)))
(defmethod ncols ((df dataframe-array))
  "Number of columns (variables): the extent of axis 1 of the storage
array."
  (let ((storage (dataset df)))
    (array-dimension storage 1)))
(defmethod xref ((df dataframe-array)
                 (index1 number) (index2 number))
  "Fetch a single value from the array storage of DF, in the same vein
as aref, mref, vref, etc.  INDEX1 is the row (case) and INDEX2 the
column (variable)."
  (let ((storage (dataset df)))
    (aref storage index1 index2)))
(defmethod (setf xref) (val (df dataframe-array) (index1 number) (index2 number))
  "Store VAL at row INDEX1, column INDEX2 of the array storage of DF
and return VAL."
  ;; A setf function receives the new value as its FIRST argument:
  ;; (setf (xref df i j) v) calls (funcall #'(setf xref) v df i j).
  ;; With the value placed last, the dataframe specializer was matched
  ;; against the new value and the method could never be selected.
  ;; NOTE(review): assumes the (SETF XREF) generic function accepts
  ;; this lambda list -- confirm against its definition.
  ;; (check-type val (elt (var-types df) index2))
  (setf (aref (dataset df) index1 index2) val))
(defparameter *default-dataframe-class* 'dataframe-array
  "Name of the class instantiated by the DFSELECT method on
DATAFRAME-ARRAY when it builds its result.")
(defmethod dfselect ((df dataframe-array)
                     &optional cases vars indices)
  "Extract a sub-dataframe of DF and return it as a new instance of
*DEFAULT-DATAFRAME-CLASS* with freshly allocated array storage.

CASES is a sequence of case labels and VARS a sequence of variable
labels to keep, in the order given; an omitted (NIL) argument selects
every case or every variable.  Labels are compared with EQUAL.
INDICES is not supported yet and signals an error when supplied.
An error is also signalled for a label that DF does not have."
  (when indices
    (error "Indices not used yet"))
  (flet ((index-of (label labels kind)
           ;; Position of LABEL in LABELS, or a descriptive error.
           (or (position label labels :test #'equal)
               (error "DFSELECT: no ~A labelled ~S." kind label))))
    (let* ((case-list (coerce (or cases (case-labels df)) 'list))
           (var-list (coerce (or vars (var-labels df)) 'list))
           ;; Resolve every label once, before the copy loop.
           (rows (mapcar #'(lambda (label)
                             (index-of label (case-labels df) "case"))
                         case-list))
           (cols (mapcar #'(lambda (label)
                             (index-of label (var-labels df) "variable"))
                         var-list))
           (types (var-types df))
           (storage (make-array (list (length rows) (length cols)))))
      ;; Fill the raw array before wrapping it, so the copy does not
      ;; depend on a (SETF XREF) method for the result class.
      (loop for row in rows
            for i from 0
            do (loop for col in cols
                     for j from 0
                     do (setf (aref storage i j) (xref df row col))))
      (make-instance *default-dataframe-class*
                     :storage storage
                     :nrows (length rows)
                     :ncols (length cols)
                     :case-labels case-list
                     :var-labels var-list
                     ;; Keep the types of the selected columns only.
                     :var-types (when types
                                  (mapcar #'(lambda (col) (nth col types))
                                          cols))))))
415 ;;; DATAFRAME-MATRIXLIKE
416 ;;;
417 ;;; example/implementation of using lisp-matrix datastructures for
418 ;;; dataframe storage.
(defclass dataframe-matrixlike (dataframe-like)
  ((store
    :accessor dataset
    :initarg :storage
    :initform nil
    :type matrix-like
    :documentation "The data themselves, held in a MATRIX-LIKE
    (numerical only)."))
  (:documentation "Example implementation of DATAFRAME-LIKE whose
  storage is built on lisp-matrix structures."))
(defmethod nrows ((df dataframe-matrixlike))
  "Number of rows (cases): dimension 0 of the MATRIX-LIKE storage, as
reported by MATRIX-DIMENSION."
  (let ((storage (dataset df)))
    (matrix-dimension storage 0)))
(defmethod ncols ((df dataframe-matrixlike))
  "Number of columns (variables): dimension 1 of the MATRIX-LIKE
storage, as reported by MATRIX-DIMENSION."
  (let ((storage (dataset df)))
    (matrix-dimension storage 1)))
(defmethod xref ((df dataframe-matrixlike)
                 (index1 number) (index2 number))
  "Fetch a single value from the MATRIX-LIKE storage of DF via MREF, in
the same vein as aref, mref, vref, etc.  INDEX1 is the row (case) and
INDEX2 the column (variable)."
  (let ((storage (dataset df)))
    (mref storage index1 index2)))
(defmethod (setf xref) (val (df dataframe-matrixlike)
                        (index1 number) (index2 number))
  "Store VAL at row INDEX1, column INDEX2 of the MATRIX-LIKE storage of
DF (via MREF) and return the result of that assignment."
  ;; A setf function receives the new value as its FIRST argument:
  ;; (setf (xref df i j) v) calls (funcall #'(setf xref) v df i j).
  ;; With the value placed last, the dataframe specializer was matched
  ;; against the new value and the method could never be selected.
  ;; NOTE(review): assumes the (SETF XREF) generic function accepts
  ;; this lambda list -- confirm against its definition.
  ;; NEED TO CHECK TYPE!
  ;; (check-type val (elt (var-types df) index2))
  (setf (mref (dataset df) index1 index2) val))
454 ;;; DATAFRAME-LISTOFLIST
455 ;;;
456 ;;; example/implementation of using nested lists (a list of rows)
457 ;;; for dataframe storage.
(defclass dataframe-listoflist (dataframe-like)
  ((store
    :accessor dataset
    :initarg :storage
    :initform nil
    :type list
    :documentation "The data themselves, held in a list.  The
    accessor methods of this class treat it as a list of rows, each
    row being a sequence of values."))
  (:documentation "Example implementation of DATAFRAME-LIKE whose
  storage is a list of lists."))
(defmethod nrows ((df dataframe-listoflist))
  "Number of rows (cases): the length of the outer storage list."
  (let ((rows (dataset df)))
    (length rows)))
(defmethod ncols ((df dataframe-listoflist))
  "Number of columns (variables): the length of the first row of the
storage list, or 0 when the storage holds no rows at all."
  (let ((rows (dataset df)))
    ;; An empty dataset has no first row to measure; report zero
    ;; columns instead of failing on (ELT NIL 0).
    (if (null rows)
        0
        (length (first rows)))))
(defmethod xref ((df dataframe-listoflist)
                 (index1 number) (index2 number))
  "Fetch a single value from the list-of-lists storage of DF, in the
same vein as aref, mref, vref, etc.  INDEX1 picks the row (case) out
of the outer list and INDEX2 the column (variable) within that row."
  (let ((row (elt (dataset df) index1)))
    (elt row index2)))
(defmethod (setf xref) (val (df dataframe-listoflist)
                        (index1 number) (index2 number))
  "Store VAL at row INDEX1, column INDEX2 of the list-of-lists storage
of DF and return VAL."
  ;; A setf function receives the new value as its FIRST argument:
  ;; (setf (xref df i j) v) calls (funcall #'(setf xref) v df i j).
  ;; With the value placed last, the dataframe specializer was matched
  ;; against the new value and the method could never be selected.
  ;; NOTE(review): assumes the (SETF XREF) generic function accepts
  ;; this lambda list -- confirm against its definition.
  ;; NEED TO CHECK TYPE!
  ;; (check-type val (elt (var-types df) index2))
  ;; Row first, then column -- the same order the XREF reader for this
  ;; class uses; the two indices used to be swapped here.
  (setf (elt (elt (dataset df) index1) index2) val))
492 ;;;;;; IMPLEMENTATION INDEPENDENT FUNCTIONS AND METHODS
493 ;;;;;; (use only xref, nrows, ncols and similar dataframe-like
494 ;;;;;; components as core).
(defun xref-var (df index return-type)
  "Return the data of a single variable -- column INDEX (zero-based) of
DF -- in the form named by RETURN-TYPE:
  :LIST   (or the symbol LIST)   -- a list of the column's values;
  :VECTOR (or the symbol VECTOR) -- a vector of the column's values;
  :VECTOR-LIKE, :MATRIX-LIKE, :DATAFRAME -- not implemented yet, these
    just return T.
Any other RETURN-TYPE signals an error.  Only NROWS and XREF are used,
so this works for every storage type."
  (flet ((column-values ()
           ;; A variable is a column: the row index varies while INDEX
           ;; stays fixed, and XREF indices are zero-based.
           (loop for row below (nrows df)
                 collect (xref df row index))))
    ;; The keys used to be written as ('LIST) and ('VECTOR); inside a
    ;; case clause that is the key list (QUOTE LIST), not a quoted
    ;; symbol.  Keywords are accepted too, matching XREF-CASE.
    (ecase return-type
      ((:list list) (column-values))
      ((:vector vector) (coerce (column-values) 'vector))
      (:vector-like t)
      (:matrix-like t)
      (:dataframe t))))
(defun xref-case (df index return-type)
  "Return the data of a single case -- row INDEX (zero-based) of DF --
in the form named by RETURN-TYPE:
  :LIST   (or the symbol LIST)   -- a list of the row's values;
  :VECTOR (or the symbol VECTOR) -- a vector of the row's values;
  :VECTOR-LIKE, :MATRIX-LIKE, :DATAFRAME -- not implemented yet, these
    just return T.
Any other RETURN-TYPE signals an error.  Only NCOLS and XREF are used,
so this works for every storage type."
  (flet ((row-values ()
           ;; A case is a row: INDEX stays fixed while the column index
           ;; varies, and XREF indices are zero-based.
           (loop for col below (ncols df)
                 collect (xref df index col))))
    (ecase return-type
      ((:list list) (row-values))
      ((:vector vector) (coerce (row-values) 'vector))
      (:vector-like t)
      (:matrix-like t)
      (:dataframe t))))
521 ;; FIXME
(defun xref-2indexlist (df indexlist1 indexlist2 &key (return-type :array))
  "Extract the rows listed in INDEXLIST1 and the columns listed in
INDEXLIST2 (lists of zero-based indices) from DF.

With RETURN-TYPE :ARRAY (the default) the result is a fresh
rows-by-columns array.  With :DATAFRAME it is a new DATAFRAME-ARRAY
wrapping such an array, carrying the doc string of DF and the case
labels, variable labels and variable types of the selected rows and
columns.  Any other RETURN-TYPE yields NIL.  FIXME TESTME"
  (flet ((extract ()
           ;; Fill the result cell by cell.  :INITIAL-CONTENTS needs a
           ;; nested row structure, so a flat element list cannot be
           ;; used to build the 2-D array.
           (let ((result (make-array (list (length indexlist1)
                                           (length indexlist2)))))
             (loop for x in indexlist1
                   for i from 0
                   do (loop for y in indexlist2
                            for j from 0
                            do (setf (aref result i j) (xref df x y))))
             result))
         (subset (items indices)
           ;; Elements of the list ITEMS at INDICES; NIL when ITEMS is
           ;; empty.
           (when items
             (mapcar #'(lambda (k) (nth k items)) indices))))
    (case return-type
      (:array (extract))
      (:dataframe
       (make-instance 'dataframe-array
                      ;; EXTRACT allocates a new array, so the result
                      ;; does not share storage with DF.
                      :storage (extract)
                      :doc (doc-string df)
                      :case-labels (subset (case-labels df) indexlist1)
                      :var-labels (subset (var-labels df) indexlist2)
                      :var-types (subset (var-types df) indexlist2))))))
546 ;;; Do we establish methods for dataframe-like, which specialize to
547 ;;; particular instances of storage?
(defmethod print-object ((object dataframe-like) stream)
  "Print OBJECT unreadably as a table: its type and dimensions, then a
line of variable labels, then one line per case holding the case label
and the values of that case."
  (print-unreadable-object (object stream :type t)
    (format stream " ~d x ~d" (nrows object) (ncols object))
    (terpri stream)
    ;; (format stream "~T ~{~S ~T~}" (var-labels object))
    ;; Header line: the variable labels.
    (loop for col from 0 below (ncols object)
          do (write-char #\tab stream)
             (write-char #\tab stream)
             (format stream "~T~A~T" (nth col (var-labels object))))
    ;; One line per observation: case label, then the values.
    (loop for row from 0 below (nrows object)
          do (terpri stream)
             (format stream "~A:~T" (nth row (case-labels object)))
             (loop for col from 0 below (ncols object)
                   do (write-char #\tab stream)
                      ;; If this works, a general output mechanism
                      ;; control needs to be included.
                      (format stream "~7,3E" (xref object row col))))))
(defun print-structure-relational (ds)
  "Example of what we want the methods to look like.  Should be sort
of like a graph of spreadsheets if the storage is a relational
structure.  For every relation of DS this prints the variable labels
as one row, then one row per case made of the case label followed by
that case's observation."
  (dolist (k (relations ds))
    (let ((relation-set (getRelation ds k)))
      (print-as-row (var-labels relation-set))
      ;; OBS-INDEX counts the cases from 0 alongside their labels.
      (loop for label in (case-labels relation-set)
            for obs-index from 0
            do (print-as-row
                (cons label
                      (xref-obsn (dataset relation-set) obs-index)))))))
(defun testecase (s)
  "Small ECASE experiment: map the symbol SCALAR to 1 and either of the
symbols ASD or ASDF to 2; any other argument signals an error."
  (ecase s
    (scalar 1)
    ((asd asdf) 2)))
#|
;;; REPL examples for TESTECASE.  They are kept inside a block comment
;;; so that loading the file does not evaluate them: the last one
;;; signals an error because AS matches no clause.
(testecase 'scalar) ; => 1
(testecase 'asd)    ; => 2
(testecase 'asdf)   ; => 2
(testecase 'as)     ; ERROR
|#
594 ;;; Vector-like generalizations: we consider observation-like and
595 ;;; variable-like to be abstract classes which provide row and column
596 ;;; access to dataframe structures. These will be specialized, in
597 ;;; that rows correspond to an observation (or case?) which are
598 ;;; multitype, while columns correspond to a variable, which must be
599 ;;; singularly typed.
(defclass observation-like (dataframe-like)
  ;; No slots of its own.  The (empty) slot list must still be written
  ;; out: without it DEFCLASS would take the :DOCUMENTATION option for
  ;; the slot list.
  ()
  (:documentation "dataframe-like with only 1 row, is an observation-like."))
(defclass variable-like (dataframe-like)
  ;; No slots of its own.  The (empty) slot list must still be written
  ;; out: without it DEFCLASS would take the :DOCUMENTATION option for
  ;; the slot list.
  ()
  (:documentation "dataframe-like with only 1 column is a variable-like."))
609 ;;; Need to implement views, i.e. dataframe-view-like,
610 ;;; observation-view-like, variable-view-like.
612 ;;; Need to consider read-only variants, leveraging the xref
613 ;;; strategy.