clean up dataframe/xarray mappings. More to do.
[CommonLispStat.git] / src / data / dataframe.lisp
blob14e8b76844c17fc2a9344ac1a29de6d44070fc2e
1 ;;; -*- mode: lisp -*-
3 ;;; Time-stamp: <2009-08-27 08:16:33 tony>
4 ;;; Creation: <2008-03-12 17:18:42 blindglobe@gmail.com>
5 ;;; File: dataframe.lisp
6 ;;; Author: AJ Rossini <blindglobe@gmail.com>
7 ;;; Copyright: (c)2008, AJ Rossini. BSD, LLGPL, or GPLv2, depending
8 ;;; on how it arrives.
10 ;;; Purpose: Data packaging and access for Common Lisp Statistics.
11 ;;; This redoes dataframe structures in a CLOS based
12 ;;; framework. Currently contains the virtual class
13 ;;; DATAFRAME-LIKE as well as the actual classes
14 ;;; DATAFRAME-ARRAY and DATAFRAME-MATRIXLIKE
16 ;;; What is this talk of 'release'? Klingons do not make software
17 ;;; 'releases'. Our software 'escapes', leaving a bloody trail of
18 ;;; designers and quality assurance people in its wake.
20 (in-package :cls-dataframe)
22 ;;; No real basis for work, there is a bit of new-ness and R-ness to
23 ;;; this work. In particular, the notion of relation is key and
24 ;;; integral to the analysis. Tables are related and matched vectors,
25 ;;; for example. "column" vectors are related observations (by
26 ;;; measure/recording) while "row" vectors are related readings (by
27 ;;; case, independence). This does mean that we are placing
28 ;;; statistical semantics into the computational data object -- and
29 ;;; that it is a violation of use to consider rows which are not at
30 ;;; the least conditionally independent (though the conditioning
31 ;;; should be outside the data set, not internally specified).
33 ;;; So we want a verb-driven API for data collection construction. We
34 ;;; should encode independence or lack of, as a computable status.
36 ;;; Need to figure out statistically-typed vectors. We then map a
37 ;;; series of typed vectors over to tables where columns are equal
38 ;;; typed. In a sense, this is a relation (1-1) of equal-typed
39 ;;; arrays. For the most part, this ends up making the R data.frame
40 ;;; into a relational building block (considering 1-1 mappings using
41 ;;; row ID as a relation). Is this a worthwhile generalization or
42 ;;; communicable analogy?
44 ;;; verbs vs semantics for DF construction -- consider the possibility
45 ;;; of how adverbs and verbs relate, where to put which semantically
46 ;;; to allow for general approach.
48 ;;; Need to consider modification APIs
49 ;;; actions are:
50 ;;; - import
51 ;;; - get/set row names (case names)
52 ;;; - column names (variable names)
53 ;;; - dataset values
54 ;;; - annotation/metadata
55 ;;; - make sure that we do coherency checking in the exported
56 ;;; - functions.
57 ;;; - ...
58 ;;; - reshapeData/reformat/reshapr a reformed version of the dataset (no
59 ;;; additional input).
60 ;;; - either overwriting or not, i.e. with or without copy.
61 ;;; - check consistency of resulting data with metadata and related
62 ;;; data information.
64 ;;; Misc Functions (to move into a lisp data manipulation support package)
66 ;; the next two should be merged into a general replicator pattern.
(defun gen-seq (n &optional (start 1))
  "Return the list of consecutive integers START, START+1, ..., N.
The result has N - START + 1 elements (exactly N elements with the
default START of 1) and is NIL when N is smaller than START.  Used for
indexing and for numbering generated labels.
  (gen-seq 4)   ; => (1 2 3 4)
  (gen-seq 4 3) ; => (3 4)
  (gen-seq 0)   ; => NIL"
  ;; Collect iteratively: the former recursive APPEND formulation copied
  ;; the partial result at every step (quadratic in N) and used one
  ;; stack frame per element.
  (loop for i from start to n collect i))
(defun repeat-seq (n item)
  "Return a fresh list containing ITEM repeated N times; NIL when N < 1.
ITEM is not copied: every element of the result is the same object.
  (repeat-seq 3 \"d\")        ; => (\"d\" \"d\" \"d\")
  (repeat-seq 3 'd)          ; => (D D D)
  (repeat-seq 3 (list 1 2))  ; => ((1 2) (1 2) (1 2))"
  ;; MAKE-LIST is the standard replicator; the guard keeps the old
  ;; behaviour of returning NIL for zero or negative counts.
  (when (>= n 1)
    (make-list n :initial-element item)))
(defun strsym->indexnum (df strsym)
  "Return the zero-based position of the column of DF labelled by STRSYM
\(a string or a symbol), or NIL when no variable label matches.
Probably should be a method dispatching on DATAFRAME-LIKE type."
  ;; EQUAL rather than the default EQL: two distinct string objects with
  ;; the same characters are not EQL, so string labels would never be
  ;; found.  Symbols compare exactly as before.
  (position strsym (varlabels df) :test #'equal))
(defun string->number (str)
  "Convert the string STR, holding the printed form of a number, to that
number.  Two values are returned: the object that was read and a flag
which is true only when that object is a number.  Input that cannot be
read at all (empty string, unbalanced parenthesis, ...) yields the
values NIL and NIL instead of signalling.  Examples:
  (string->number \"123\")  ; => 123 t
  (string->number \"1.23\") ; => 1.23 t
  (string->number \"\")     ; => nil nil"
  ;; *READ-EVAL* is bound to NIL so that #. forms in STR are refused by
  ;; the reader rather than evaluated.
  (let ((*read-eval* nil))
    (handler-case
        ;; EOF-ERROR-P NIL: an empty or all-whitespace string reads as NIL.
        (let ((num (read-from-string str nil nil)))
          (values num (numberp num)))
      ;; Malformed text is a failed conversion, reported via the flag.
      ((or reader-error end-of-file) ()
        (values nil nil)))))
#|
;; REPL scratch showing how to locate symbols, strings and numbers in a
;; list.  Kept inside a block comment so that loading the file neither
;; evaluates it nor defines the throw-away *TEST-POS* variable.
;; The predicate forms use POSITION-IF: handing a function object to
;; POSITION searches for that object itself and therefore returns NIL.
(equal 'testme 'testme)                                                     ; => T
(defparameter *test-pos* 'testme)
(position *test-pos* (list 'a 'b 'testme 'c))                               ; => 2
(position-if #'(lambda (x) (equal x "testme")) (list "a" "b" "testme" "c")) ; => 2
(position-if #'(lambda (x) (equal x 1)) (list 2 1 3 4))                     ; => 1
|#
104 ;;; abstract dataframe class
(defclass dataframe-like (matrix-like)
  ((case-labels
    :documentation "Labels describing the cases (documentation
    metadata); possibly used for merging."
    :accessor case-labels
    :initarg :case-labels
    :initform nil
    :type list)
   (var-labels
    :documentation "Variable names."
    :accessor var-labels
    :initarg :var-labels
    :initform nil
    :type list)
   (var-types
    :documentation "Variable types, used to ensure that values fit.
    Must be a list of symbols naming types valid for check-type."
    :accessor var-types
    :initarg :var-types
    :initform nil
    :type list)
   (doc-string
    :documentation "Additional information about the dataframe-like
    instance; potentially uncomputable, possibly metadata."
    :accessor doc-string
    :initarg :doc
    :initform nil))
  (:documentation "Abstract class for the standard statistical analysis
   dataset for independent data.  Rows are considered to be
   independent, matching observations.  Columns are considered to be
   type-consistent, each matching a variable with a distribution.
   Inherits from the lisp-matrix base class MATRIX-LIKE, which is
   basically a rectangular table without storage.  We emulate that,
   and add storage, row/column labels, and within-column typing.

   DATAFRAME-LIKE is the basic cases-by-variables framework.  It needs
   to be embedded within other structures which allow for generalized
   relations.  The goal is to ensure that relations imply and drive
   the potential for statistical relativeness such as correlation,
   interference, and similar concepts.

   STORE is the storage component.  It is deliberately absent from
   DATAFRAME-LIKE because it is the primary differentiator between
   implementations: it specifies the structure used for storing the
   actual data, and access methods depend on it.  See DATAFRAME-ARRAY
   and DATAFRAME-MATRIXLIKE for examples.  Everything else here is
   metadata."))
154 ;;; Specializing on superclasses...
156 ;;; Access and Extraction: implementations needed for any storage
157 ;;; type. But here, just to point out that we've got a specializing
158 ;;; virtual subclass (DATAFRAME-LIKE specializing MATRIX-LIKE).
(defgeneric nvars (df)
  (:documentation "Number of variables (columns) represented in the
  storage of the dataframe DF.")
  (:method ((df dataframe-like))
    ;; Variables are the columns.  The former body asked for dimension 0
    ;; (the case/row axis used everywhere else in this file) through a
    ;; STORE reader that is not defined in the visible source; NCOLS is
    ;; implemented for every storage class.
    (ncols df)))
(defgeneric ncases (df)
  (:documentation "Number of cases (independent observations, i.e. rows)
  represented in the storage of the dataframe DF.")
  (:method ((df dataframe-like))
    ;; Cases are the rows.  The former body asked for dimension 1 (the
    ;; variable/column axis) through a STORE reader that is not defined
    ;; in the visible source; NROWS is implemented for every storage
    ;; class.
    (nrows df)))
170 ;; Testing consistency/coherency.
(defgeneric consistent-dataframe-p (df)
  (:documentation "Return T when the metadata of DF agrees with its
  data, NIL otherwise.  Checked: one variable label per column, one
  case label per row and, when variable types were supplied, one type
  per column with every stored value being of its column's type.")
  (:method ((df dataframe-like))
    (let ((nrows (nrows df))
          (ncols (ncols df))
          (types (var-types df)))
      (and
       ;; ensure dimensionality
       (= (length (var-labels df)) ncols)
       (= (length (case-labels df)) nrows)
       ;; When dims are sane, check the type of every cell.  The result
       ;; of TYPEP is now part of the answer (it used to be discarded).
       ;; A dataframe without declared types has nothing to violate.
       ;; XREF bombs if DF is not a df-like subclass, so no further
       ;; specialization is needed here.
       (or (null types)
           (and (= (length types) ncols)
                (loop for i below nrows
                      always (loop for j below ncols
                                   always (typep (xref df i j)
                                                 (nth j types)))))))))))
190 ;;; FUNCTIONS WHICH DISPATCH ON INTERNAL METHODS OR ARGS
192 ;;; Q: change the following to generic functions and dispatch on
193 ;;; array, matrix, and dataframe? Others?
(defun make-labels (initstr num)
  "Generate a list of NUM strings usable as labels, made of the prefix
INITSTR followed by the numbers 1, 2, ..., NUM.  Returns NIL when NUM
is less than 1.
  (make-labels \"a\" 3) ; => (\"a1\" \"a2\" \"a3\")"
  (check-type initstr string)
  ;; One pass, one string per label; no intermediate lists of repeated
  ;; prefixes and of index numbers are built any more.
  (loop for i from 1 to num
        collect (format nil "~A~A" initstr i)))
(defun ncase-store (store)
  "Number of cases (rows) held by the dataframe storage STORE, which
may be an array, a MATRIX-LIKE or a list.  A list is not checked for
being a valid listoflist dataframe structure."
  (cond ((typep store 'array)
         (array-dimension store 0))
        ((typep store 'matrix-like)
         (nrows store))
        ((typep store 'list)
         (length store))
        (t
         ;; same condition type ETYPECASE signals on fall-through
         (error 'type-error
                :datum store
                :expected-type '(or array matrix-like list)))))
(defun nvars-store (store)
  "Return the number of variables (columns) in the dataframe storage
STORE, which may be an array, a list of rows or a MATRIX-LIKE.  For a
list the length of its first row is used, and an empty list has 0
variables; the list is not checked for being a valid listoflist
dataframe structure."
  (etypecase store
    (array (array-dimension store 1))
    ;; FIRST of the empty list is NIL, whose length is 0, so an empty
    ;; store no longer signals an out-of-range error.
    (list (length (first store)))
    (matrix-like (ncols store))))
(defun make-dataframe (newdata
                       &key (vartypes nil)
                            (caselabels nil) (varlabels nil)
                            (doc "no docs"))
  "Helper function to use instead of MAKE-INSTANCE to assure the
construction of a proper dataframe.

NEWDATA is the storage: a list (of rows), an array or a MATRIX-LIKE;
its type selects the class of the result (DATAFRAME-LISTOFLIST,
DATAFRAME-ARRAY or DATAFRAME-MATRIXLIKE).
CASELABELS and VARLABELS are sequences of labels whose lengths must
equal the number of cases and of variables; when omitted, labels
C1, C2, ... and V1, V2, ... are generated.
VARTYPES is stored as the per-variable type list.
DOC is a documentation string stored with the instance.

Signals an error when an argument has the wrong type or when a label
sequence does not match the dimensions of NEWDATA."
  (check-type newdata (or matrix-like array list))
  (check-type caselabels sequence)
  (check-type varlabels sequence)
  (check-type doc string)
  (let ((ncases (ncase-store newdata))
        (nvars (nvars-store newdata)))
    (when caselabels (assert (= ncases (length caselabels))))
    (when varlabels (assert (= nvars (length varlabels))))
    ;; The label slots are typed LIST, so label vectors are converted.
    (let ((newcaselabels (if caselabels
                             (coerce caselabels 'list)
                             (make-labels "C" ncases)))
          (newvarlabels (if varlabels
                            (coerce varlabels 'list)
                            (make-labels "V" nvars))))
      ;; Only the class depends on the storage type; all initargs are
      ;; shared.  DOC is now handed to the instance (it used to be
      ;; validated and then dropped).
      (make-instance (etypecase newdata
                       (list 'dataframe-listoflist)
                       (array 'dataframe-array)
                       (matrix-like 'dataframe-matrixlike))
                     :storage newdata
                     :nrows (length newcaselabels)
                     :ncols (length newvarlabels)
                     :case-labels newcaselabels
                     :var-labels newvarlabels
                     :var-types vartypes
                     :doc doc))))
#|
;; REPL examples for MAKE-DATAFRAME.  They live in a block comment
;; because two of them are documented as signalling an error, which
;; would abort loading the file if they were evaluated at top level.
(make-dataframe #2A((1.2d0 1.3d0) (2.0d0 4.0d0)))
(make-dataframe #2A(('a 1) ('b 2)))
(xref (make-dataframe #2A(('a 1) ('b 2))) 0 1)
(xref (make-dataframe #2A(('a 1) ('b 2))) 1 0)
(make-dataframe 4) ; ERROR, should we allow?
(make-dataframe #2A((4)))
(make-dataframe (rand 10 5)) ;; ERROR, but should work!
|#
(defun row-order-as-list (ary)
  "Pull the elements of the two-dimensional array ARY out into a fresh
list in row order: all of row 0, then all of row 1, and so on.
  (row-order-as-list #2A((1 2 3) (4 5 6))) ; => (1 2 3 4 5 6)"
  (destructuring-bind (nrows ncols) (array-dimensions ary)
    ;; The first subscript is the row.  The result is accumulated and
    ;; returned; the old APPEND call discarded its value, so the
    ;; function always returned NIL.
    (loop for i below nrows
          nconc (loop for j below ncols
                      collect (aref ary i j)))))
(defun col-order-as-list (ary)
  "Pull the elements of the two-dimensional array ARY out into a fresh
list in column order: all of column 0, then all of column 1, and so on.
  (col-order-as-list #2A((1 2 3) (4 5 6))) ; => (1 4 2 5 3 6)"
  (destructuring-bind (nrows ncols) (array-dimensions ary)
    ;; Outer loop over columns, inner loop over rows.  The result is
    ;; accumulated and returned; the old APPEND call discarded its
    ;; value, so the function always returned NIL.
    (loop for j below ncols
          nconc (loop for i below nrows
                      collect (aref ary i j)))))
(defun transpose-array (ary)
  "Map the NxM two-dimensional array ARY to a fresh MxN array whose
element (j, i) is element (i, j) of ARY.  ARY is left untouched.
  (transpose-array #2A((1 2 3) (4 5 6))) ; => #2A((1 4) (2 5) (3 6))"
  (destructuring-bind (nrows ncols) (array-dimensions ary)
    ;; Fill the result cell by cell: :INITIAL-CONTENTS of a
    ;; two-dimensional array must be nested row by row, so a flat list
    ;; of elements cannot be used for it.
    (let ((result (make-array (list ncols nrows)
                              :element-type (array-element-type ary))))
      (dotimes (i nrows result)
        (dotimes (j ncols)
          (setf (aref result j i) (aref ary i j)))))))
301 ;;; THE FOLLOWING 2 dual-sets done to provide error checking
302 ;;; possibilities on top of the generic function structure. Not
303 ;;; intended as make-work!
(defun varlabels (df)
  "Reader for the variable names of the DATAFRAME-LIKE DF: returns
whatever its VAR-LABELS accessor returns.  Exists as a plain function
so that error checking can be layered on top of the accessor; none is
done yet."
  (var-labels df))
(defun set-varlabels (df vl)
  "Writer for the variable names of the DATAFRAME-LIKE DF.  VL replaces
the current variable labels provided it has the same length as they
do; otherwise an error is signalled and DF is left unchanged."
  ;; guard first, then store
  (unless (= (length (var-labels df))
             (length vl))
    (error "wrong size."))
  (setf (var-labels df) vl))

;; (setf (varlabels df) vl) expands into (set-varlabels df vl).
(defsetf varlabels set-varlabels)
318 ;;; Case-name handling for Tables. Needs error checking.
(defun caselabels (df)
  "Reader for the case names of the DATAFRAME-LIKE DF: returns whatever
its CASE-LABELS accessor returns.  Exists as a plain function so that
error checking can be layered on top of the accessor; none is done
yet."
  (case-labels df))
(defun set-caselabels (df cl)
  "Writer for the case names of the DATAFRAME-LIKE DF.  CL replaces the
current case labels provided it has the same length as they do;
otherwise an error is signalled and DF is left unchanged."
  ;; guard first, then store
  (unless (= (length (case-labels df))
             (length cl))
    (error "wrong size."))
  (setf (case-labels df) cl))

;; (setf (caselabels df) cl) expands into (set-caselabels df cl).
(defsetf caselabels set-caselabels)
332 ;;;;;;;;;;;; IMPLEMENTATIONS, with appropriate methods.
333 ;; See also:
334 ;; (documentation 'dataframe-like 'type)
336 ;;;;; DATAFRAME-ARRAY
(defclass dataframe-array (dataframe-like)
  ((store
    :documentation "Data storage: typed as array."
    :accessor dataset
    :initarg :storage
    :initform nil
    :type (array * *)))
  (:documentation "Example implementation of DATAFRAME-LIKE whose
  storage is a Lisp array.  An obvious alternative is a
  dataframe-matrix-like built on the lisp-matrix classes."))
(defmethod nrows ((df dataframe-array))
  "Number of rows: the extent of the first axis of the array stored in
DF.  Specializes the generic inherited from matrix-like in lisp-matrix."
  (let ((backing (dataset df)))
    (array-dimension backing 0)))
(defmethod ncols ((df dataframe-array))
  "Number of columns: the extent of the second axis of the array stored
in DF.  Specializes the generic inherited from matrix-like in
lisp-matrix."
  (let ((backing (dataset df)))
    (array-dimension backing 1)))
(defmethod xref ((df dataframe-array) &rest subscripts)
  "Return the scalar stored in DF at SUBSCRIPTS, in the same vein as
aref, mref, vref, etc.  Exactly two subscripts are required: the first
is the row (case) and the second the column (variable)."
  ;; Exactly two: the former test (>= 2 (length subscripts)) also let
  ;; zero or one subscript through, which then failed inside ELT.
  (assert (= 2 (length subscripts)) ()
          "XREF on a DATAFRAME-ARRAY needs exactly 2 subscripts, got ~S."
          subscripts)
  ;; No integer check on the subscripts here: AREF rejects anything
  ;; that is not a valid array index.
  (destructuring-bind (row col) subscripts
    (aref (dataset df) row col)))
(defmethod (setf xref) (value (df dataframe-array) &rest subscripts)
  "Store VALUE in the array of DF at the cell addressed by the first
two SUBSCRIPTS (row, then column); returns VALUE."
  ;; (check-type val (elt (var-type df) index2))
  (let ((row (elt subscripts 0))
        (col (elt subscripts 1)))
    (setf (aref (dataset df) row col) value)))
(defparameter *default-dataframe-class* 'dataframe-array
  "Name of the class instantiated when a new dataframe is built from an
existing one (see DFSELECT).")
(defmethod dfselect ((df dataframe-array)
                     &optional cases vars indices)
  "Return a new dataframe of class *DEFAULT-DATAFRAME-CLASS* holding the
cells of DF found at the crossing of the case labels in CASES and the
variable labels in VARS, in the order given.  The new dataframe is
labelled with CASES and VARS and inherits the matching variable types.
INDICES is reserved for selection by index and must be NIL for now.
Signals an error when INDICES is supplied or when a label is not
present in DF."
  (if indices (error "Indicies not used yet"))
  (flet ((locate (label all-labels kind)
           ;; EQUAL so that string labels are found; a missing label is
           ;; reported here instead of failing later inside XREF.
           (or (position label all-labels :test #'equal)
               (error "Unknown ~A label ~S." kind label))))
    (let* ((rows (map 'list
                      #'(lambda (c) (locate c (case-labels df) "case"))
                      cases))
           (cols (map 'list
                      #'(lambda (v) (locate v (var-labels df) "variable"))
                      vars))
           (types (var-types df))
           (newdf (make-instance
                   *default-dataframe-class*
                   :storage (make-array (list (length cases) (length vars)))
                   :nrows (length cases)
                   :ncols (length vars)
                   :case-labels (coerce cases 'list)
                   :var-labels (coerce vars 'list)
                   :var-types (when types
                                (mapcar #'(lambda (j) (nth j types)) cols)))))
      (loop for src-row in rows
            for i from 0
            do (loop for src-col in cols
                     for j from 0
                     do (setf (xref newdf i j)
                              (xref df src-row src-col))))
      ;; Hand the selection back; the copying loop alone yields NIL.
      newdf)))
394 ;;; DATAFRAME-MATRIXLIKE
395 ;;;
396 ;;; example/implementation of using lisp-matrix datastructures for
397 ;;; dataframe storage.
(defclass dataframe-matrixlike (dataframe-like)
  ((store
    :documentation "Data storage: typed as matrix-like
    (numerical only)."
    :accessor dataset
    :initarg :storage
    :initform nil
    :type matrix-like))
  (:documentation "Example implementation of DATAFRAME-LIKE whose
  storage is a lisp-matrix structure."))
(defmethod nrows ((df dataframe-matrixlike))
  "Number of rows: MATRIX-DIMENSION 0 of the matrix stored in DF.
Specializes the generic inherited from matrix-like in lisp-matrix."
  (let ((backing (dataset df)))
    (matrix-dimension backing 0)))
(defmethod ncols ((df dataframe-matrixlike))
  "Number of columns: MATRIX-DIMENSION 1 of the matrix stored in DF.
Specializes the generic inherited from matrix-like in lisp-matrix."
  (let ((backing (dataset df)))
    (matrix-dimension backing 1)))
417 ;;; *** FIXME: change mref to xref when we establish lisp-matrix
418 ;;; change to use xarray access facility. Need to dummy-proof the
419 ;;; following.
(defmethod xref ((df dataframe-matrixlike) &rest subscripts)
  "Return the scalar found by MREF in the matrix stored in DF at the
first two SUBSCRIPTS, in the same vein as aref, mref, vref, etc.  The
subscripts are row/col, i.e. case/var."
  (let ((row (elt subscripts 0))
        (col (elt subscripts 1)))
    (mref (dataset df) row col)))
(defmethod (setf xref) (value (df dataframe-matrixlike) &rest subscripts)
  "Store VALUE, through MREF, in the matrix of DF at the cell addressed
by the first two SUBSCRIPTS (row, then column)."
  ;; NEED TO CHECK TYPE!
  ;; (check-type val (elt (vartype df) index2))
  (let ((row (elt subscripts 0))
        (col (elt subscripts 1)))
    (setf (mref (dataset df) row col) value)))
434 ;;; DATAFRAME-LISTOFLIST
435 ;;;
436 ;;; example/implementation of using a list of lists (one inner
437 ;;; list per case) for dataframe storage.
(defclass dataframe-listoflist (dataframe-like)
  ((store :initform nil
          :initarg :storage
          :type list
          :accessor dataset
          ;; The slot text used to be a copy of the matrix-like one.
          :documentation "Data storage: typed as list.  The access
          methods of this class treat it as a list of rows (cases),
          each row being a sequence of values, one per variable."))
  (:documentation "Example implementation of DATAFRAME-LIKE using
  storage based on a list of lists."))
(defmethod nrows ((df dataframe-listoflist))
  "Number of rows: the length of the list stored in DF."
  (let ((rows (dataset df)))
    (length rows)))
(defmethod ncols ((df dataframe-listoflist))
  "Number of columns: the length of the first row of the list stored in
DF, or 0 when DF holds no rows at all."
  ;; FIRST of an empty store is NIL (length 0); ELT with index 0 would
  ;; signal an error on the empty list instead.
  (length (first (dataset df))))
(defmethod xref ((df dataframe-listoflist) &rest subscripts)
  "Return the scalar stored in DF at the first two SUBSCRIPTS, in the
same vein as aref, mref, vref, etc.: the first subscript picks the row
\(case) out of the stored list, the second picks the element (var) out
of that row."
  (let ((row (elt (dataset df) (elt subscripts 0))))
    (elt row (elt subscripts 1))))
(defmethod (setf xref) (value (df dataframe-listoflist) &rest subscripts)
  "Store VALUE in the list-of-lists storage of DF at the cell addressed
by the first two SUBSCRIPTS: row (case) first, then column (variable),
the same order the XREF reader of this class uses.  Returns VALUE."
  ;; NEED TO CHECK TYPE!
  ;; (check-type val (elt (vartype df) index2))
  ;; Subscript 0 selects the row and subscript 1 the element within it.
  ;; The two were swapped before, so a write landed in the transposed
  ;; cell (or failed on a non-square dataframe).
  (setf (elt (elt (dataset df) (elt subscripts 0)) (elt subscripts 1))
        value))
468 ;;;;;; IMPLEMENTATION INDEPENDENT FUNCTIONS AND METHODS
469 ;;;;;; (use only xref, nrows, ncols and similar dataframe-like
470 ;;;;;; components as core).
(defun xref-var (df index return-type)
  "Return the data of the single variable (column) INDEX of DF, one
value per case, in the representation named by RETURN-TYPE:
  :LIST or LIST     -- a fresh list;
  :VECTOR or VECTOR -- a fresh simple vector;
  :VECTOR-LIKE, :MATRIX-LIKE, :DATAFRAME -- not implemented yet,
                       the placeholder value T is returned.
Any other RETURN-TYPE signals an error."
  (flet ((column-values ()
           ;; Walk the rows, zero-based, using only NROWS and XREF so
           ;; that every storage class works.  The row subscript comes
           ;; first, the variable INDEX second.
           (loop for row below (nrows df)
                 collect (xref df row index))))
    (ecase return-type
      ;; The old clause key ('list) was the list (QUOTE LIST), which no
      ;; argument is ever EQL to; accept the symbol and the keyword.
      ((:list list) (column-values))
      ((:vector vector) (coerce (column-values) 'vector))
      (:vector-like t)
      (:matrix-like t)
      (:dataframe t))))
(defun xref-case (df index return-type)
  "Return the data of the single case (row) INDEX of DF, one value per
variable, in the representation named by RETURN-TYPE:
  :LIST or LIST     -- a fresh list;
  :VECTOR or VECTOR -- a fresh simple vector;
  :VECTOR-LIKE, :MATRIX-LIKE, :DATAFRAME -- not implemented yet,
                       the placeholder value T is returned.
Any other RETURN-TYPE signals an error."
  (flet ((row-values ()
           ;; Walk the columns, zero-based, using only NCOLS and XREF so
           ;; that every storage class works.  The case INDEX comes
           ;; first, the column subscript second.
           (loop for col below (ncols df)
                 collect (xref df index col))))
    (ecase return-type
      ((:list list) (row-values))
      ((:vector vector) (coerce (row-values) 'vector))
      (:vector-like t)
      (:matrix-like t)
      (:dataframe t))))
497 ;; FIXME
(defun xref-2indexlist (df indexlist1 indexlist2 &key (return-type :array))
  "Extract from DF the cells at the crossing of the row indices in the
list INDEXLIST1 and the column indices in the list INDEXLIST2.
RETURN-TYPE selects the result:
  :ARRAY     -- a fresh (length INDEXLIST1) x (length INDEXLIST2) array;
  :DATAFRAME -- a fresh DATAFRAME-ARRAY over such an array, carrying the
                documentation of DF and the case labels, variable
                labels and variable types of the selected rows and
                columns.
Any other RETURN-TYPE yields NIL.  FIXME TESTME"
  (let ((nr (length indexlist1))
        (nc (length indexlist2)))
    (flet ((extract ()
             ;; Fill a fresh array cell by cell: a flat list is not a
             ;; valid :INITIAL-CONTENTS for a two-dimensional array.
             (let ((result (make-array (list nr nc))))
               (loop for src-row in indexlist1
                     for i from 0
                     do (loop for src-col in indexlist2
                              for j from 0
                              do (setf (aref result i j)
                                       (xref df src-row src-col))))
               result))
           (pick (items indices)
             ;; The elements of ITEMS found at INDICES, in that order.
             (mapcar #'(lambda (k) (nth k items)) indices)))
      (case return-type
        (:array (extract))
        (:dataframe
         (make-instance 'dataframe-array
                        ;; EXTRACT builds a copy holding only the
                        ;; selected cells, so the dimensions, labels
                        ;; and data of the result agree.
                        :storage (extract)
                        :nrows nr
                        :ncols nc
                        :doc (doc-string df)
                        :case-labels (pick (case-labels df) indexlist1)
                        :var-labels (pick (var-labels df) indexlist2)
                        :var-types (when (var-types df)
                                     (pick (var-types df) indexlist2))))))))
522 ;;; Do we establish methods for dataframe-like, which specialize to
523 ;;; particular instances of storage?
(defmethod print-object ((object dataframe-like) stream)
  ;; Unreadable printed form: the type and the dimensions on the first
  ;; line, then a header line of variable labels, then one line per
  ;; case starting with its case label.
  (print-unreadable-object (object stream :type t)
    (let ((n-cases (nrows object))
          (n-vars (ncols object)))
      (format stream " ~d x ~d" n-cases n-vars)
      (terpri stream)
      ;; (format stream "~T ~{~S ~T~}" (var-labels object))
      ;; header: two tabs in front of every variable label
      (loop for col from 0 below n-vars
            do (loop repeat 2
                     do (write-char #\tab stream))
               (format stream "~T~A~T" (nth col (var-labels object))))
      ;; body: case label, then the tab-separated values of the case
      (loop for row from 0 below n-cases
            do (terpri stream)
               (format stream "~A:~T" (nth row (case-labels object)))
               (loop for col from 0 below n-vars
                     do (write-char #\tab stream) ; (write-char #\space stream)
                        ;; (write (xref object row col) :stream stream)
                        ;; if this works, a general output mechanism
                        ;; control is still needed
                        (format stream "~7,3E" (xref object row col)))))))
(defun print-structure-relational (ds)
  "Example of what we want the methods to look like.  Should be sort of
like a graph of spreadsheets if the storage is a relational structure:
for every relation of DS, print its variable labels as a row and then
one row per case, made of the case label followed by that case's
observation."
  (loop for relation-key in (relations ds)
        for relation = (getRelation ds relation-key)
        do (print-as-row (var-labels relation))
           (loop for case-label in (case-labels relation)
                 for row-number from 0
                 do (print-as-row
                     (cons case-label
                           (xref-obsn (dataset relation) row-number))))))
(defun testecase (s)
  "Scratch check of ECASE clause syntax: maps the symbol SCALAR to 1 and
either of ASD and ASDF to 2; anything else makes ECASE signal its
fall-through error."
  (ecase s
    (scalar 1)       ; single key, written without the list wrapper
    ((asd asdf) 2))) ; several keys sharing one clause
#|
;; REPL checks for TESTECASE.  They live in a block comment because the
;; last one signals an error, which would abort loading the file if it
;; were evaluated at top level.
(testecase 'scalar) ; => 1
(testecase 'asd)    ; => 2
(testecase 'asdf)   ; => 2
(testecase 'as)     ; signals an error: no ECASE clause matches
|#
570 ;;; Vector-like generalizations: we consider observation-like and
571 ;;; variable-like to be abstract classes which provide row and column
572 ;;; access to dataframe structures. These will be specialized, in
573 ;;; that rows correspond to an observation (or case?) which are
574 ;;; multitype, while columns correspond to a variable, which must be
575 ;;; singularly typed.
(defclass observation-like (dataframe-like)
  ;; No slots of its own.  The (empty) slot list must be present:
  ;; without it the :DOCUMENTATION option is parsed as a slot
  ;; specifier and the definition is rejected.
  ()
  (:documentation "dataframe-like with only 1 row, is an observation-like."))
(defclass variable-like (dataframe-like)
  ;; No slots of its own.  The (empty) slot list must be present:
  ;; without it the :DOCUMENTATION option is parsed as a slot
  ;; specifier and the definition is rejected.
  ()
  (:documentation "dataframe-like with only 1 column is a variable-like."))
585 ;;; Need to implement views, i.e. dataframe-view-like,
586 ;;; observation-view-like, variable-view-like.
588 ;;; Need to consider read-only variants, leveraging the xref
589 ;;; strategy.
595 ;;;;;;;; from dataframe-xarray experiment
(defmethod xref ((obj dataframe-like) &rest subscripts)
  "For dataframe-like, dispatch on the storage object: XREF is called
again on the value of (DATASET OBJ) with the same SUBSCRIPTS."
  ;; APPLY spreads the &REST list back into separate subscripts; passing
  ;; SUBSCRIPTS itself would hand the storage one list-valued subscript.
  (apply #'xref (dataset obj) subscripts))
(defmethod (setf xref) (value (obj dataframe-like) &rest subscripts)
  "For dataframe-like, dispatch on the storage object: (SETF XREF) is
called again on the value of (DATASET OBJ) with the same SUBSCRIPTS to
store VALUE."
  ;; APPLY spreads the &REST list back into separate subscripts; passing
  ;; SUBSCRIPTS itself would hand the storage one list-valued subscript.
  (apply #'(setf xref) value (dataset obj) subscripts))
(defmethod xref ((obj matrix-like) &rest indices)
  "XREF on a MATRIX-LIKE: return the element MREF finds at INDICES."
  ;; An empty body made every access on a MATRIX-LIKE return NIL.
  ;; Delegating to MREF is the bridge the FIXME elsewhere in this file
  ;; asks for until lisp-matrix adopts the xarray access facility.
  ;; NOTE(review): assumes MREF takes the matrix followed by its
  ;; subscripts, as it is called elsewhere in this file -- confirm.
  (apply #'mref obj indices))
(defmethod xtype ((obj dataframe-like))
  "Unlike the standard xtype, here we need to return a vector of the
types.  Vectors can have single types, but arrays have single type.
Dataframe-like have multiple types, variable-like single type,
case-like has multiple types, and matrix-like has single type."
  ;; A string that is the only form of a body is the return value, not
  ;; documentation, so the method used to return its own description.
  ;; Return the declared variable types as a vector, one per column
  ;; (an empty vector when no types were declared).
  (coerce (var-types obj) 'vector))
(defmethod xdims ((obj dataframe-like))
  "Dimensions of the dataframe OBJ as the list (number-of-rows
number-of-columns)."
  ;; Built from NROWS and NCOLS, which are implemented for every
  ;; storage class; DATAFRAME-DIMENSIONS, called here before, is not
  ;; defined in the visible source.
  (list (nrows obj) (ncols obj)))
;; use default methods at this point, except for potentially weird DFs
;;
;; The placeholder is kept as a comment only: a method with an empty
;; lambda list and no body receives no object to report on, and
;; defining it contradicts the intent stated above.
;; (defmethod xdims* ())
(defmethod xdim ((obj dataframe-like) index)
  "Extent of the dataframe OBJ along axis INDEX: 0 is the row (case)
axis, 1 the column (variable) axis.  Any other INDEX signals an error."
  ;; The former body dropped OBJ and called DATAFRAME-DIMENSION, which
  ;; is not defined in the visible source.
  (ecase index
    (0 (nrows obj))
    (1 (ncols obj))))
;; Placeholders for the rest of the xarray protocol.  They are kept as
;; comments only: each had an empty lambda list and no body, so it
;; specialized on nothing and returned nothing.
;; NOTE(review): presumably they would also clash with the lambda lists
;; of the xarray generic functions of the same names -- confirm
;; against xarray before reinstating any of them.
;; (defmethod xrank ())
;; (defmethod slice ())
;; (defmethod take ())
;; (defmethod carray ())