migrate all listoflist support with matrix/array/dataframe type objects to the result...
[CommonLispStat.git] / src / data / dataframe.lisp
blob18ed6560903a61205a4de54df57a0bbb4064ae34
1 ;;; -*- mode: lisp -*-
3 ;;; Time-stamp: <2009-09-25 08:01:46 tony>
4 ;;; Creation: <2008-03-12 17:18:42 blindglobe@gmail.com>
5 ;;; File: dataframe.lisp
6 ;;; Author: AJ Rossini <blindglobe@gmail.com>
7 ;;; Copyright: (c)2008, AJ Rossini. BSD, LLGPL, or GPLv2, depending
8 ;;; on how it arrives.
10 ;;; Purpose: Data packaging and access for Common Lisp Statistics.
11 ;;; This redoes dataframe structures in a CLOS based
12 ;;; framework. Currently contains the virtual class
13 ;;; DATAFRAME-LIKE as well as the actual classes
14 ;;; DATAFRAME-ARRAY and DATAFRAME-MATRIXLIKE
16 ;;; What is this talk of 'release'? Klingons do not make software
17 ;;; 'releases'. Our software 'escapes', leaving a bloody trail of
18 ;;; designers and quality assurance people in its wake.
20 (in-package :cls-dataframe)
22 ;;; No real basis for work, there is a bit of new-ness and R-ness to
23 ;;; this work. In particular, the notion of relation is key and
24 ;;; integral to the analysis. Tables are related and matched vectors,
25 ;;; for example. "column" vectors are related observations (by
26 ;;; measure/recording) while "row" vectors are related readings (by
27 ;;; case, independence). This does mean that we are placing
28 ;;; statistical semantics into the computational data object -- and
29 ;;; that it is a violation of use to consider rows which are not at
30 ;;; the least conditionally independent (though the conditioning
31 ;;; should be outside the data set, not internally specified).
33 ;;; So we want a verb-driven API for data collection construction. We
34 ;;; should encode independence or lack of, as a computable status.
36 ;;; Need to figure out statistically-typed vectors. We then map a
37 ;;; series of typed vectors over to tables where columns are equal
38 ;;; typed. In a sense, this is a relation (1-1) of equal-typed
39 ;;; arrays. For the most part, this ends up making the R data.frame
40 ;;; into a relational building block (considering 1-1 mappings using
41 ;;; row ID as a relation). Is this a worthwhile generalization or
42 ;;; communicable analogy?
;;; verbs vs semantics for DF construction -- consider the possibility
;;; of how adverbs and verbs relate, where to put which semantically
;;; to allow for a general approach.
48 ;;; Need to consider modification APIs
49 ;;; actions are:
50 ;;; - import
51 ;;; - get/set row names (case names)
52 ;;; - column names (variable names)
53 ;;; - dataset values
54 ;;; - annotation/metadata
55 ;;; - make sure that we do coherency checking in the exported
56 ;;; - functions.
57 ;;; - ...
;;;    - reshapeData/reformat/reshape: return a reformed version of the
;;;      dataset (no additional input).
60 ;;; - either overwriting or not, i.e. with or without copy.
61 ;;; - check consistency of resulting data with metadata and related
62 ;;; data information.
64 ;;; Is there any need for an N-way dataframe (N>2) ? Am currently
65 ;;; assuming not, that this is specializing only the
66 ;;; "independent cases"-by-variables format and that there would be
67 ;;; other tools for other structures.
69 ;;; Misc Functions (to move into a lisp data manipulation support package)
71 ;; the next two should be merged into a general replicator pattern.
(defun gen-seq (n &optional (start 1))
  "Return the list of consecutive integers START, START+1, ..., N.
With the default START of 1 the result has length N, which is how it
is used for generating 1-based indices and labels.  Returns NIL when
N is less than START."
  ;; One LOOP pass.  The previous recursive APPEND formulation copied
  ;; the partial result on every step (quadratic time) and used stack
  ;; depth proportional to N.
  (loop for i from start to n
        collect i))
(defun repeat-seq (n item)
  "Return a fresh list holding ITEM repeated N times (NIL when N < 1).
The very same ITEM object is stored in every position; it is not copied.
  (repeat-seq 3 \"d\")        ; => (\"d\" \"d\" \"d\")
  (repeat-seq 3 'd)         ; => (d d d)
  (repeat-seq 3 (list 1 2)) ; => ((1 2) (1 2) (1 2))"
  ;; LOOP REPEAT runs zero times for zero or negative N, matching the
  ;; old behaviour, without the quadratic recursive APPEND.
  (loop repeat n
        collect item))
(defun strsym->indexnum (df strsym)
  "Return the zero-based index of the DF column labelled by STRSYM, or
NIL when no variable label matches.  STRSYM may be a string or a symbol.
Probably should be a method dispatching on DATAFRAME-LIKE type."
  ;; POSITION defaults to EQL, under which two distinct string objects
  ;; with the same characters never match.  EQUAL compares strings by
  ;; content and still compares symbols by identity.
  (position strsym (varlabels df) :test #'equal))
(defun string->number (str)
  "Convert the string STR, which should hold the printed form of a
number, into that number.  Two values are returned: the object read
and a boolean that is true only when that object is a number.
  (string->number \"123\")  ; => 123, T
  (string->number \"1.23\") ; => 1.23, T
  (string->number \"abc\")  ; => ABC, NIL
  (string->number \"\")     ; => NIL, NIL"
  ;; *READ-EVAL* is bound to NIL so that \"#.(...)\" cannot run code.
  (let ((*read-eval* nil))
    (handler-case
        (let ((num (read-from-string str)))
          (values num (numberp num)))
      ;; Empty or truncated input signals END-OF-FILE; malformed input
      ;; (including a rejected \"#.\") signals READER-ERROR.  Report both
      ;; through the success flag instead of unwinding into the caller.
      ((or end-of-file reader-error) ()
        (values nil nil)))))
;; REPL scratch forms exploring how POSITION locates an item in a list.
;; They have no effect on the dataframe code apart from defining the
;; special variable *TEST-POS*.
(equal 'testme 'testme)
(defparameter *test-pos* 'testme)
(position *test-pos* (list 'a 'b 'testme 'c))
;; NOTE(review): the next two forms pass a function object as the ITEM
;; argument, so POSITION compares that function against each element
;; and returns NIL.  POSITION-IF is the operator that takes a predicate.
(position #'(lambda (x) (equal x "testme")) (list "a" "b" "testme" "c"))
(position #'(lambda (x) (equal x 1)) (list 2 1 3 4))
108 ;;; abstract dataframe class
(defclass dataframe-like (matrix-like)
  ((case-labels
    :initarg :case-labels
    :accessor case-labels
    :type list
    :initform nil
    :documentation "Labels describing the cases (rows); documentation
    metadata that may also be used for merging.")
   (var-labels
    :initarg :var-labels
    :accessor var-labels
    :type list
    :initform nil
    :documentation "Names of the variables (columns).")
   (var-types
    :initarg :var-types
    :accessor var-types
    :type list
    :initform nil
    :documentation "Per-variable types used to ensure fit.  Must be a
    list of symbols naming types that are valid for CHECK-TYPE.")
   (doc-string
    :initarg :doc
    :accessor doc-string
    :initform nil
    :documentation "Additional information about the dataframe-like
    instance; potentially uncomputable, possibly metadata."))
  (:documentation "Abstract class for the standard statistical
  analysis dataset for independent data.  Rows are considered to be
  independent, matching observations.  Columns are considered to be
  type-consistent, each matching a variable with a distribution.
  Inherits from the lisp-matrix base class MATRIX-LIKE, which is
  basically a rectangular table without storage.  We emulate that,
  and add storage, row/column labels, and within-column typing.

  DATAFRAME-LIKE is the basic cases-by-variables framework.  It needs
  to be embedded within other structures which allow for generalized
  relations.  The goal is to ensure that relations imply and drive
  the potential for statistical relativeness such as correlation,
  interference, and similar concepts.

  STORE is the storage component.  It is ignored in the
  DATAFRAME-LIKE class because it is the primary differentiator
  between subclasses: it specifies the structure used for storing the
  actual data, and access methods depend on it.  The only critical
  requirement is that STORE be xarray-compliant.  Examples of such
  mixins are DATAFRAME-ARRAY and DATAFRAME-MATRIXLIKE.  The rest of
  this structure is metadata."))
158 ;;; Specializing on superclasses...
160 ;;; Access and Extraction: implementations needed for any storage
161 ;;; type. But here, just to point out that we've got a specializing
162 ;;; virtual subclass (DATAFRAME-LIKE specializing MATRIX-LIKE).
(defgeneric nvars (df)
  (:documentation "Number of variables (columns) represented in the
  storage of DF.")
  (:method ((df dataframe-like))
    ;; Variables are columns, i.e. the second axis (index 1) of the
    ;; store.  This agrees with NVARS-STORE, which reads
    ;; (array-dimension store 1).  The method previously read axis 0,
    ;; which is the number of cases.
    (xdim (store df) 1)))
(defgeneric ncases (df)
  (:documentation "Number of cases (independent observations, i.e.
  rows) represented in the storage of DF.")
  (:method ((df dataframe-like))
    ;; Cases are rows, i.e. the first axis (index 0) of the store.
    ;; This agrees with NCASE-STORE, which reads
    ;; (array-dimension store 0).  The method previously read axis 1,
    ;; which is the number of variables.
    (xdim (store df) 0)))
174 ;; Testing consistency/coherency.
(defgeneric consistent-dataframe-p (df)
  (:documentation "Methods to check for consistency.  Mostly of
  internal interest, since ideally we'd have to use standard
  constructs to ensure that we do not get the dataframe structure
  misaligned.  Returns true when DF is a consistent dataframe and NIL
  otherwise.")
  (:method (object)
    "General objects are not consistent dataframes!"
    (declare (ignore object))
    nil)
  (:method ((df dataframe-like))
    "At minimum, must dispatch on virtual-class.  Checks that the
    number of variable labels equals (NCOLS DF), that the number of
    case labels equals (NROWS DF) and, when VAR-TYPES is non-NIL, that
    every element of column J satisfies the J-th entry of VAR-TYPES."
    (let ((row-count (nrows df))
          (col-count (ncols df))
          (types (var-types df)))
      (and
       ;; ensure dimensionality
       (= (length (var-labels df)) col-count)
       (= (length (case-labels df)) row-count)
       ;; When dims are sane, ensure variable-typing is consistent.
       ;; The earlier version evaluated TYPEP inside DOTIMES and threw
       ;; the result away, so this clause could never fail.  A NIL
       ;; VAR-TYPES (the MAKE-DATAFRAME default) means "untyped" and
       ;; is accepted; testing against it would reject every element,
       ;; because nothing is of type NIL.
       (or (null types)
           (loop for i from 0 below row-count
                 always (loop for j from 0 below col-count
                              ;; xref bombs if not a df-like subclass
                              ;; so we don't worry about
                              ;; specialization.
                              always (typep (xref df i j)
                                            (nth j types)))))))))
199 ;;; FUNCTIONS WHICH DISPATCH ON INTERNAL METHODS OR ARGS
201 ;;; Q: change the following to generic functions and dispatch on
202 ;;; array, matrix, and dataframe? Others?
(defun make-labels (initstr num)
  "Generate a list of NUM label strings formed by appending the
integers 1..NUM to the string INITSTR, e.g.
  (make-labels \"a\" 3) ; => (\"a1\" \"a2\" \"a3\")
Returns NIL when NUM is less than 1.  Signals a correctable TYPE-ERROR
(via CHECK-TYPE) when INITSTR is not a string."
  (check-type initstr string)
  ;; Build each label directly in one pass.  The former version first
  ;; materialised a list of NUM copies of INITSTR and a list of NUM
  ;; number strings, both through quadratic-time helpers.
  (loop for i from 1 to num
        collect (format nil "~A~A" initstr i)))
(defun ncase-store (store)
  "Count the cases (rows) held by the dataframe storage object STORE,
which must be a list, an array or a MATRIX-LIKE; anything else
signals a TYPE-ERROR.  For a list the count is simply its length: no
check is made that it is a valid listoflist dataframe structure."
  (etypecase store
    ;; list of rows: one top-level element per case
    (list (length store))
    ;; array: cases run along the first axis
    (array (array-dimension store 0))
    (matrix-like (nrows store))))
(defun nvars-store (store)
  "Return the number of variables (columns) in the dataframe storage
object STORE, which must be a list, an array or a MATRIX-LIKE;
anything else signals a TYPE-ERROR.  For a list the length of its
first element is used, and an empty list has 0 variables.  Doesn't
test that the list is a valid listoflist dataframe structure."
  (etypecase store
    ;; FIRST of the empty list is NIL, whose LENGTH is 0, so an empty
    ;; listoflist reports 0 columns instead of failing in
    ;; (elt store 0).
    (list (length (first store)))
    ;; array: variables run along the second axis
    (array (array-dimension store 1))
    (matrix-like (ncols store))))
(defun make-dataframe (newdata
                       &key (vartypes nil)
                         (caselabels nil) (varlabels nil)
                         (doc "no docs"))
  "Helper function to use instead of MAKE-INSTANCE to assure
construction of a proper dataframe.

NEWDATA is the storage: a list (of lists), an array or a MATRIX-LIKE.
The class instantiated is DATAFRAME-LISTOFLIST, DATAFRAME-ARRAY or
DATAFRAME-MATRIXLIKE respectively.
CASELABELS and VARLABELS are sequences of row and column labels; when
omitted, labels \"C1\", \"C2\", ... and \"V1\", \"V2\", ... are
generated.  When supplied, their lengths must match the number of
cases and variables in NEWDATA (checked with ASSERT).
VARTYPES is stored as the VAR-TYPES of the result.
DOC is a documentation string stored with the result."
  (check-type newdata (or matrix-like array list))
  (check-type caselabels sequence)
  (check-type varlabels sequence)
  (check-type doc string)
  (let ((ncases (ncase-store newdata))
        (nvars (nvars-store newdata)))
    (when caselabels (assert (= ncases (length caselabels))))
    (when varlabels (assert (= nvars (length varlabels))))
    ;; Labels are accepted as any sequence, but the label slots are
    ;; declared :TYPE LIST and are read with NTH when printing, so
    ;; normalise to lists.  COERCE returns a list argument unchanged.
    (let ((newcaselabels (if caselabels
                             (coerce caselabels 'list)
                             (make-labels "C" ncases)))
          (newvarlabels (if varlabels
                            (coerce varlabels 'list)
                            (make-labels "V" nvars))))
      ;; The three storage kinds differ only in the class instantiated,
      ;; so select the class and share a single MAKE-INSTANCE call.
      (make-instance (etypecase newdata
                       (list 'dataframe-listoflist)
                       (array 'dataframe-array)
                       (matrix-like 'dataframe-matrixlike))
                     :storage newdata
                     :nrows (length newcaselabels)
                     :ncols (length newvarlabels)
                     :case-labels newcaselabels
                     :var-labels newvarlabels
                     :var-types vartypes
                     ;; DOC used to be type-checked and then dropped.
                     :doc doc))))
#|
;; REPL examples for MAKE-DATAFRAME.  They are kept inside a block
;; comment because, as top-level forms, they would run every time this
;; file is loaded, and at least the forms marked ERROR below signal
;; errors, which would abort the load.
(make-dataframe #2A((1.2d0 1.3d0) (2.0d0 4.0d0)))
(make-dataframe #2A(('a 1) ('b 2)))
(xref (make-dataframe #2A(('a 1) ('b 2))) 0 1)
(xref (make-dataframe #2A(('a 1) ('b 2))) 1 0)
(make-dataframe 4) ; ERROR, should we allow?
(make-dataframe #2A((4)))
(make-dataframe (rand 10 5)) ;; ERROR, but should work!
|#
(defun row-order-as-list (ary)
  "Pull the elements of the two-dimensional array ARY out into a
fresh flat list in row order (row-major): all of row 0, then all of
row 1, and so on."
  (destructuring-bind (nrows ncols) (array-dimensions ary)
    ;; The previous body called APPEND and discarded its value, so the
    ;; function always returned NIL; it also bounded the row index by
    ;; the column count, which runs out of range on non-square arrays.
    (loop for i from 0 below nrows
          nconc (loop for j from 0 below ncols
                      collect (aref ary i j)))))
(defun col-order-as-list (ary)
  "Pull the elements of the two-dimensional array ARY out into a
fresh flat list in column order (column-major): all of column 0, then
all of column 1, and so on."
  (destructuring-bind (nrows ncols) (array-dimensions ary)
    ;; The previous body called APPEND and discarded its value, so the
    ;; function always returned NIL, and its docstring described row
    ;; order.  The column index is the outer loop here so that each
    ;; column's elements come out contiguously.
    (loop for j from 0 below ncols
          nconc (loop for i from 0 below nrows
                      collect (aref ary i j)))))
(defun transpose-array (ary)
  "Map the N x M two-dimensional array ARY to a fresh M x N array
whose element (j, i) is element (i, j) of ARY.  ARY is not modified."
  (destructuring-bind (nrows ncols) (array-dimensions ary)
    ;; Fill the result element by element.  MAKE-ARRAY's
    ;; :INITIAL-CONTENTS must be nested to match the dimensions, so
    ;; the flat list handed to it before could not initialise a
    ;; rank-2 array.
    (let ((result (make-array (list ncols nrows))))
      (dotimes (i nrows result)
        (dotimes (j ncols)
          (setf (aref result j i) (aref ary i j)))))))
310 ;;; THE FOLLOWING 2 dual-sets done to provide error checking
311 ;;; possibilities on top of the generic function structure. Not
312 ;;; intended as make-work!
(defun varlabels (df)
  "Reader for the variable (column) names of the DATAFRAME-LIKE DF;
returns (VAR-LABELS DF).  Needs error checking."
  (var-labels df))

(defun set-varlabels (df vl)
  "Writer for the variable (column) names of the DATAFRAME-LIKE DF.
VL replaces the current labels and must have the same length as them;
otherwise an error is signalled and DF is left unchanged.  Returns VL.
Needs further error checking."
  ;; Guard first, then store.
  (unless (= (length (var-labels df))
             (length vl))
    (error "wrong size."))
  (setf (var-labels df) vl))

;; Makes (setf (varlabels df) vl) expand into (set-varlabels df vl).
(defsetf varlabels set-varlabels)
327 ;;; Case-name handling for Tables. Needs error checking.
(defun caselabels (df)
  "Reader for the case (row) names of the DATAFRAME-LIKE DF; returns
(CASE-LABELS DF).  Needs error checking."
  (case-labels df))

(defun set-caselabels (df cl)
  "Writer for the case (row) names of the DATAFRAME-LIKE DF.  CL
replaces the current labels and must have the same length as them;
otherwise an error is signalled and DF is left unchanged.  Returns CL.
Needs further error checking."
  ;; Guard first, then store.
  (unless (= (length (case-labels df))
             (length cl))
    (error "wrong size."))
  (setf (case-labels df) cl))

;; Makes (setf (caselabels df) cl) expand into (set-caselabels df cl).
(defsetf caselabels set-caselabels)
341 ;;;;;;;;;;;; IMPLEMENTATIONS, with appropriate methods.
342 ;; See also:
343 ;; (documentation 'dataframe-like 'type)
348 ;;; Do we establish methods for dataframe-like, which specialize to
349 ;;; particular instances of storage?
(defmethod print-object ((object dataframe-like) stream)
  "Print OBJECT unreadably as a labelled table: the type and its
  `rows x cols' size, a line of variable labels, then one line per case
  made of the case label followed by that row's values, each written
  with the ~7,3E format directive."
  (flet ((tab ()
           (write-char #\tab stream)))
    (print-unreadable-object (object stream :type t)
      (let ((row-count (nrows object))
            (col-count (ncols object)))
        (format stream " ~d x ~d" row-count col-count)
        (terpri stream)
        ;; header line: variable labels, two tabs ahead of each
        (loop for col from 0 below col-count
              do (tab)
                 (tab)
                 (format stream "~T~A~T" (nth col (var-labels object))))
        ;; one line per observation
        (loop for row from 0 below row-count
              do (terpri stream)
                 (format stream "~A:~T" (nth row (case-labels object)))
                 (loop for col from 0 below col-count
                       do (tab)
                          ;; if works, need to include a general
                          ;; output mechanism control
                          (format stream "~7,3E" (xref object row col))))))))
(defun print-structure-relational (ds)
  "Example of what we want the methods to look like.  Should be sort
of like a graph of spreadsheets if the storage is a relational
structure.  For every relation of DS this prints the variable labels
as a row, then one row per case made of the case label followed by
that case's observation."
  (dolist (relation-key (relations ds))
    (let ((relation (getRelation ds relation-key)))
      (print-as-row (var-labels relation))
      ;; Walk the case labels alongside a zero-based row counter.
      (loop for case-label in (case-labels relation)
            for row-index from 0
            do (print-as-row
                (cons case-label
                      (xref-obsn (dataset relation) row-index)))))))
(defun testecase (s)
  "Scratch function demonstrating ECASE with single and grouped keys.
Returns 1 for the symbol SCALAR and 2 for ASD or ASDF; any other
argument signals a TYPE-ERROR."
  (ecase s
    (scalar 1)
    ((asd asdf) 2)))
#|
;; REPL examples for TESTECASE.  They are kept inside a block comment
;; because the last call has no matching ECASE clause and would signal
;; an error every time this file is loaded.
(testecase 'scalar)
(testecase 'asd)
(testecase 'asdf)
(testecase 'as) ; ERROR: no matching clause
|#
396 ;;; Vector-like generalizations: we consider observation-like and
397 ;;; variable-like to be abstract classes which provide row and column
398 ;;; access to dataframe structures. These will be specialized, in
399 ;;; that rows correspond to an observation (or case?) which are
400 ;;; multitype, while columns correspond to a variable, which must be
401 ;;; singularly typed.
(defclass observation-like (dataframe-like)
  ;; No additional slots.  The (empty) slot list is still required:
  ;; without it DEFCLASS takes the :DOCUMENTATION option as the slot
  ;; list and the form is malformed.
  ()
  (:documentation "dataframe-like with only 1 row, is an observation-like."))
(defclass variable-like (dataframe-like)
  ;; No additional slots.  The (empty) slot list is still required:
  ;; without it DEFCLASS takes the :DOCUMENTATION option as the slot
  ;; list and the form is malformed.
  ()
  (:documentation "dataframe-like with only 1 column is a variable-like."))
411 ;;; Need to implement views, i.e. dataframe-view-like,
412 ;;; observation-view-like, variable-view-like.
416 ;;; Need to consider read-only variants, leveraging the xref
417 ;;; strategy.
421 ;;; Dataframe <-> Listoflist support
422 ;; the following will be handy to help out folks adjust. It should
423 ;; provide a means to write code faster and better.
425 ;; leverages listoflist support in our version of xarray
(defun listoflist->dataframe (lol) ; &key (type :row-major))
  "Create a cases-by-variables data frame consisting of numeric data,
from a ROW-MAJOR list-of-lists representation LOL.  A COLUMN-MAJOR
representation should be handled using the transpose-listoflists
function.  Signals an error when the inner lists differ in length;
otherwise returns the dataframe built by MAKE-DATAFRAME."
  (check-type lol list) ; imperfect -- must verify all list elements are also lists.
  (unless (lists-of-same-size lol)
    (error "listoflist->dataframe: no combining different length lists"))
  ;; A stray trailing (error \"... proposed name exists\") form used to
  ;; follow the IF, so the function signalled an error even after
  ;; building the dataframe and could never return it.
  (make-dataframe (listoflist->array lol)))
438 ;;;;;;;; from dataframe-xarray experiment
(defmethod xref ((obj dataframe-like) &rest subscripts)
  "For data-frame-like, dispatch on storage object: return the element
  of OBJ's underlying dataset addressed by SUBSCRIPTS."
  ;; SUBSCRIPTS is the &REST list.  It has to be spread with APPLY;
  ;; passing it directly would hand the storage's XREF one list-valued
  ;; subscript instead of one subscript per axis.
  (apply #'xref (dataset obj) subscripts))
(defmethod (setf xref) (value (obj dataframe-like) &rest subscripts)
  "Store VALUE into the element of OBJ's underlying dataset addressed
  by SUBSCRIPTS, by delegating to the storage object's (SETF XREF)."
  ;; Spread the &REST list with APPLY so the storage's writer receives
  ;; one subscript per axis rather than a single list.
  (apply #'(setf xref) value (dataset obj) subscripts))
;; Placeholder XREF method for MATRIX-LIKE objects: the body is empty,
;; so it returns NIL whatever INDICES are given.
;; NOTE(review): if another XREF method specialised on MATRIX-LIKE is
;; defined elsewhere, this definition would replace it -- confirm that
;; is intended.
(defmethod xref ((obj matrix-like) &rest indices))
(defmethod xtype ((obj dataframe-like))
  "Unlike the standard xtype, here we need to return the types of all
  variables rather than a single type.  Vectors can have single types,
  and arrays have a single type.  Dataframe-like have multiple types,
  variable-like a single type, case-like has multiple types, and
  matrix-like has a single type.  Returns the VAR-TYPES list of OBJ,
  one entry per variable (NIL when no types were recorded)."
  ;; With the string as its only body form the method used to return
  ;; its own documentation text.
  (var-types obj))
;; XDIMS for a DATAFRAME-LIKE: delegates to DATAFRAME-DIMENSIONS, which
;; is not defined in this part of the file.
;; NOTE(review): presumably that returns the dimensions as a sequence
;; in (rows columns) order -- confirm against its definition.
(defmethod xdims ((obj dataframe-like))
  (dataframe-dimensions obj))
459 ;; use default methods at this point, except for potentially weird DFs
;; Placeholder: a method with no parameters and no body (returns NIL).
;; NOTE(review): an empty lambda list is only accepted if the XDIMS*
;; generic function itself takes no required arguments -- confirm
;; against xarray before relying on this form loading.
(defmethod xdims* ())
(defmethod xdim ((obj dataframe-like) index)
  "Return the size of OBJ along axis INDEX (zero-based), taken from
  the dimensions reported by (XDIMS OBJ)."
  ;; The previous body called (dataframe-dimension index) without
  ;; passing OBJ, so the result could not depend on the dataframe
  ;; being queried.  Deriving it from XDIMS keeps the two methods in
  ;; agreement.
  ;; NOTE(review): assumes XDIMS returns a sequence of dimensions, one
  ;; per axis -- confirm against DATAFRAME-DIMENSIONS.
  (elt (xdims obj) index))
;; Placeholders for the remaining array-interface operations (XRANK,
;; SLICE, TAKE, CARRAY).  Each is a method with no parameters and no
;; body, i.e. it returns NIL.
;; NOTE(review): an empty lambda list is only accepted if the matching
;; generic function takes no required arguments -- confirm against
;; xarray; these look like stubs still to be specialised on
;; DATAFRAME-LIKE.
(defmethod xrank ())

(defmethod slice ())

(defmethod take ())

(defmethod carray ())