compiles again -- fixed function calls, but need to get the generic function signatur...
[CommonLispStat.git] / src / data / dataframe.lisp
blob98477fb98787b53da8e06fcb3c1b6d66129159d7
1 ;;; -*- mode: lisp -*-
3 ;;; Time-stamp: <2012-10-08 17:46:02 tony>
4 ;;; Creation: <2008-03-12 17:18:42 blindglobe@gmail.com>
5 ;;; File: dataframe.lisp
6 ;;; Author: AJ Rossini <blindglobe@gmail.com>
7 ;;; Copyright: (c)2008--2010, AJ Rossini. See LICENSE.mit in
8 ;;; toplevel directory for conditions.
10 ;;; Purpose: Data packaging and access for Common Lisp Statistics,
11 ;;; using a DATAFRAME-LIKE virtual structure.
12 ;;; This redoes dataframe structures in a CLOS based
13 ;;; framework. Currently contains the virtual class
14 ;;; DATAFRAME-LIKE as well as the actual classes
15 ;;; DATAFRAME-ARRAY and DATAFRAME-MATRIXLIKE.
17 ;;; What is this talk of 'release'? Klingons do not make software
18 ;;; 'releases'. Our software 'escapes', leaving a bloody trail of
19 ;;; designers and quality assurance people in its wake.
21 (in-package :cls-dataframe)
23 ;;; No real basis for work, there is a bit of new-ness and R-ness to
24 ;;; this work. In particular, the notion of relation is key and
25 ;;; integral to the analysis. Tables are related and matched vectors,
26 ;;; for example. "column" vectors are related observations (by
27 ;;; measure/recording) while "row" vectors are related readings (by
28 ;;; case, independence). This does mean that we are placing
29 ;;; statistical semantics into the computational data object -- and
30 ;;; that it is a violation of use to consider rows which are not at
31 ;;; the least conditionally independent (though the conditioning
32 ;;; should be outside the data set, not internally specified).
34 ;;; So we want a verb-driven API for data collection construction. We
35 ;;; should encode independence or lack of, as a computable status.
37 ;;; Need to figure out statistically-typed vectors. We then map a
38 ;;; series of typed vectors over to tables where columns are equal
39 ;;; typed. In a sense, this is a relation (1-1) of equal-typed
40 ;;; arrays. For the most part, this ends up making the R data.frame
41 ;;; into a relational building block (considering 1-1 mappings using
42 ;;; row ID as a relation). Is this a worthwhile generalization or
43 ;;; communicable analogy?
45 ;;; verbs vs semantics for DF construction -- consider the possibility
46 ;;; of how adverbs and verbs relate, where to put which semantically
47 ;;; to allow for a general approach.
49 ;;; Need to consider modification APIs
50 ;;; actions are:
51 ;;; - import
52 ;;; - get/set row names (case names)
53 ;;; - column names (variable names)
54 ;;; - dataset values
55 ;;; - annotation/metadata
56 ;;; - make sure that we do coherency checking in the exported
57 ;;;   functions.
58 ;;; - ...
59 ;;; - reshapeData/reformat/reshape: return a reformed version of the
60 ;;; dataset (no additional input).
61 ;;; - either overwriting or not, i.e. with or without copy.
62 ;;; - check consistency of resulting data with metadata and related
63 ;;; data information.
65 ;;; Is there any need for an N-way dataframe (N>2) ? Am currently
66 ;;; assuming not, that this is specializing only the
67 ;;; "independent cases"-by-variables format and that there would be
68 ;;; other tools for other structures.
70 ;;; Misc Functions (to move into a lisp data manipulation support package)
72 ;; the next two should be merged into a general replicator or iterator
73 ;; pattern.
(defun gen-seq (n &optional (start 1))
  "Return the list of numbers START, START+1, ..., N in increasing order.
With the default START of 1 this is the index sequence 1..N, i.e. a
list of length N.  Returns NIL when N is smaller than START.  Used
for indexing."
  ;; Built iteratively from N downwards: the previous recursive APPEND
  ;; formulation was quadratic in N and used stack depth proportional
  ;; to N.  Counting down and pushing yields the same elements in the
  ;; same order as before.
  (do ((i n (- i 1))
       (acc '() (cons i acc)))
      ((< i start) acc)))
(defun repeat-seq (n item)
  "Return a list containing ITEM repeated N times (NIL when N < 1).
The very same ITEM object is shared by every position; it is not copied.
  (repeat-seq 3 \"d\")        ; => (\"d\" \"d\" \"d\")
  (repeat-seq 3 'd)          ; => (d d d)
  (repeat-seq 3 (list 1 2))  ; => ((1 2) (1 2) (1 2))"
  ;; Iterative accumulation replaces the earlier recursive APPEND, which
  ;; was quadratic in N and consumed stack proportional to N.
  (do ((k n (1- k))
       (acc '() (cons item acc)))
      ((< k 1) acc)))
(defun strsym->indexnum (df strsym)
  "Returns a number indicating the DF column labelled by STRSYM, or NIL
if no variable label matches.  STRSYM may be a string or a symbol.
Probably should be generic/methods dispatching on DATAFRAME-LIKE type."
  ;; EQUAL is required here: the default EQL test never matches string
  ;; labels (such as those produced by MAKE-LABELS), since two distinct
  ;; string objects with the same characters are not EQL.
  (position strsym (varlabels df) :test #'equal))
(defun string->number (str)
  "Convert a string <str> representing a number to a number.  A second
value is returned indicating the success of the conversion.  When STR
cannot be read at all (empty string, unbalanced syntax, a #. form, ...)
the values NIL and NIL are returned instead of signalling an error.
When STR reads as something that is not a number, that object is
returned as the first value and NIL as the second.  Examples:
  (string->number \"123\")  ; => 123 t
  (string->number \"1.23\") ; => 1.23 t
  (string->number \"\")     ; => nil nil"
  ;; *READ-EVAL* is bound to NIL so that #. forms in STR are never evaluated.
  (let ((*read-eval* nil))
    ;; IGNORE-ERRORS turns reader failures into a NIL result so that the
    ;; failure is reported through the second return value, as documented.
    (let ((num (ignore-errors (read-from-string str))))
      (values num (numberp num)))))
;; Scratch forms illustrating POSITION lookups on symbols, strings and
;; numbers (kept for interactive evaluation).
(equal 'testme 'testme)                          ; => T
(defparameter *test-pos* 'testme)
(position *test-pos* (list 'a 'b 'testme 'c))    ; => 2
;; A predicate has to go through POSITION-IF: plain POSITION would search
;; for the function object itself and always return NIL.
(position-if #'(lambda (x) (equal x "testme")) (list "a" "b" "testme" "c")) ; => 2
(position-if #'(lambda (x) (equal x 1)) (list 2 1 3 4))                     ; => 1
110 ;;; abstract dataframe class
(defclass dataframe-like (matrix-like)
  (;; The slot is named STORE but its accessor is DATASET.  No initarg is
   ;; declared on this slot here.
   (store :initform nil
          :accessor dataset
          :documentation "not useful in the -like virtual class case,
    contains actual data")
   (store-class :initform nil
                :accessor store-class
                :documentation "Lisp class used for the dataframe storage.")
   (case-labels :initform nil
                :initarg :case-labels
                :type list
                :accessor case-labels
                :documentation "labels used for describing cases (doc
    metadata), possibly used for merging.")
   (var-labels :initform nil
               :initarg :var-labels
               :type list
               :accessor var-labels
               :documentation "Variable names.  List order matches
    order in STORE.")
   (var-types :initform nil
              :initarg :var-types
              :type list
              :accessor var-types
              :documentation "List of symbols representing classes
    which describe the range of contents for a particular
    variable.  Symbols must be valid types for check-type.
    List order matches order in STORE.")
   (doc-string :initform nil
               :initarg :doc
               :accessor doc-string
               :documentation "additional information, potentially
    uncomputable, possibly metadata, about dataframe-like
    instance."))
  (:documentation "Abstract class for standard statistical analysis
   dataset for (possibly conditionally, externally) independent
   data.  Rows are considered to be independent, matching
   observations.  Columns are considered to be type-consistent,
   matching a variable with a distribution.  Inherits from the
   lisp-matrix base MATRIX-LIKE class.  MATRIX-LIKE (from lisp-matrix)
   is basically a rectangular table without storage.  We emulate that,
   and add storage, row/column labels, and within-column-typing.

   DATAFRAME-LIKE is the basic cases by variables framework.  Need
   to embed this within other structures which allow for generalized
   relations.  Goal is to ensure that relations imply and drive the
   potential for statistical relativeness such as correlation,
   inference, and similar concepts.

   STORE is the storage component.  We ignore this in the
   DATAFRAME-LIKE class, as it is the primary differentiator,
   spec'ing the structure used for storing the actual data.  We
   create methods which depend on STORE for access.  The only
   critical component is that STORE be a class which is
   xarray-compliant.  Examples of such mixins are DATAFRAME-ARRAY
   and DATAFRAME-MATRIXLIKE.  The rest of this structure is
   metadata."))
173 ;;; Specializing on superclasses...
175 ;;; Access and Extraction: implementations needed for any storage
176 ;;; type. But here, just to point out that we've got a specializing
177 ;;; virtual subclass (DATAFRAME-LIKE specializing MATRIX-LIKE).
(defgeneric nvars (df)
  (:documentation "Number of variables (columns) represented in DF.
DF may be a DATAFRAME-LIKE instance or a raw storage object (array,
MATRIX-LIKE, or row-major list of lists).")
  (:method ((df dataframe-like))
    ;; The STORE slot is read through its accessor DATASET; there is no
    ;; function named STORE.
    (xdim (dataset df) 1))
  (:method ((df matrix-like))
    (ncols df))
  (:method ((df list))
    ;; Row-major list of lists: the column count is the length of the
    ;; first row.  No validation of the remaining rows is done here.
    (length (first df)))
  (:method ((df array))
    (xdim df 1)))
(defun nvars-store (store)
  "Return number of variables (columns) in dataframe storage.  STORE may
be an array, a row-major list of lists, or a MATRIX-LIKE.  Doesn't test
that a list is a valid listoflist dataframe structure; an empty list
has 0 variables.  Signals a TYPE-ERROR for any other kind of STORE."
  (etypecase store
    (array (array-dimension store 1))
    ;; FIRST rather than (ELT STORE 0): ELT signals on the empty list,
    ;; whereas an empty store simply has no columns.
    (list (length (first store)))
    (matrix-like (ncols store))))
(defgeneric ncases (df)
  (:documentation "Number of cases (independent, or independent within
context, observations) held by DF, i.e. its row count.")
  ;; Raw arrays: size of dimension 0.
  (:method ((df array))
    (xdim df 0))
  ;; Lists: size of dimension 0 as reported by XDIM.  A validity test of
  ;; the LISTOFLIST structure is deliberately skipped, as it would be
  ;; inefficient.
  (:method ((df list))
    (xdim df 0))
  ;; Anything MATRIX-LIKE: delegate to NROWS.
  (:method ((df matrix-like))
    (nrows df)))
(defun ncase-store (store)
  "Number of cases (rows) found in the dataframe storage STORE, which
must be an array, a MATRIX-LIKE, or a list.  A list is not checked
for being a well-formed listoflist structure.  Any other object
signals a TYPE-ERROR."
  (cond ((arrayp store)
         (array-dimension store 0))
        ((typep store 'matrix-like)
         (nrows store))
        ((listp store)
         (length store))
        (t
         (error 'type-error
                :datum store
                :expected-type '(or array matrix-like list)))))
217 ;; Testing consistency/coherency.
(defgeneric consistent-dataframe-p (df)
  (:documentation "methods to check for consistency.  Mostly of
  internal interest, since ideally we'd have to use standard
  constructs to ensure that we do not get the dataframe structure
  misaligned.  Returns true only when the label counts agree with the
  dimensions and, if variable types are recorded, every stored value
  is of its column's type.")
  (:method (object)
    "General objects are not consistent dataframes!"
    (declare (ignore object))
    nil)
  (:method ((df dataframe-like))
    "At minimum, must dispatch on virtual-class."
    (let ((types (var-types df)))
      (and
       ;; ensure dimensionality
       (= (length (var-labels df)) (ncols df))
       (= (length (case-labels df)) (nrows df))
       ;; When dims are sane, ensure variable-typing is consistent.
       ;; A dataframe without recorded variable types is treated as
       ;; untyped and passes this step.
       (or (null types)
           (and (= (length types) (ncols df))
                ;; The result of each TYPEP test now decides the outcome;
                ;; previously it was computed and thrown away, so this
                ;; check could never fail.  XREF and TYPEP may still
                ;; signal (e.g. on an invalid type specifier).
                (loop for i from 0 below (nrows df)
                      always (loop for j from 0 below (ncols df)
                                   always (typep (xref df i j)
                                                 (nth j types))))))))))
244 ;;; FUNCTIONS WHICH DISPATCH ON INTERNAL METHODS OR ARGS
246 ;;; Q: change the following to generic functions and dispatch on
247 ;;; array, matrix, and dataframe? Others?
(defun make-labels (initstr num)
  "Generate a list of NUM strings which can be used as labels: INITSTR
followed by the index 1, 2, ..., NUM.  For example
  (make-labels \"a\" 3) => (\"a1\" \"a2\" \"a3\").
Returns NIL when NUM is less than 1.  INITSTR must be a string."
  (check-type initstr string)
  ;; Single linear pass; no intermediate helper lists are built.
  (loop for i from 1 to num
        collect (concatenate 'string initstr (format nil "~A" i))))
(defun check-dataframe-params (data vartypes varlabels caselabels doc)
  "Validate the arguments used to construct a dataframe.
DATA must be a MATRIX-LIKE, array or list; VARTYPES, VARLABELS and
CASELABELS must be sequences (NIL means \"not supplied\"); DOC must be a
string or NIL.  When supplied, VARTYPES and VARLABELS must have one
entry per variable of DATA and CASELABELS one entry per case.
Signals an error (via CHECK-TYPE / ASSERT) on violation and returns NIL
otherwise.  (FIXME: Need to put together a CLS exception system, this
could be part of it)"
  ;; type checking
  (check-type data (or matrix-like array list))
  (check-type caselabels sequence)
  (check-type varlabels sequence)
  (check-type vartypes sequence)
  ;; DOC is an optional keyword in the callers, so NIL must be accepted.
  (check-type doc (or null string))
  ;; dimension checking
  (when vartypes
    (assert (= (nvars data) (length vartypes))))
  (when varlabels
    (assert (= (nvars data) (length varlabels))))
  ;; Case labels are compared with their own length, not with the
  ;; number of variable labels.
  (when caselabels
    (assert (= (ncases data) (length caselabels))))
  nil)
(defmacro build-dataframe (type)
  "Expand into the common dataframe construction code for class TYPE.
The expansion intentionally refers to the variables DATA, VARTYPES,
VARLABELS, CASELABELS and DOC of the surrounding scope: it validates
them with CHECK-DATAFRAME-PARAMS, fills in generated labels for
whatever was not supplied, and calls MAKE-INSTANCE on TYPE."
  `(progn
     (check-dataframe-params data vartypes varlabels caselabels doc)
     (let (;; fall back to generated labels \"C1\", \"C2\", ... / \"V1\", \"V2\", ...
           (newcaselabels (or caselabels
                              (make-labels "C" (ncases data))))
           (newvarlabels (or varlabels
                             (make-labels "V" (nvars data))))
           ;; Also should determine the most restrictive possible
           ;; (compsci and/or statistical) variable typing (integer,
           ;; double, string, symbol, *).  FIXME: until the mixed typing
           ;; system is in place, placeholder labels are generated.
           (newvartypes (or vartypes
                            (make-labels "*" (nvars data)))))
       (make-instance ,type
                      :storage data
                      :nrows (length newcaselabels)
                      :ncols (length newvarlabels)
                      :case-labels newcaselabels
                      :var-labels newvarlabels
                      :var-types newvartypes))))
292 ;; (macroexpand '(build-dataframe 'test)))
(defgeneric make-dataframe2 (data &key vartypes varlabels caselabels doc)
  (:documentation "trial generic dispatch.  Data should be in table
format desired for use.  VARTYPES, VARLABELS and CASELABELS are
optional sequences; missing labels are generated.  DOC defaults to
\"no docs\".  Each method validates its arguments through
BUILD-DATAFRAME (which calls CHECK-DATAFRAME-PARAMS) and returns a new
instance of the class it specializes on, with DATA as its storage.")
  ;; NOTE(review): the methods specialize on the dataframe classes
  ;; themselves rather than on raw storage types (array / matrix-like /
  ;; list) -- confirm that this is the intended dispatch.
  ;;
  ;; All three methods share BUILD-DATAFRAME, so validation happens
  ;; exactly once per call.  DOC gets a string default so that the DOC
  ;; type check does not reject calls that omit :DOC.
  (:method ((data dataframe-array)
            &key vartypes varlabels caselabels (doc "no docs"))
    (build-dataframe 'dataframe-array))
  (:method ((data dataframe-matrixlike)
            &key vartypes varlabels caselabels (doc "no docs"))
    (build-dataframe 'dataframe-matrixlike))
  (:method ((data dataframe-listoflist)
            &key vartypes varlabels caselabels (doc "no docs"))
    (build-dataframe 'dataframe-listoflist)))
(defun make-dataframe (newdata
                       &key (vartypes nil)
                         (caselabels nil) (varlabels nil)
                         (doc "no docs"))
  "Helper function to use instead of make-instance to assure
construction of a proper dataframe.

NEWDATA is the storage: a list (of lists), an array, or a MATRIX-LIKE;
the result is a DATAFRAME-LISTOFLIST, DATAFRAME-ARRAY or
DATAFRAME-MATRIXLIKE respectively.  CASELABELS, VARLABELS and VARTYPES
are optional sequences which, when given, must match the number of
cases / variables of NEWDATA; missing labels are generated as
\"C1\", \"C2\", ... and \"V1\", \"V2\", ....  DOC is a documentation string
stored with the dataframe.  Signals an error (CHECK-TYPE / ASSERT) on
invalid arguments."
  (check-type newdata (or matrix-like array list))
  (check-type caselabels sequence)
  (check-type varlabels sequence)
  (check-type vartypes sequence)
  (check-type doc string)
  ;; The -STORE helpers handle all three storage kinds; the NVARS generic
  ;; function has no method for lists or plain MATRIX-LIKE objects.
  (let ((ncases (ncase-store newdata))
        (nvars (nvars-store newdata)))
    (when caselabels (assert (= ncases (length caselabels))))
    (when varlabels (assert (= nvars (length varlabels))))
    (when vartypes (assert (= nvars (length vartypes))))
    (let ((newcaselabels (or caselabels (make-labels "C" ncases)))
          (newvarlabels (or varlabels (make-labels "V" nvars))))
      ;; Only the class depends on the storage type; the initargs are
      ;; identical for all three.
      (make-instance (etypecase newdata
                       (list 'dataframe-listoflist)
                       (array 'dataframe-array)
                       (matrix-like 'dataframe-matrixlike))
                     :storage newdata
                     :nrows (length newcaselabels)
                     :ncols (length newvarlabels)
                     :case-labels newcaselabels
                     :var-labels newvarlabels
                     :var-types vartypes
                     ;; DOC was validated above but previously dropped.
                     :doc doc))))
#|
;; Usage examples for MAKE-DATAFRAME.  Kept inside a block comment so
;; that loading the file neither builds throw-away objects nor stops at
;; the calls that are expected to signal an error; evaluate them
;; interactively instead.
(make-dataframe #2A((1.2d0 1.3d0) (2.0d0 4.0d0)))
(make-dataframe #2A(('a 1) ('b 2)))
(xref (make-dataframe #2A(('a 1) ('b 2))) 0 1)
(xref (make-dataframe #2A(('a 1) ('b 2))) 1 0)
(make-dataframe 4) ; ERROR, should we allow?
(make-dataframe #2A((4)))
(make-dataframe (rand 10 5)) ;; ERROR, but should work!
|#
(defun row-order-as-list (ary)
  "Pull out the data of the two-dimensional array ARY in row order
into a fresh flat list: all elements of row 0, then row 1, and so on."
  (let ((nrows (array-dimension ary 0))
        (ncols (array-dimension ary 1)))
    ;; Rows form the outer loop and index the first array dimension.
    ;; The elements are actually collected; the previous version
    ;; discarded the result of APPEND and therefore always returned NIL.
    (loop for i from 0 below nrows
          nconc (loop for j from 0 below ncols
                      collect (aref ary i j)))))
(defun col-order-as-list (ary)
  "Pull out the data of the two-dimensional array ARY in column order
into a fresh flat list: all elements of column 0, then column 1, and
so on."
  (let ((nrows (array-dimension ary 0))
        (ncols (array-dimension ary 1)))
    ;; Columns form the outer loop so that each column is emitted as a
    ;; contiguous run.  The elements are actually collected; the previous
    ;; version discarded the result of APPEND and always returned NIL.
    (loop for j from 0 below ncols
          nconc (loop for i from 0 below nrows
                      collect (aref ary i j)))))
(defun transpose-array (ary)
  "Map the NxM two-dimensional array ARY to a fresh MxN array whose
element (j, i) is element (i, j) of ARY.  The element type of ARY is
preserved."
  (let* ((nrows (array-dimension ary 0))
         (ncols (array-dimension ary 1))
         (result (make-array (list ncols nrows)
                             :element-type (array-element-type ary))))
    ;; Filled element by element: a flat list is not valid
    ;; :INITIAL-CONTENTS for a two-dimensional array.
    (dotimes (i nrows result)
      (dotimes (j ncols)
        (setf (aref result j i) (aref ary i j))))))
411 ;;; THE FOLLOWING 2 dual-sets done to provide error checking
412 ;;; possibilities on top of the generic function structure. Not
413 ;;; intended as make-work!
(defun varlabels (df)
  "Return the variable (column) labels of the DATAFRAME-LIKE DF, as
stored in its VAR-LABELS slot.  No error checking yet."
  (var-labels df))

(defun set-varlabels (df vl)
  "Replace the variable labels of the DATAFRAME-LIKE DF by VL and return
VL.  VL must have as many elements as the labels currently stored;
otherwise an error is signalled and DF is left untouched."
  (unless (= (length vl)
             (length (var-labels df)))
    (error "wrong size."))
  (setf (var-labels df) vl))

;; Makes (setf (varlabels df) new-labels) go through SET-VARLABELS.
(defsetf varlabels set-varlabels)
428 ;;; Case-name handling for Tables. Needs error checking.
(defun caselabels (df)
  "Return the case (row) labels of the DATAFRAME-LIKE DF, as stored in
its CASE-LABELS slot.  No error checking yet."
  (case-labels df))

(defun set-caselabels (df cl)
  "Replace the case labels of the DATAFRAME-LIKE DF by CL and return CL.
CL must have as many elements as the labels currently stored;
otherwise an error is signalled and DF is left untouched."
  (unless (= (length cl)
             (length (case-labels df)))
    (error "wrong size."))
  (setf (case-labels df) cl))

;; Makes (setf (caselabels df) new-labels) go through SET-CASELABELS.
(defsetf caselabels set-caselabels)
442 ;;;;;;;;;;;; IMPLEMENTATIONS, with appropriate methods.
443 ;; See also:
444 ;; (documentation 'dataframe-like 'type)
449 ;;; Do we establish methods for dataframe-like, which specialize to
450 ;;; particular instances of storage?
(defmethod print-object ((object dataframe-like) stream)
  "Print OBJECT unreadably: its dimensions, then a header line with the
variable labels, then one line per case consisting of the case label
followed by the tab-separated values formatted with ~7,3E."
  (print-unreadable-object (object stream :type t)
    (let ((row-count (nrows object))
          (col-count (ncols object)))
      (format stream " ~d x ~d" row-count col-count)
      (terpri stream)
      ;; header: every variable label is preceded by two tabs
      (loop for col from 0 below col-count
            do (write-string (make-string 2 :initial-element #\tab) stream)
               (format stream "~T~A~T" (nth col (var-labels object))))
      ;; body: one line per observation
      (loop for row from 0 below row-count
            do (terpri stream)
               (format stream "~A:~T" (nth row (case-labels object)))
               (loop for col from 0 below col-count
                     do (write-char #\tab stream)
                        ;; if this works, a general output mechanism
                        ;; control needs to be included
                        (format stream "~7,3E" (xref object row col)))))))
(defun print-structure-relational (ds)
  "Example of what we want the methods to look like.  Should be sort of
like a graph of spreadsheets if the storage is a relational structure.
For every relation of DS, prints the variable labels as a row and then
one row per case: the case label followed by that case's observation."
  (dolist (relation-key (relations ds))
    (let ((relation-set (getRelation ds relation-key)))
      (print-as-row (var-labels relation-set))
      (loop for case-label in (case-labels relation-set)
            for case-index from 0
            do (print-as-row
                (cons case-label
                      (xref-obsn (dataset relation-set) case-index)))))))
(defun testecase (s)
  "Small ECASE experiment: maps the symbol SCALAR to 1 and the symbols
ASD and ASDF to 2.  Any other argument signals the ECASE type error."
  (ecase s
    (scalar 1)
    ((asd asdf) 2)))
#|
;; Example calls for TESTECASE.  Kept inside a block comment because the
;; last one signals an error, which would abort loading the file.
(testecase 'scalar) ; => 1
(testecase 'asd)    ; => 2
(testecase 'asdf)   ; => 2
(testecase 'as)     ; signals an error: no ECASE clause matches
|#
497 ;;; Vector-like generalizations: we consider observation-like and
498 ;;; variable-like to be abstract classes which provide row and column
499 ;;; access to dataframe structures. These will be specialized, in
500 ;;; that rows correspond to an observation (or case?) which are
501 ;;; multitype, while columns correspond to a variable, which must be
502 ;;; singularly typed.
;; No slots of its own: the empty slot list is required by DEFCLASS
;; before the class options.
(defclass observation-like (dataframe-like)
  ()
  (:documentation "dataframe-like with only 1 row, is an observation-like."))
;; No slots of its own: the empty slot list is required by DEFCLASS
;; before the class options.
(defclass variable-like (dataframe-like)
  ()
  (:documentation "dataframe-like with only 1 column is a variable-like."))
512 ;;; Need to implement views, i.e. dataframe-view-like,
513 ;;; observation-view-like, variable-view-like.
517 ;;; Need to consider read-only variants, leveraging the xref
518 ;;; strategy.
522 ;;; Dataframe <-> Listoflist support
523 ;; the following will be handy to help out folks adjust. It should
524 ;; provide a means to write code faster and better.
526 ;; leverages listoflist support in our version of xarray
(defun listoflist->dataframe (lol) ; &key (type :row-major))
  "Create a cases-by-variables data frame consisting of numeric data,
from a ROW-MAJOR list-of-lists representation.  A COLUMN-MAJOR
representation should be handled using the transpose-listoflists
function.  Returns the new dataframe; signals an error when the
sublists of LOL do not all have the same length."
  (check-type lol list) ; imperfect -- must verify all list elements are also lists.
  (unless (listoflist:sublists-of-same-size-p lol)
    (error "listoflist->dataframe: no combining different length lists"))
  ;; The dataframe is the return value.  A stray unconditional ERROR
  ;; used to follow here, which made every call fail even on valid input.
  (make-dataframe (listoflist:listoflist->array lol)))
539 ;;;;;;;; from dataframe-xarray experiment
(defmethod xref ((obj dataframe-like) &rest subscripts)
  "For data-frame-like, dispatch on storage object: return the element
of OBJ's storage addressed by SUBSCRIPTS."
  ;; APPLY spreads the collected &REST list back into individual
  ;; subscripts; passing the list itself would hand the storage a single
  ;; (list-valued) subscript.
  (apply #'xref (dataset obj) subscripts))
(defmethod (setf xref) (value (obj dataframe-like) &rest subscripts)
  "Store VALUE into the element of OBJ's storage addressed by SUBSCRIPTS
by delegating to the (SETF XREF) method of the storage object."
  ;; The setf function is applied directly so that the collected &REST
  ;; list is spread into individual subscripts.
  (apply #'(setf xref) value (dataset obj) subscripts))
(defmethod xref ((obj matrix-like) &rest indices)
  "Placeholder XREF method for MATRIX-LIKE objects: performs no lookup
and always returns NIL."
  (declare (ignore indices))
  nil)
(defmethod xtype ((obj dataframe-like))
  "Unlike the standard xtype, here we need to return a vector of the
types.  Vectors can have single types, but arrays have single type.
Dataframe-like have multiple types, variable-like single type,
case-like has multiple types, and matrix-like has single type."
  ;; The per-variable types recorded in the VAR-TYPES slot, as a vector.
  ;; Without a body form the documentation string above would itself be
  ;; the method's return value.
  (coerce (var-types obj) 'vector))
(defmethod xdims ((obj dataframe-like))
  "Dimensions of the dataframe OBJ, as computed by DATAFRAME-DIMENSIONS."
  ;; NOTE(review): DATAFRAME-DIMENSIONS is not defined in this part of
  ;; the file -- confirm that it exists elsewhere.
  (dataframe-dimensions obj))
560 ;; use default methods at this point, except for potentially weird DFs
#|
;; Placeholder only: the default XDIMS* method is used for now.  A
;; method with an empty lambda list cannot be attached to a generic
;; function that takes an object argument, so the stub stays disabled.
(defmethod xdims* ())
|#
(defmethod xdim ((obj dataframe-like) index)
  "Size of dimension INDEX of the dataframe OBJ: index 0 is the number of
rows (cases), index 1 the number of columns (variables).  Any other
INDEX signals an error."
  ;; OBJ has to take part in the answer; the previous body passed only
  ;; INDEX on to a helper and never looked at the dataframe.
  (ecase index
    (0 (nrows obj))
    (1 (ncols obj))))
#|
;; Placeholders only: the default XRANK / SLICE / TAKE / CARRAY methods
;; are used for now.  Methods with empty lambda lists cannot be attached
;; to generic functions that take an object argument, so these stubs
;; stay disabled until real specializations are written.
(defmethod xrank ())

(defmethod slice ())

(defmethod take ())

(defmethod carray ())
|#
(defmacro with-dataframe (env &rest progn)
  "Placeholder for computing with variable names, in the manner of
with.data.frame.  Not implemented yet: ENV and the body forms are
ignored and the expansion is just a descriptive string."
  (declare (ignore env progn))
  "Compute using variable names with with.data.frame type semantics.")
(defmacro with-data (body)
  "Placeholder for stream handling that maintains I/O through object
typing.  Not implemented yet: BODY is ignored and the expansion is just
a descriptive string."
  (declare (ignore body))
  "Stream-handling, maintaining I/O through object typing.")