reset errors
[CommonLispStat.git] / src / data / data-clos.lisp
blobe7dd03193fcff9864a41e394ad871bf90ddb730b
1 ;;; -*- mode: lisp -*-
3 ;;; Time-stamp: <2009-03-27 17:03:01 tony>
4 ;;; Creation: <2008-03-12 17:18:42 blindglobe@gmail.com>
5 ;;; File: data-clos.lisp
6 ;;; Author: AJ Rossini <blindglobe@gmail.com>
7 ;;; Copyright: (c)2008, AJ Rossini. BSD, LLGPL, or GPLv2, depending
8 ;;; on how it arrives.
10 ;;; Purpose: Data packaging and access for Common Lisp Statistics.
11 ;;; This redoes data storage structures in a CLOS based
12 ;;; framework.
13 ;;;
15 ;;; What is this talk of 'release'? Klingons do not make software
16 ;;; 'releases'. Our software 'escapes', leaving a bloody trail of
17 ;;; designers and quality assurance people in its wake.
19 (in-package :lisp-stat-data-clos)
21 ;;; No real basis for work, there is a bit of new-ness and R-ness to
22 ;;; this work. In particular, the notion of relation is key and
23 ;;; integral to the analysis. Tables are related and matched vectors,
24 ;;; for example. "column" vectors are related observations (by
25 ;;; measure/recording) while "row" vectors are related readings (by
26 ;;; case)
28 ;;; Relational structure -- can we capture a completely unnormalized
29 ;;; data strucutre to propose possible modeling approaches, and
30 ;;; propose appropriate models and inferential strategies?
32 ;;; So we want a verb-driven API for data collection construction. We
33 ;;; should encode independence or lack of, as possible.
35 ;;; Need to figure out typed vectors. We then map a series of typed
36 ;;; vectors over to tables where columns are equal typed. In a sense,
37 ;;; this is a relation (1-1) of equal-typed arrays. For the most
38 ;;; part, this ends up making the R data.frame into a relational
39 ;;; building block (considering 1-1 mappings using row ID as a
40 ;;; relation). Is this a worthwhile generalization?
42 ;;; verbs vs semantics for DS conversion -- consider the possibily of
43 ;;; how adverbs and verbs relate, where to put which semantically to
44 ;;; allow for general approach.
46 ;;; eg. Kasper's talk on the FUSION collection of parsers.
48 ;;;
49 ;;; Need to consider modification APIs
50 ;;; actions are:
51 ;;; - import
52 ;;; - get/set row names (case names)
53 ;;; - column names (variable names)
54 ;;; - dataset values
55 ;;; - annotation/metadata
56 ;;; - make sure that we do coherency checking in the exported
57 ;;; - functions.
58 ;;; - ...
59 ;;; - reshapeData/reformat/reshapr a reformed version of the dataset (no
60 ;;; additional input).
61 ;;; - either overwriting or not, i.e. with or without copy.
62 ;;; - check consistency of resulting data with metadata and related
63 ;;; data information.
64 ;;; -
67 ;;; Misc Functions (to move into a lisp data manipulation support package)
69 (defun gen-seq (n &optional (start 1))
70 "There has to be a better way -- I'm sure of it! default count from 1.
71 (gen-seq 4) ; => (1 2 3 4)
72 (gen-seq 0) ; => nil
73 (gen-seq 5 3) ; => 3 4 5
75 (if (>= n start)
76 (append (gen-seq (- n 1) start) (list n))))
78 (defun repeat-seq (n item)
79 "FIXME: There has to be a better way -- I'm sure of it!
80 (repeat-seq 3 \"d\") ; => (\"d\" \"d\" \"d\")
81 (repeat-seq 3 'd) ; => ('d 'd 'd)
83 (if (>= n 1)
84 (append (repeat-seq (1- n) item) (list item))))
88 (defun strsym->indexnum (df strsym)
89 "Returns a number indicating which column in DF has STRSYM labeling
90 it. Probably should be a method dispatching on the type of
91 DATAFRAME-LIKE."
92 (position strsym (varlabels df)))
94 (defun string->number (str)
95 "Convert a string <str> representing a number to a number. A second value is
96 returned indicating the success of the conversion.
97 Example: (rsm.string:string->number \"123\")
98 123
100 (let ((*read-eval* nil))
101 (let ((num (read-from-string str)))
102 (values num (numberp num)))))
104 (string->number "1.22")
109 (equal 'testme 'testme)
110 (defparameter *test-pos* 'testme)
111 (position *test-pos* (list 'a 'b 'testme 'c))
112 (position #'(lambda (x) (equal x "testme")) (list "a" "b" "testme" "c"))
113 (position #'(lambda (x) (equal x 1)) (list 2 1 3 4))
117 ;;; abstract dataframe class
119 (defclass dataframe-like (matrix-like)
121 ;; Matrix-like (from lisp-matrix) is basically a rectangular table
122 ;; without storage. We emulate that, and add storage, row/column
123 ;; labels, and within-column-typing.
125 ;; STORE is the storage component. We ignore this in the DATAFRAME-LIKE
126 ;; class, as it is the primary differentiator, driving how access
127 ;; (getting/setting) is done. We create methods depending on the
128 ;; storage component, which access data as appropriate. See
129 ;; DATAFRAME-ARRAY for an example implementation.
130 ;; the rest of this is metadata. In particular, we should find a
131 ;; more flexible, compact way to store this.
132 (case-labels :initform nil
133 :initarg :case-labels
134 :type list
135 :accessor case-labels
136 :documentation "labels used for describing cases (doc
137 metadata), possibly used for merging.")
138 (var-labels :initform nil
139 :initarg :var-labels
140 :type list
141 :accessor var-labels
142 :documentation "Variable names.")
143 (var-types :initform nil
144 :initarg :var-types
145 :type list
146 :accessor var-types
147 :documentation "variable types to ensure fit")
148 (documentation-string :initform nil
149 :initarg :doc
150 :accessor doc-string
151 :documentation "additional information,
152 potentially uncomputable, possibly metadata, about dataframe-like
153 instance."))
154 (:documentation "Abstract class for standard statistical analysis
155 dataset for independent data. Rows are considered
156 to be independent, matching observations. Columns
157 are considered to be type-consistent, match a
158 variable with distribution. inherits from
159 lisp-matrix base MATRIX-LIKE class.
161 DATAFRAME-LIKE is the basic cases by variables
162 framework. Need to embed this within other
163 structures which allow for generalized relations.
164 Goal is to ensure that relations imply and drive
165 the potential for statistical relativeness such as
166 correlation, interference, and similar concepts."))
169 ;;; Generics specialized above matrix-like, particularly for
170 ;;; dataframe-like objects. Need methods for any storage
171 ;;; implementation.
173 (defgeneric dataframe-dimensions (df)
174 (:documentation "")
175 (:method ((df dataframe-like))
176 (error "dispatch on virtual class.")))
178 (defgeneric dataframe-dimension (df index)
179 (:documentation "")
180 (:method ((df dataframe-like) index)
181 (elt (dataframe-dimensions df) index)))
183 (defgeneric dfref (df index1 index2) ; &key return-type
184 (:documentation "scalar access with selection of possible return
185 object types.")
186 (:method ((df dataframe-like) index1 index2) ; &key return-type
187 (error "need a real class with real storage to reference elements.")))
189 ;;; Specializing on superclasses...
190 ;;; Access and Extraction: implementations needed for any storage
191 ;;; type. But here, just to point out that we've got a specializing
192 ;;; virtual subclass (DATAFRAME-LIKE specializing MATRIX-LIKE).
194 (defmethod nrows ((df dataframe-like))
195 "specializes on inheritance from matrix-like in lisp-matrix."
196 (error "Need implementation; can't dispatch on virtual class DATAFRAME-LIKE."))
198 (defmethod ncols ((df dataframe-like))
199 "specializes on inheritance from matrix-like in lisp-matrix."
200 (error "Need implementation; can't dispatch on virtual class DATAFRAME-LIKE."))
202 ;; Testing consistency/coherency.
204 (defgeneric consistent-dataframe-p (df)
205 (:documentation "methods to check for consistency.")
206 (:method ((df dataframe-like))
207 (error "Need implementation; can't dispatch on virtual class DATAFRAME-LIKE.")))
211 (defun ensure-consistent-datatable-type (dt lot)
212 "given a datatable and a listoftypes, ensure that the datatble
213 variables are consistent."
214 (destructuring-bind (n p) ;; why use let when we can be cool? Sigh.
215 (array-dimensions dt)
216 (dotimes (i n)
217 (dotimes (j p)
218 (check-type (aref dt i j) (elt lot j))))))
222 ;;; GENERAL FUNCTIONS WHICH DISPATCH ON INTERNAL METHODS OR ARGS
224 ;;; Q: change the following to generic functions and dispatch on
225 ;;; array, matrix, and dataframe? Others?
226 (defun make-labels (initstr num)
227 "generate a list of strings which can be used as labels, i.e. something like
228 '(\"a1\" \"a2\" \"a3\")."
229 (check-type initstr string)
230 (mapcar #'(lambda (x y) (concatenate 'string x y))
231 (repeat-seq num initstr)
232 (mapcar #'(lambda (x) (format nil "~A" x)) (gen-seq num))))
235 (make-labels 'c 2)
236 (make-labels "c" 4)
239 (defun ncase-store (store)
240 (etypecase store
241 (array (array-dimension store 0))
242 (matrix-like (nrows store))))
244 (defun nvars-store (store)
245 (etypecase store
246 (array (array-dimension store 1))
247 (matrix-like (ncols store))))
250 (defun make-dataframe (newdata
251 &key (vartypes nil)
252 (caselabels nil) (varlabels nil)
253 (doc "no docs"))
254 "Helper function to use instead of make-instance to assure
255 construction of proper DF-array."
256 (check-type newdata (or matrix-like array))
257 (check-type caselabels sequence)
258 (check-type varlabels sequence)
259 (check-type doc string)
260 (let ((ncases (ncase-store newdata))
261 (nvars (nvars-store newdata)))
262 (if caselabels (assert (= ncases (length caselabels))))
263 (if varlabels (assert (= nvars (length varlabels))))
264 (let ((newcaselabels (if caselabels
265 caselabels
266 (make-labels "C" ncases)))
267 (newvarlabels (if varlabels
268 varlabels
269 (make-labels "V" nvars))))
270 (etypecase newdata
271 (array
272 (make-instance 'dataframe-array
273 :storage newdata
274 :nrows (length newcaselabels)
275 :ncols (length newvarlabels)
276 :case-labels newcaselabels
277 :var-labels newvarlabels
278 :var-types vartypes))
279 (matrix-like
280 (make-instance 'dataframe-matrixlike
281 :storage newdata
282 :nrows (length newcaselabels)
283 :ncols (length newvarlabels)
284 :case-labels newcaselabels
285 :var-labels newvarlabels
286 :var-types vartypes))))))
289 (make-dataframe #2A((1.2d0 1.3d0) (2.0d0 4.0d0)))
290 (make-dataframe #2A(('a 1) ('b 2)))
291 (dfref (make-dataframe #2A(('a 1) ('b 2))) 0 1)
292 (dfref (make-dataframe #2A(('a 1) ('b 2))) 1 0)
293 (make-dataframe 4) ; ERROR, should we allow?
294 (make-dataframe #2A((4)))
295 (make-dataframe (rand 10 5)) ;; ERROR, but should work!
299 (defun row-order-as-list (ary)
300 "Pull out data in row order into a list."
301 (let ((result (list))
302 (nrows (nth 0 (array-dimensions ary)))
303 (ncols (nth 1 (array-dimensions ary))))
304 (dotimes (i ncols)
305 (dotimes (j nrows)
306 (append result (aref ary i j))))))
308 (defun col-order-as-list (ary)
309 "Pull out data in row order into a list."
310 (let ((result (list))
311 (nrows (nth 0 (array-dimensions ary)))
312 (ncols (nth 1 (array-dimensions ary))))
313 (dotimes (i nrows)
314 (dotimes (j ncols)
315 (append result (aref ary i j))))))
317 (defun transpose-array (ary)
318 "map NxM to MxN."
319 (make-array (reverse (array-dimensions ary))
320 :initial-contents (col-order-as-list ary)))
322 ;;; THE FOLLOWING 2 dual-sets done to provide error checking
323 ;;; possibilities on top of the generic function structure. Not
324 ;;; intended as make-work!
326 (defun varlabels (df)
327 "Variable-name handling for DATAFRAME-LIKE. Needs error checking."
328 (var-labels df))
330 (defun set-varlabels (df vl)
331 "Variable-name handling for DATAFRAME-LIKE. Needs error checking."
332 (if (= (length (var-labels df))
333 (length vl))
334 (setf (var-labels df) vl)
335 (error "wrong size.")))
337 (defsetf varlabels set-varlabels)
339 ;;; Case-name handling for Tables. Needs error checking.
340 (defun caselabels (df)
341 "Case-name handling for DATAFRAME-LIKE. Needs error checking."
342 (case-labels df))
344 (defun set-caselabels (df cl)
345 "Case-name handling for DATAFRAME-LIKE. Needs error checking."
346 (if (= (length (case-labels df))
347 (length cl))
348 (setf (case-labels df) cl)
349 (error "wrong size.")))
351 (defsetf caselabels set-caselabels)
353 ;;;;;;;;;;;; IMPLEMENTATIONS, with appropriate methods.
354 ;; See also:
355 ;; (documentation 'dataframe-like 'type)
357 ;;;;; DATAFRAME-ARRAY
359 (defclass dataframe-array (dataframe-like)
360 ((store :initform nil
361 :initarg :storage
362 :type (array * *)
363 :accessor dataset
364 :documentation "Data storage: typed as array."))
365 (:documentation "example implementation of dataframe-like using storage
366 based on lisp arrays. An obvious alternative could be a
367 dataframe-matrix-like which uses the lisp-matrix classes."))
369 (defmethod nrows ((df dataframe-array))
370 "specializes on inheritance from matrix-like in lisp-matrix."
371 (array-dimension (dataset df) 0))
373 (defmethod ncols ((df dataframe-array))
374 "specializes on inheritance from matrix-like in lisp-matrix."
375 (array-dimension (dataset df) 1))
377 (defmethod dfref ((df dataframe-array)
378 (index1 number) (index2 number))
379 "Returns a scalar in array, in the same vein as aref, mref, vref, etc.
380 idx1/2 is row/col or case/var."
381 (aref (dataset df) index1 index2))
384 ;;;;; DATAFRAME-MATRIXLIKE
386 (defclass dataframe-matrixlike (dataframe-like)
387 ((store :initform nil
388 :initarg :storage
389 :type matrix-like
390 :accessor dataset
391 :documentation "Data storage: typed as matrix-like
392 (numerical only)."))
393 (:documentation "example implementation of dataframe-like using storage
394 based on lisp-matrix structures."))
396 (defmethod nrows ((df dataframe-matrixlike))
397 "specializes on inheritance from matrix-like in lisp-matrix."
398 (matrix-dimension (dataset df) 0))
400 (defmethod ncols ((df dataframe-matrixlike))
401 "specializes on inheritance from matrix-like in lisp-matrix."
402 (matrix-dimension (dataset df) 1))
404 (defmethod dfref ((df dataframe-matrixlike)
405 (index1 number) (index2 number))
406 "Returns a scalar in array, in the same vein as aref, mref, vref, etc.
407 idx1/2 is row/col or case/var."
408 (mref (dataset df) index1 index2))
412 ;;;;;; IMPLEMENTATION INDEPENDENT FUNCTIONS AND METHODS
413 ;;;;;; (use only dfref, nrows, ncols and similar dataframe-like
414 ;;;;;; components as core).
416 (defmethod consistent-dataframe-p ((ds dataframe-like))
417 "Test that dataframe-like is internally consistent with metadata.
418 Ensure that dims of stored data are same as case and var labels.
420 Currently checks length of things, but needs to check type of things
421 as well."
422 (and
423 ;; ensure dimensionality
424 (equal (list (ncols ds) (nrows ds)) ; array-dimensions (dataset ds))
425 (list (length (var-labels ds))
426 (length (case-labels ds))))
427 ;; when dims sane, check-type for each variable
428 (progn
429 (dolist (i (ncols ds))
430 (dotimes (j (nrows ds))
431 (typep (dfref ds i j) (nth i (var-types ds)))))
432 t)))
434 (defun dfref-var (df index return-type)
435 "Returns the data in a single variable as type.
436 type = sequence, vector, vector-like (if valid numeric type) or dataframe."
437 (ecase return-type
438 (('list)
439 (map 'list
440 #'(lambda (x) (dfref df index x))
441 (gen-seq (nth 2 (array-dimensions (dataset df))))))
442 (('vector) t)
443 (:vector-like t)
444 (:matrix-like t)
445 (:dataframe t)))
447 (defun dfref-case (df index return-type)
448 "Returns row as sequence."
449 (ecase return-type
450 (:list
451 (map 'list
452 #'(lambda (x) (dfref df x index))
453 (gen-seq (nth 1 (array-dimensions (dataset df))))))
454 (:vector t)
455 (:vector-like t)
456 (:matrix-like t)
457 (:dataframe t)))
459 ;; FIXME
460 (defun dfref-2indexlist (df indexlist1 indexlist2 &key (return-type :array))
461 "return an array, row X col dims. FIXME TESTME"
462 (case return-type
463 (:array
464 (let ((my-pre-array (list)))
465 (dolist (x indexlist1)
466 (dolist (y indexlist2)
467 (append my-pre-array (dfref df x y))))
468 (make-array (list (length indexlist1)
469 (length indexlist2))
470 :initial-contents my-pre-array)))
471 (:dataframe
472 (make-instance 'dataframe-array
473 :storage (make-array
474 (list (length indexlist1)
475 (length indexlist2))
476 :initial-contents (dataset df))
477 ;; ensure copy for this and following
478 :doc (doc-string df)
479 ;; the following 2 need to be subseted based on
480 ;; the values of indexlist1 and indexlist2
481 :case-labels (case-labels df)
482 :var-labels (var-labels df)))))
484 ;;; Do we establish methods for dataframe-like, which specialize to
485 ;;; particular instances of storage?
487 (defmethod print-object ((object dataframe-like) stream)
488 (print-unreadable-object (object stream :type t)
489 (format stream " ~d x ~d" (nrows object) (ncols object))
490 (terpri stream)
491 ;; (format stream "~T ~{~S ~T~}" (var-labels object))
492 (dotimes (j (ncols object))
493 (write-char #\tab stream)
494 (format stream "~A~T" (nth j (var-labels object))))
495 (dotimes (i (nrows object))
496 (terpri stream)
497 (format stream "~A:~T" (nth i (case-labels object)))
498 (dotimes (j (ncols object))
499 ;; (write-char #\space stream)
500 (write-char #\tab stream)
501 (write (dfref object i j) :stream stream)))))
504 (defun print-structure-relational (ds)
505 "example of what we want the methods to look like. Should be sort
506 of like a graph of spreadsheets if the storage is a relational
507 structure."
508 (dolist (k (relations ds))
509 (let ((currentRelationSet (getRelation ds k)))
510 (print-as-row (var-labels currentRelationSet))
511 (let ((j -1))
512 (dolist (i (case-labels currentRelationSet))
513 (print-as-row
514 (append (list i)
515 (dfref-obsn (dataset currentRelationSet)
516 (incf j)))))))))
518 (defun testecase (s)
519 (ecase s
520 ((scalar) 1)
521 ((asd asdf) 2)))
523 (testecase 'scalar)
524 (testecase 'asd)
525 (testecase 'asdf)
526 (testecase 'as)