dataframe extended, more work to handle extraction
[CommonLispStat.git] / src / data / data-clos.lisp
blob88c6f9c9bc0fbbb4ea85066e515ced09322ca80d
1 ;;; -*- mode: lisp -*-
3 ;;; Time-stamp: <2009-03-24 18:22:33 tony>
4 ;;; Creation: <2008-03-12 17:18:42 blindglobe@gmail.com>
5 ;;; File: data-clos.lisp
6 ;;; Author: AJ Rossini <blindglobe@gmail.com>
7 ;;; Copyright: (c)2008, AJ Rossini. BSD, LLGPL, or GPLv2, depending
8 ;;; on how it arrives.
10 ;;; Purpose: Data packaging and access for Common Lisp Statistics.
11 ;;; This redoes data storage structures in a CLOS based
12 ;;; framework.
13 ;;;
15 ;;; What is this talk of 'release'? Klingons do not make software
16 ;;; 'releases'. Our software 'escapes', leaving a bloody trail of
17 ;;; designers and quality assurance people in its wake.
19 (in-package :lisp-stat-data-clos)
21 ;;; No real basis for work, there is a bit of new-ness and R-ness to
22 ;;; this work. In particular, the notion of relation is key and
23 ;;; integral to the analysis. Tables are related and matched vectors,
24 ;;; for example. "column" vectors are related observations (by
25 ;;; measure/recording) while "row" vectors are related readings (by
26 ;;; case)
28 ;;; Relational structure -- can we capture a completely unnormalized
29 ;;; data structure to propose possible modeling approaches, and
30 ;;; propose appropriate models and inferential strategies?
32 ;;; So we want a verb-driven API for data collection construction. We
33 ;;; should encode independence or lack of, as possible.
35 ;;; Need to figure out typed vectors. We then map a series of typed
36 ;;; vectors over to tables where columns are equal typed. In a sense,
37 ;;; this is a relation (1-1) of equal-typed arrays. For the most
38 ;;; part, this ends up making the R data.frame into a relational
39 ;;; building block (considering 1-1 mappings using row ID as a
40 ;;; relation). Is this a worthwhile generalization?
42 ;;; verbs vs semantics for DS conversion -- consider the possibility of
43 ;;; how adverbs and verbs relate, where to put which semantically to
44 ;;; allow for general approach.
46 ;;; eg. Kasper's talk on the FUSION collection of parsers.
48 ;;;
49 ;;; Need to consider modification APIs
50 ;;; actions are:
51 ;;; - import
52 ;;; - get/set row names (case names)
53 ;;; - column names (variable names)
54 ;;; - dataset values
55 ;;; - annotation/metadata
56 ;;; - make sure that we do coherency checking in the exported
57 ;;; - functions.
58 ;;; - ...
59 ;;; - reshapeData/reformat/reshapr a reformed version of the dataset (no
60 ;;; additional input).
61 ;;; - either overwriting or not, i.e. with or without copy.
62 ;;; - check consistency of resulting data with metadata and related
63 ;;; data information.
64 ;;; -
67 ;;; Misc Functions
(defun gen-seq (n &optional (start 1))
  "Return the ascending list START, START+1, ..., N.
START defaults to 1.  Returns NIL when N is smaller than START.

  (gen-seq 4)   ; => (1 2 3 4)
  (gen-seq 0)   ; => NIL
  (gen-seq 5 3) ; => (3 4 5)"
  ;; Walk downwards from N and cons onto the front, so the result comes
  ;; out ascending without a final reverse.  This replaces a recursive
  ;; APPEND-based version that was quadratic in N and limited by stack
  ;; depth, while producing the same elements.
  (do ((i n (- i 1))
       (acc '() (cons i acc)))
      ((< i start) acc)))
(defun strsym->indexnum (df strsym)
  "Return the zero-based position of STRSYM among the variable labels
of DF (as returned by VARLABELS), or NIL when no label matches.

STRSYM may be a string or a symbol.  Labels are compared with EQUAL so
that string labels match by content; POSITION's default EQL test only
matches the very same string object.

Probably should be a method dispatching on the type of DATAFRAME-LIKE."
  (position strsym (varlabels df) :test #'equal))
;;; REPL scratch forms exploring how to look up a label in a list.
;;; Searching with a predicate requires POSITION-IF; POSITION would
;;; treat the lambda itself as the item to find and always return NIL.
(equal 'testme 'testme)                       ; => T
(defparameter *test-pos* 'testme)
(position *test-pos* (list 'a 'b 'testme 'c)) ; => 2
(position-if #'(lambda (x) (equal x "testme"))
             (list "a" "b" "testme" "c"))     ; => 2
(position-if #'(lambda (x) (equal x 1))
             (list 2 1 3 4))                  ; => 1
94 ;;; abstract dataframe class
(defclass dataframe-like (matrix-like)
  ;; MATRIX-LIKE (from lisp-matrix) is basically a rectangular table
  ;; without storage.  This class emulates that and adds row/column
  ;; labels and within-column typing.
  ;;
  ;; No storage slot is declared here on purpose: storage is what
  ;; distinguishes the concrete subclasses and drives how getting and
  ;; setting work.  Methods are written per storage type; see
  ;; DATAFRAME-ARRAY for an example implementation.
  ;;
  ;; Everything below is metadata.  A more flexible, compact way to
  ;; store it would be welcome.
  ((case-labels
    :accessor case-labels
    :initarg :case-labels
    :initform nil
    :type list
    :documentation "Labels describing the cases (rows); documentation
metadata, possibly used for merging.")
   (var-labels
    :accessor var-labels
    :initarg :var-labels
    :initform nil
    :type list
    :documentation "Names of the variables (columns).")
   (var-types
    :accessor var-types
    :initarg :var-types
    :initform nil
    :type list
    :documentation "Type of each variable, used to check that the data
fit.")
   (documentation-string
    :accessor doc-string
    :initarg :doc
    :initform nil
    :documentation "Additional information about the dataframe-like
instance; potentially uncomputable, possibly metadata."))
  (:documentation "Abstract class for the standard statistical dataset
of independent data.  Rows are considered to be independent, matching
observations.  Columns are considered to be type-consistent, each
matching a variable with a distribution.  Inherits from the lisp-matrix
base class MATRIX-LIKE.

DATAFRAME-LIKE is the basic cases-by-variables framework.  It needs to
be embedded within other structures which allow for generalized
relations.  The goal is to ensure that relations imply and drive the
potential for statistical relativeness such as correlation,
interference, and similar concepts."))
146 ;;; Generics specialized above matrix-like, particularly for
147 ;;; dataframe-like objects. Need methods for any storage
148 ;;; implementation.
(defgeneric dataframe-dimensions (df)
  (:documentation "Return the dimensions of DF as a two-element list
\(number-of-cases number-of-variables).")
  (:method ((df dataframe-like))
    ;; Default in terms of NROWS and NCOLS, so every storage class that
    ;; implements those two gets working dimensions without writing a
    ;; further method.  For a storage-less DATAFRAME-LIKE this still
    ;; signals an error, because the NROWS/NCOLS methods on the abstract
    ;; class do.
    (list (nrows df) (ncols df))))
(defgeneric dataframe-dimension (df index)
  (:documentation "Return the extent of DF along dimension INDEX, i.e.
element INDEX of the sequence returned by DATAFRAME-DIMENSIONS."))

(defmethod dataframe-dimension ((df dataframe-like) index)
  ;; Delegates entirely to DATAFRAME-DIMENSIONS; any error it signals
  ;; propagates unchanged.
  (let ((dims (dataframe-dimensions df)))
    (elt dims index)))
(defgeneric dfref (df index1 index2 &key return-type)
  (:documentation "Scalar access to the element of DF at INDEX1
\(case/row) and INDEX2 (variable/column), with RETURN-TYPE selecting
among the possible kinds of returned object."))

(defmethod dfref ((df dataframe-like) index1 index2 &key return-type)
  ;; Fallback for classes that carry no storage: there is nothing to
  ;; index into, so always signal an error.
  (declare (ignore index1 index2 return-type))
  (error "need a real class with real storage to reference elements."))
166 ;;; Specializing on superclasses...
167 ;;; Access and Extraction: implementations needed for any storage
168 ;;; type. But here, just to point out that we've got a specializing
169 ;;; virtual subclass (DATAFRAME-LIKE specializing MATRIX-LIKE).
(defmethod nrows ((df dataframe-like))
  "Placeholder on the abstract class: always signals an error.
Each storage-backed subclass must supply its own NROWS method."
  (error "Need implementation; can't dispatch on virtual class DATAFRAME-LIKE."))
(defmethod ncols ((df dataframe-like))
  "Placeholder on the abstract class: always signals an error.
Each storage-backed subclass must supply its own NCOLS method."
  (error "Need implementation; can't dispatch on virtual class DATAFRAME-LIKE."))
179 ;; Testing consistency/coherency.
(defgeneric consistent-dataframe-p (df)
  (:documentation "Predicate checking that the data held by DF and its
metadata are mutually consistent.  Implemented per storage class."))

(defmethod consistent-dataframe-p ((df dataframe-like))
  ;; Without storage there is nothing to check against, so the abstract
  ;; class only signals an error.
  (error "Need implementation; can't dispatch on virtual class DATAFRAME-LIKE."))
(defun ensure-consistent-datatable-type (dt lot)
  "Check that every element of the two-dimensional array DT has the
type listed for its column in LOT.

DT  -- a rank-2 array, cases by variables.
LOT -- a sequence of type specifiers, one per column of DT.

Returns T when every element conforms.  Signals a TYPE-ERROR naming the
offending value and the expected type at the first mismatch."
  (destructuring-bind (n p) (array-dimensions dt)
    (dotimes (i n t)
      (dotimes (j p)
        (let ((value (aref dt i j))
              (expected (elt lot j)))
          ;; CHECK-TYPE cannot be used here: it does not evaluate its
          ;; type argument, so a computed specifier such as
          ;; (elt lot j) would be taken literally.  TYPEP evaluates it.
          (unless (typep value expected)
            (error 'type-error :datum value :expected-type expected)))))))
198 ;;; change the following to generic functions and dispatch on
199 ;;; array, matrix, and dataframe? Others?
(defun row-order-as-list (ary)
  "Return the elements of the two-dimensional array ARY as a flat list
in row-major order: all of row 0, then all of row 1, and so on.

  (row-order-as-list #2A((1 2 3) (4 5 6))) ; => (1 2 3 4 5 6)"
  (destructuring-bind (n-rows n-cols) (array-dimensions ary)
    ;; The outer loop walks rows and the inner loop walks columns; each
    ;; freshly collected row list is spliced onto the result.
    (loop for i below n-rows
          nconc (loop for j below n-cols
                      collect (aref ary i j)))))
(defun col-order-as-list (ary)
  "Return the elements of the two-dimensional array ARY as a flat list
in column-major order: all of column 0, then all of column 1, and so on.

  (col-order-as-list #2A((1 2 3) (4 5 6))) ; => (1 4 2 5 3 6)"
  (destructuring-bind (n-rows n-cols) (array-dimensions ary)
    ;; The outer loop walks columns and the inner loop walks rows; each
    ;; freshly collected column list is spliced onto the result.
    (loop for j below n-cols
          nconc (loop for i below n-rows
                      collect (aref ary i j)))))
(defun transpose-array (ary)
  "Return a fresh M x N array that is the transpose of the N x M array
ARY: element (j, i) of the result is element (i, j) of ARY.  The result
has the same element type as ARY; ARY itself is not modified."
  (destructuring-bind (n m) (array-dimensions ary)
    (let ((result (make-array (list m n)
                              :element-type (array-element-type ary))))
      ;; Fill element by element.  A flat list cannot be handed to
      ;; :INITIAL-CONTENTS, which expects one nested sequence per row.
      (dotimes (i n result)
        (dotimes (j m)
          (setf (aref result j i) (aref ary i j)))))))
224 ;;; THE FOLLOWING 2 dual-sets done to provide error checking
225 ;;; possibilities on top of the generic function structure. Not
226 ;;; intended as make-work!
(defun varlabels (df)
  "Reader for the variable (column) labels of the DATAFRAME-LIKE DF.
Currently a plain pass-through to the VAR-LABELS accessor; kept as a
separate function so that error checking can be added later."
  (var-labels df))
(defun set-varlabels (df vl)
  "Replace the variable labels of the DATAFRAME-LIKE DF with the list VL
and return VL.  Signals an error unless VL has exactly as many elements
as the labels currently stored in DF."
  (let ((current-count (length (var-labels df)))
        (new-count (length vl)))
    ;; Guard first, then store.  Note that the comparison is against the
    ;; existing label list, not against the data dimensions.
    (unless (= current-count new-count)
      (error "wrong size."))
    (setf (var-labels df) vl)))

;; Makes (setf (varlabels df) new-labels) go through the checked setter.
(defsetf varlabels set-varlabels)
241 ;;; Case-name handling for Tables. Needs error checking.
(defun caselabels (df)
  "Reader for the case (row) labels of the DATAFRAME-LIKE DF.
Currently a plain pass-through to the CASE-LABELS accessor; kept as a
separate function so that error checking can be added later."
  (case-labels df))
(defun set-caselabels (df cl)
  "Replace the case labels of the DATAFRAME-LIKE DF with the list CL and
return CL.  Signals an error unless CL has exactly as many elements as
the labels currently stored in DF."
  (let ((current-count (length (case-labels df)))
        (new-count (length cl)))
    ;; Guard first, then store.  Note that the comparison is against the
    ;; existing label list, not against the data dimensions.
    (unless (= current-count new-count)
      (error "wrong size."))
    (setf (case-labels df) cl)))

;; Makes (setf (caselabels df) new-labels) go through the checked setter.
(defsetf caselabels set-caselabels)
255 ;;;;;;;;;;;; IMPLEMENTATIONS, with appropriate methods.
257 ;; See also:
258 ;; (documentation 'dataframe-like 'type)
(defclass dataframe-array (dataframe-like)
  ((store
    :accessor dataset
    :initarg :storage
    :initform nil
    :type (array * *)
    :documentation "Data storage: typed as array."))
  (:documentation "Example implementation of DATAFRAME-LIKE whose
storage is a plain Lisp array, supplied with the :STORAGE initarg and
read through the DATASET accessor.  An obvious alternative would be a
dataframe-matrix-like built on the lisp-matrix classes."))
(defmethod nrows ((df dataframe-array))
  "Number of cases in DF: the extent of the stored array along axis 0."
  (let ((storage (dataset df)))
    (array-dimension storage 0)))
(defmethod ncols ((df dataframe-array))
  "Number of variables in DF: the extent of the stored array along axis 1."
  (let ((storage (dataset df)))
    (array-dimension storage 1)))
(defmethod consistent-dataframe-p ((ds dataframe-array))
  "Return true when the DATAFRAME-ARRAY DS is internally consistent with
its metadata, NIL otherwise.

Checks performed:
  - the number of variable labels equals the number of columns;
  - the number of case labels equals the number of rows;
  - when variable types are recorded, there is one per column and every
    stored value satisfies the type of its column.

A dataframe with no recorded variable types (VAR-TYPES is NIL) is
treated as untyped and passes the type check."
  (let ((n-cases (nrows ds))
        (n-vars (ncols ds))
        (types (var-types ds))
        (data (dataset ds)))
    (and
     ;; ensure dimensionality
     (= n-vars (length (var-labels ds)))
     (= n-cases (length (case-labels ds)))
     ;; when dims are sane, check the type of each variable
     (or (null types)
         (and (= n-vars (length types))
              (loop for j below n-vars
                    for type in types
                    ;; storage is indexed (case, variable)
                    always (loop for i below n-cases
                                 always (typep (aref data i j) type))))))))
(defun testecase (s)
  "Scratch function exploring ECASE with single and multiple keys.
Returns 1 for the symbol SCALAR, 2 for ASD or ASDF, and signals a
TYPE-ERROR for anything else."
  (ecase s
    ((scalar) 1)
    ((asd asdf) 2)))

(testecase 'scalar) ; => 1
(testecase 'asd)    ; => 2
(testecase 'asdf)   ; => 2
;; An unlisted key makes ECASE signal an error.  Trap it here so that
;; this demonstration does not abort loading the file.
(handler-case (testecase 'as)
  (type-error () :no-matching-clause))
(defun %dfref-array-cell (df row col return-type)
  "Shared worker for the DFREF methods on DATAFRAME-ARRAY.
ROW and COL are zero-based indices into the storage of DF.

RETURN-TYPE selects the result:
  SCALAR or :SCALAR       -- the stored element itself;
  DATAFRAME or :DATAFRAME -- a fresh 1x1 DATAFRAME-ARRAY holding that
                             element, with the matching case label,
                             variable label and variable type.

Any other RETURN-TYPE makes ECASE signal an error."
  (ecase return-type
    ((scalar :scalar)
     (aref (dataset df) row col))
    ((dataframe :dataframe)
     (flet ((singleton (items index)
              ;; One-element list holding ITEMS[INDEX], or NIL when the
              ;; source dataframe carries no such metadata at all.
              (when items
                (list (nth index items)))))
       (make-instance 'dataframe-array
                      ;; :INITIAL-ELEMENT rather than :INITIAL-CONTENTS,
                      ;; which would need a nested ((x)) structure.
                      :storage (make-array
                                (list 1 1)
                                :initial-element (aref (dataset df) row col))
                      :doc (doc-string df)
                      :case-labels (singleton (caselabels df) row)
                      :var-labels (singleton (varlabels df) col)
                      ;; the type spec is inherited from the dataframe
                      ;; we are selecting from
                      :var-types (singleton (var-types df) col))))))

(defmethod dfref ((df dataframe-array) (index1 number) (index2 number)
                  &key (return-type 'scalar))
  "Return the element of DF at row/case INDEX1 and column/variable
INDEX2, in the same vein as aref, mref, vref, etc.

RETURN-TYPE defaults to SCALAR, so a bare (dfref df i j) yields the
stored value.  Pass DATAFRAME (or :DATAFRAME) to get a 1x1
DATAFRAME-ARRAY instead."
  (%dfref-array-cell df index1 index2 return-type))

(defmethod dfref ((df dataframe-array) (index1 string) (index2 string)
                  &key (return-type 'scalar))
  "Like the numeric DFREF method, but INDEX1 names a case label and
INDEX2 names a variable label.  Labels are compared with EQUAL.
Signals an error when either label is not present in DF.
Merge with the index-as-number variant?"
  (let ((row (position index1 (caselabels df) :test #'equal))
        (col (position index2 (varlabels df) :test #'equal)))
    (unless row
      (error "no case labelled ~S in dataframe." index1))
    (unless col
      (error "no variable labelled ~S in dataframe." index2))
    (%dfref-array-cell df row col return-type)))
(defun dfref-var (df index return-type)
  "Return the data of a single variable (column INDEX, zero-based) of DF.

RETURN-TYPE selects the representation:
  LIST or :LIST -- a list of the column's values, one per case, in case
                   order.
The remaining types (VECTOR / :VECTOR, :VECTOR-LIKE, :MATRIX-LIKE,
:DATAFRAME) are placeholders that are not implemented yet and simply
return T.  Any other RETURN-TYPE makes ECASE signal an error."
  (ecase return-type
    ;; ECASE keys are not evaluated, so they must be written unquoted.
    ((list :list)
     ;; A variable is a column: the case index varies, INDEX stays
     ;; fixed.  Indices are zero-based, hence BELOW.
     (loop for case-index below (nrows df)
           collect (dfref df case-index index :return-type 'scalar)))
    ((vector :vector) t)
    (:vector-like t)
    (:matrix-like t)
    (:dataframe t)))
(defun dfref-case (df index return-type)
  "Return the data of a single case (row INDEX, zero-based) of DF.

RETURN-TYPE selects the representation:
  :LIST or LIST -- a list of the row's values, one per variable, in
                   variable order.
The remaining types (:VECTOR, :VECTOR-LIKE, :MATRIX-LIKE, :DATAFRAME)
are placeholders that are not implemented yet and simply return T.
Any other RETURN-TYPE makes ECASE signal an error."
  (ecase return-type
    ((:list list)
     ;; A case is a row: INDEX stays fixed, the variable index varies.
     ;; Indices are zero-based, hence BELOW.
     (loop for var-index below (ncols df)
           collect (dfref df index var-index :return-type 'scalar)))
    (:vector t)
    (:vector-like t)
    (:matrix-like t)
    (:dataframe t)))
382 ;; FIXME
(defun dfref-2indexlist (df indexlist1 indexlist2 &key (return-type :array))
  "Extract the sub-table of DF selected by two lists of zero-based
numeric indices: INDEXLIST1 picks cases (rows), INDEXLIST2 picks
variables (columns).  Rows and columns appear in the result in the
order given by the index lists.

RETURN-TYPE selects the result:
  :ARRAY (default) -- a fresh array of dimensions
                      (length INDEXLIST1) x (length INDEXLIST2);
  :DATAFRAME       -- a fresh DATAFRAME-ARRAY around such an array,
                      with case labels, variable labels and variable
                      types subset to match.
Any other RETURN-TYPE returns NIL."
  (flet ((extract ()
           ;; Fill the result element by element; a flat list cannot be
           ;; handed to :INITIAL-CONTENTS for a two-dimensional array.
           (let ((result (make-array (list (length indexlist1)
                                           (length indexlist2)))))
             (loop for x in indexlist1
                   for i from 0
                   do (loop for y in indexlist2
                            for j from 0
                            do (setf (aref result i j)
                                     (dfref df x y :return-type 'scalar))))
             result))
         (subset (items indexes)
           ;; Pick the metadata entries matching INDEXES, or NIL when
           ;; DF carries no such metadata.
           (when items
             (mapcar #'(lambda (k) (nth k items)) indexes))))
    (case return-type
      (:array
       (extract))
      (:dataframe
       (make-instance 'dataframe-array
                      :storage (extract)
                      :doc (doc-string df)
                      :case-labels (subset (case-labels df) indexlist1)
                      :var-labels (subset (var-labels df) indexlist2)
                      :var-types (subset (var-types df) indexlist2))))))
407 ;;; Do we establish methods for dataframe-like, which specialize to
408 ;;; particular instances of storage?
(defmethod print-object ((object dataframe-array) stream)
  "Print OBJECT to STREAM as an unreadable object: first its dimensions
\(rows x columns), then a line of tab-separated variable labels, then
one line per case holding the case label followed by that case's
values, each preceded by a tab."
  (print-unreadable-object (object stream :type t)
    (format stream " ~d x ~d" (nrows object) (ncols object))
    (terpri stream)
    ;; header line: variable labels
    (dotimes (j (ncols object))
      (write-char #\tab stream)
      (format stream "~A~T" (nth j (var-labels object))))
    ;; body: one line per case
    (dotimes (i (nrows object))
      (terpri stream)
      (format stream "~A:~T" (nth i (case-labels object)))
      (dotimes (j (ncols object))
        (write-char #\tab stream)
        ;; Read the storage directly.  Going through DFREF without a
        ;; :RETURN-TYPE would depend on that argument's default, and a
        ;; printer must never signal.
        (write (aref (dataset object) i j) :stream stream)))))
(defun print-structure-relational (ds)
  "Sketch of how printing should look for a relational storage
structure: something like a graph of spreadsheets.  For every relation
of DS, print its variable labels as a header row, then one row per
case made of the case label followed by that case's observation."
  (dolist (relation-key (relations ds))
    (let ((relation (getRelation ds relation-key)))
      (print-as-row (var-labels relation))
      ;; Walk the case labels together with a running zero-based row
      ;; number used to fetch the matching observation.
      (loop for label in (case-labels relation)
            for row-index from 0
            do (print-as-row
                (cons label
                      (dfref-obsn (dataset relation) row-index)))))))