more clean up for dataframe -- description of general array storage, methods and...
[CommonLispStat.git] / src / data / data-clos.lisp
blob49a2848b7f83409ab0215dbb8a9cd1c76971876d
1 ;;; -*- mode: lisp -*-
3 ;;; Time-stamp: <2009-03-24 08:14:16 tony>
4 ;;; Creation: <2008-03-12 17:18:42 blindglobe@gmail.com>
5 ;;; File: data-clos.lisp
6 ;;; Author: AJ Rossini <blindglobe@gmail.com>
7 ;;; Copyright: (c)2008, AJ Rossini. BSD, LLGPL, or GPLv2, depending
8 ;;; on how it arrives.
10 ;;; Purpose: Data packaging and access for Common Lisp Statistics.
11 ;;; This redoes data storage structures in a CLOS based
12 ;;; framework.
13 ;;;
15 ;;; What is this talk of 'release'? Klingons do not make software
16 ;;; 'releases'. Our software 'escapes', leaving a bloody trail of
17 ;;; designers and quality assurance people in its wake.
19 (in-package :lisp-stat-data-clos)
21 ;;; No real basis for work, there is a bit of new-ness and R-ness to
22 ;;; this work. In particular, the notion of relation is key and
23 ;;; integral to the analysis. Tables are related and matched vectors,
24 ;;; for example. "column" vectors are related observations (by
25 ;;; measure/recording) while "row" vectors are related readings (by
26 ;;; case)
27 ;;;
29 ;;; Relational structure -- can we capture a completely unnormalized
30 ;;; data structure to propose possible modeling approaches, and
31 ;;; propose appropriate models and inferential strategies?
32 ;;;
33 ;;; So we want a verb-driven API for data collection construction. We
34 ;;; should encode independence or lack of, as possible.
36 ;; Need to figure out typed vectors. We then map a series of typed
37 ;; vectors over to tables where columns are equal typed. In a sense,
38 ;; this is a relation (1-1) of equal-typed arrays. For the most part,
39 ;; this ends up making the R data.frame into a relational building
40 ;; block (considering 1-1 mappings using row ID as a relation).
41 ;; Is this a worthwhile generalization?
43 ;;; verbs vs semantics for DS conversion -- consider the possibility of
44 ;;; how adverbs and verbs relate, where to put which semantically to
45 ;;; allow for general approach.
47 ;;; eg. Kasper's talk on the FUSION collection of parsers.
49 ;;;
50 ;;; Need to consider modification APIs
51 ;;; actions are:
52 ;;; - import
53 ;;; - get/set row names (case names)
54 ;;; - column names (variable names)
55 ;;; - dataset values
56 ;;; - annotation/metadata
57 ;;; - make sure that we do coherency checking in the exported
58 ;;; - functions.
59 ;;; - ...
60 ;;; - reshapeData/reformat/reshapr a reformed version of the dataset (no
61 ;;; additional input).
62 ;;; - either overwriting or not, i.e. with or without copy.
63 ;;; - check consistency of resulting data with metadata and related
64 ;;; data information.
65 ;;; -
68 ;;;; Misc Functions
(defun gen-seq (n &optional (start 1))
  "Return the list START, START+1, ..., N (inclusive) in ascending order.

N     -- upper bound of the sequence.
START -- lower bound, defaults to 1.

Returns NIL when N is smaller than START.

  (gen-seq 4)   ; => (1 2 3 4)
  (gen-seq 0)   ; => NIL
  (gen-seq 5 3) ; => (3 4 5)"
  ;; Walk downwards from N, consing onto the front, so the list comes
  ;; out ascending without APPEND (which re-copied the accumulated list
  ;; on every step) and without one stack frame per element.
  (do ((i n (- i 1))
       (acc '() (cons i acc)))
      ((< i start) acc)))
79 ;;;; abstract dataframe class
(defclass dataframe-like (matrix-like)
  (;; MATRIX-LIKE (from lisp-matrix) is essentially a rectangular table
   ;; with no storage of its own.  This class mirrors that and layers
   ;; row/column labels and per-column typing on top.
   ;;
   ;; There is deliberately no STORE slot here: the storage component
   ;; is what distinguishes concrete subclasses and determines how
   ;; getting/setting works.  Methods are written per storage type; see
   ;; DATAFRAME-ARRAY for an example implementation.
   ;;
   ;; Everything below is metadata.  A more flexible, compact way of
   ;; storing it would be welcome.
   (case-labels
    :accessor case-labels
    :initarg :case-labels
    :initform nil
    :type list
    :documentation "labels used for describing cases (doc
    metadata), possibly used for merging.")
   (var-labels
    :accessor var-labels
    :initarg :var-labels
    :initform nil
    :type list
    :documentation "Variable names.")
   (var-types
    :accessor var-types
    :initarg :var-types
    :initform nil
    :type list
    :documentation "variable types to ensure fit")
   (documentation-string
    :accessor doc-string
    :initarg :doc
    :initform nil
    :documentation "additional information,
    potentially uncomputable, possibly metadata, about dataframe-like
    instance."))
  (:documentation "Abstract class for standard statistical analysis
                   dataset for independent data.  Rows are considered
                   to be independent, matching observations.  Columns
                   are considered to be type-consistent, match a
                   variable with distribution.  inherits from
                   lisp-matrix base MATRIX-LIKE class.

                   DATAFRAME-LIKE is the basic cases by variables
                   framework.  Need to embed this within other
                   structures which allow for generalized relations.
                   Goal is to ensure that relations imply and drive
                   the potential for statistical relativeness such as
                   correlation, interference, and similar concepts."))
131 ;;; Generics specialized above matrix-like, particularly for
132 ;;; dataframe-like objects. Need methods for any storage
133 ;;; implementation.
(defgeneric dfref (df index1 index2 &key return-type)
  (:documentation
   "Scalar element access into DF at INDEX1, INDEX2; RETURN-TYPE selects
among the possible kinds of returned object.")
  ;; Fallback for the storage-less abstract class: nothing can be
  ;; looked up, so the only thing this method does is signal an error.
  (:method ((df dataframe-like) index1 index2 &key return-type)
    (declare (ignorable df index1 index2 return-type))
    (error "need a real class with real storage to reference elements.")))
141 ;;; Access and Extraction: implementations needed for any storage
142 ;;; type. But here, just to point out that we've got a specializing
143 ;;; virtual subclass (DATAFRAME-LIKE specializing MATRIX-LIKE).
(defmethod nrows ((df dataframe-like))
  "Row count for the abstract DATAFRAME-LIKE class.  There is no storage
to measure at this level, so this method always signals an error."
  (declare (ignorable df))
  (error "Need implementation; can't dispatch on virtual class."))
(defmethod ncols ((df dataframe-like))
  "Column count for the abstract DATAFRAME-LIKE class.  There is no
storage to measure at this level, so this method always signals an error."
  (declare (ignorable df))
  (error "Need implementation; can't dispatch on virtual class."))
153 ;; Testing consistency/coherency.
(defgeneric consistent-dataframe-like-p (df)
  (:documentation
   "Predicate protocol for checking that DF is internally consistent.")
  ;; The abstract class has no storage to check against, so its method
  ;; does nothing but signal an error.
  (:method ((df dataframe-like))
    (declare (ignorable df))
    (error "need a real class with real storage to reference elements.")))
(defun ensure-consistent-datatable-type (dt lot)
  "Check that every element of the two-dimensional array DT has the type
listed for its column in LOT.

DT  -- a rank-2 array, indexed (row, column).
LOT -- a sequence of type specifiers, one per column of DT.

Returns T when every element satisfies its column's type.  Signals a
TYPE-ERROR (datum = offending element, expected-type = the column's
type specifier) at the first mismatch."
  (destructuring-bind (n p) (array-dimensions dt)
    (dotimes (i n t)
      (dotimes (j p)
        (let ((value (aref dt i j))
              (expected (elt lot j)))
          ;; CHECK-TYPE is a macro that does not evaluate its type
          ;; argument, so a type computed at run time has to go
          ;; through TYPEP instead.
          (unless (typep value expected)
            (error 'type-error :datum value :expected-type expected)))))))
172 ;;; change the following to generic functions and dispatch on
173 ;;; array, matrix, and dataframe? Others?
(defun row-order-as-list (ary)
  "Return the elements of the two-dimensional array ARY as a flat list in
row-major order: all of row 0, then all of row 1, and so on.

  (row-order-as-list #2A((1 2 3) (4 5 6))) ; => (1 2 3 4 5 6)"
  (let ((nrows (array-dimension ary 0))
        (ncols (array-dimension ary 1)))
    ;; Collect directly: APPEND is non-destructive, so calling it for
    ;; effect (as the loop body once did) accumulates nothing.
    (loop for i below nrows
          nconc (loop for j below ncols
                      collect (aref ary i j)))))
(defun col-order-as-list (ary)
  "Return the elements of the two-dimensional array ARY as a flat list in
column-major order: all of column 0, then all of column 1, and so on.

  (col-order-as-list #2A((1 2 3) (4 5 6))) ; => (1 4 2 5 3 6)"
  (let ((nrows (array-dimension ary 0))
        (ncols (array-dimension ary 1)))
    ;; Outer loop over columns, inner loop over rows, collecting as we
    ;; go (APPEND called for effect would accumulate nothing).
    (loop for j below ncols
          nconc (loop for i below nrows
                      collect (aref ary i j)))))
(defun transpose-array (ary)
  "Return a fresh M x N array that is the transpose of the N x M array ARY.

Element (i, j) of ARY becomes element (j, i) of the result.  The result
has the same element type as ARY; ARY itself is not modified."
  (destructuring-bind (n m) (array-dimensions ary)
    (let ((result (make-array (list m n)
                              :element-type (array-element-type ary))))
      ;; Fill cell by cell: :INITIAL-CONTENTS for a rank-2 array needs a
      ;; nested sequence, so a flat element list cannot be used there.
      (dotimes (i n result)
        (dotimes (j m)
          (setf (aref result j i) (aref ary i j)))))))
199 ;;;; THE FOLLOWING 2 dual-sets done to provide error checking
200 ;;;; possibilities. Not intended as make-work!
(defun varNames (ds)
  "Reader for the variable (column) names of DS: returns whatever
VAR-LABELS returns for it.  No validation is done here yet."
  (let ((names (var-labels ds)))
    names))
(defun set-varNames (ds vN)
  "Writer for the variable (column) names of DS.

VN replaces the current VAR-LABELS only when it has the same length as
the labels already stored; otherwise an error is signalled.  Returns
the value of the SETF, i.e. VN."
  (let ((current-count (length (var-labels ds)))
        (proposed-count (length vN)))
    (unless (= current-count proposed-count)
      (error "wrong size."))
    (setf (var-labels ds) vN)))

;; Lets callers write (setf (varNames ds) new-names).
(defsetf varNames set-varNames)
215 ;;; Case-name handling for Tables. Needs error checking.
(defun caseNames (ds)
  "Reader for the case (row) names of DS: returns whatever CASE-LABELS
returns for it.  No validation is done here yet."
  (let ((names (case-labels ds)))
    names))
(defun set-caseNames (ds vN)
  "Writer for the case (row) names of DS.

VN replaces the current CASE-LABELS only when it has the same length as
the labels already stored; otherwise an error is signalled.  Returns
the value of the SETF, i.e. VN."
  (let ((current-count (length (case-labels ds)))
        (proposed-count (length vN)))
    (unless (= current-count proposed-count)
      (error "wrong size."))
    (setf (case-labels ds) vN)))

;; Lets callers write (setf (caseNames ds) new-names).
(defsetf caseNames set-caseNames)
229 ;;;;;;;;;;;; IMPLEMENTATIONS, with appropriate methods.
231 ;; See also:
232 ;; (documentation 'dataframe-like 'type)
(defclass dataframe-array (dataframe-like)
  ((store :initform nil
          :initarg :storage
          ;; NIL is the "no data yet" default, so it must be an allowed
          ;; value: storing a value outside a slot's declared :TYPE has
          ;; undefined consequences.
          :type (or null (array * *))
          :accessor dataset
          :documentation "Data storage: a Lisp array, or NIL when no
          data has been supplied."))
  (:documentation "example implementation of dataframe-like using storage
  based on lisp arrays.  An obvious alternative could be a
  dataframe-matrix-like which uses the lisp-matrix classes."))
(defmethod nrows ((df dataframe-array))
  "Number of rows of DF: the size of dimension 0 of its DATASET array."
  (let ((storage (dataset df)))
    (array-dimension storage 0)))
(defmethod ncols ((df dataframe-array))
  "Number of columns of DF: the size of dimension 1 of its DATASET array."
  (let ((storage (dataset df)))
    (array-dimension storage 1)))
(defmethod consistent-dataframe-like-p ((ds dataframe-array))
  "Test that DS is internally consistent with its metadata.

Checks, in order:
  1. the number of columns equals the number of VAR-LABELS and the
     number of rows equals the number of CASE-LABELS;
  2. when VAR-TYPES is non-NIL, it has one entry per column and every
     stored element is TYPEP of its column's type.

An empty (NIL) VAR-TYPES is treated as \"no type constraints\" and the
second check is skipped.  Returns T when consistent, NIL otherwise."
  (let ((n-rows (nrows ds))
        (n-cols (ncols ds))
        (types (var-types ds))
        (data (dataset ds)))
    (and
     ;; ensure dimensionality
     (= n-cols (length (var-labels ds)))
     (= n-rows (length (case-labels ds)))
     ;; when dims sane, check the type of each variable; storage is
     ;; indexed (row, column), and the TYPEP results must feed the
     ;; answer rather than being thrown away.
     (or (null types)
         (and (= n-cols (length types))
              (loop for j below n-cols
                    for type in types
                    always (loop for i below n-rows
                                 always (typep (aref data i j) type)))))
     t)))
(defmethod dfref ((df dataframe-array) index1 index2
                  &key (return-type 'scalar))
  "Return the element of DF at row/case INDEX1 and column/variable
INDEX2, in the same vein as aref, mref, vref, etc.

RETURN-TYPE selects the shape of the result and defaults to SCALAR:
  SCALAR or :SCALAR       -- the bare element;
  DATAFRAME or :DATAFRAME -- a fresh 1x1 DATAFRAME-ARRAY holding the
                             element, with the matching case label,
                             variable label and variable type.
Any other RETURN-TYPE signals an error (via ECASE)."
  (ecase return-type
    ((scalar :scalar)
     (aref (dataset df) index1 index2))
    ((dataframe :dataframe)
     (make-instance 'dataframe-array
                    ;; :INITIAL-ELEMENT, because :INITIAL-CONTENTS of a
                    ;; rank-2 array must be a nested sequence, not a
                    ;; scalar.
                    :storage (make-array
                              (list 1 1)
                              :initial-element (aref (dataset df) index1 index2))
                    :doc (doc-string df)
                    ;; The label slots are typed LIST, so wrap the
                    ;; single selected label.
                    :case-labels (list (nth index1 (caseNames df)))
                    :var-labels (list (nth index2 (varNames df)))
                    ;; should the type spec assume, as below, or
                    ;; should it be inherited from the dataframe we
                    ;; are selecting from?  Left NIL when the source
                    ;; has no type information.
                    :var-types (when (var-types df)
                                 (list (nth index2 (var-types df))))))))
(defun dfref-var (df index return-type)
  "Return the data of the single variable (column) INDEX of DF.

RETURN-TYPE selects the container:
  :LIST or LIST -- a list of the column's elements, one per row, in
                   row order.
  :VECTOR, VECTOR, :VECTOR-LIKE, :MATRIX-LIKE, :DATAFRAME
                -- not implemented yet; these simply return T.
Any other RETURN-TYPE signals an error (via ECASE)."
  (ecase return-type
    ;; Keys are plain symbols: a quoted key such as ('list) would be
    ;; read as the list (QUOTE LIST) and never match.
    ((:list list)
     ;; Rows are 0-based; the row index is the first argument to DFREF.
     (loop for row below (nrows df)
           collect (dfref df row index :return-type 'scalar)))
    ((:vector vector) t)
    (:vector-like t)
    (:matrix-like t)
    (:dataframe t)))
(defun dfref-obsn (df index return-type)
  "Return the data of the single observation (row) INDEX of DF.

RETURN-TYPE selects the container:
  :LIST -- a list of the row's elements, one per column, in column
           order.
  :VECTOR, :VECTOR-LIKE, :MATRIX-LIKE, :DATAFRAME
        -- not implemented yet; these simply return T.
Any other RETURN-TYPE signals an error (via ECASE)."
  (ecase return-type
    (:list
     ;; Columns are 0-based; the row index is the first argument to
     ;; DFREF and the column index the second.
     (loop for col below (ncols df)
           collect (dfref df index col :return-type 'scalar)))
    (:vector t)
    (:vector-like t)
    (:matrix-like t)
    (:dataframe t)))
316 ;; FIXME
(defun dfref-2indexlist (df indexlist1 indexlist2 &key (return-type :array))
  "Extract the sub-table of DF made of the rows listed in INDEXLIST1 and
the columns listed in INDEXLIST2 (both lists of 0-based indices).

RETURN-TYPE selects the result:
  :ARRAY (default) -- a fresh (length INDEXLIST1) x (length INDEXLIST2)
                      array of the selected elements;
  :DATAFRAME       -- a fresh DATAFRAME-ARRAY wrapping that array, with
                      case labels, variable labels and variable types
                      subsetted to the selected rows/columns and the
                      doc string carried over.
Any other RETURN-TYPE returns NIL."
  (flet ((extract-cells ()
           ;; Fill the result cell by cell; a flat list is not valid
           ;; :INITIAL-CONTENTS for a rank-2 array.
           (let ((result (make-array (list (length indexlist1)
                                           (length indexlist2)))))
             (loop for row in indexlist1
                   for i from 0
                   do (loop for col in indexlist2
                            for j from 0
                            do (setf (aref result i j)
                                     (dfref df row col :return-type 'scalar))))
             result))
         (pick (indices items)
           ;; Subset ITEMS by INDICES; absent metadata stays NIL.
           (when items
             (mapcar #'(lambda (k) (nth k items)) indices))))
    (case return-type
      (:array (extract-cells))
      (:dataframe
       (make-instance 'dataframe-array
                      :storage (extract-cells)
                      :doc (doc-string df)
                      :case-labels (pick indexlist1 (case-labels df))
                      :var-labels (pick indexlist2 (var-labels df))
                      :var-types (pick indexlist2 (var-types df)))))))
341 ;;; Do we establish methods for dataframe-like, which specialize to
342 ;;; particular instances of storage?
(defmethod print-object ((object dataframe-array) stream)
  "Print OBJECT unreadably as its type, its \"rows x cols\" size, a header
line of tab-separated variable labels, and then one line per case: the
case label followed by the tab-separated values of that row."
  (print-unreadable-object (object stream :type t)
    (format stream " ~d x ~d" (nrows object) (ncols object))
    (terpri stream)
    ;; (format stream "~T ~{~S ~T~}" (var-labels object))
    (dotimes (j (ncols object))
      (write-char #\tab stream)
      (format stream "~A~T" (nth j (var-labels object))))
    (dotimes (i (nrows object))
      (terpri stream)
      (format stream "~A:~T" (nth i (case-labels object)))
      (dotimes (j (ncols object))
        ;; (write-char #\space stream)
        (write-char #\tab stream)
        ;; Ask for the scalar explicitly so printing does not depend on
        ;; what DFREF does when no RETURN-TYPE is given.
        (write (dfref object i j :return-type 'scalar) :stream stream)))))
(defun print-structure-relational (ds)
  "example of what we want the methods to look like.  Should be sort
of like a graph of spreadsheets if the storage is a relational
structure.

For each relation K of DS, prints the relation's variable labels as a
header row and then one row per case: the case label followed by that
case's values.

NOTE(review): RELATIONS, GETRELATION and PRINT-AS-ROW are not defined
in the visible part of this file -- confirm they exist before relying
on this function."
  (dolist (k (relations ds))
    (let ((current-relation-set (getRelation ds k)))
      (print-as-row (var-labels current-relation-set))
      (loop for label in (case-labels current-relation-set)
            for row from 0
            ;; DFREF-OBSN takes the dataframe itself (not its raw
            ;; storage), a row index and a return type.
            do (print-as-row
                (cons label
                      (dfref-obsn current-relation-set row :list)))))))