fixed print-method for dataframes. Now it sort-o works.
[CommonLispStat.git] / src / data / data-clos.lisp
blob15d5971acfadcfc1cf41ffe05a63234b3124037f
1 ;;; -*- mode: lisp -*-
3 ;;; Time-stamp: <2009-03-23 17:47:38 tony>
4 ;;; Creation: <2008-03-12 17:18:42 blindglobe@gmail.com>
5 ;;; File: data-clos.lisp
6 ;;; Author: AJ Rossini <blindglobe@gmail.com>
7 ;;; Copyright: (c)2008, AJ Rossini. BSD, LLGPL, or GPLv2, depending
8 ;;; on how it arrives.
10 ;;; Purpose: Data packaging and access for Common Lisp Statistics.
11 ;;; This redoes data storage structures in a CLOS based
12 ;;; framework.
13 ;;;
15 ;;; What is this talk of 'release'? Klingons do not make software
16 ;;; 'releases'. Our software 'escapes', leaving a bloody trail of
17 ;;; designers and quality assurance people in its wake.
19 (in-package :lisp-stat-data-clos)
21 ;;; No real basis for work, there is a bit of new-ness and R-ness to
22 ;;; this work. In particular, the notion of relation is key and
23 ;;; integral to the analysis. Tables are related and matched vectors,
24 ;;; for example. "column" vectors are related observations (by
25 ;;; measure/recording) while "row" vectors are related readings (by
26 ;;; case)
27 ;;;
29 ;;; Relational structure -- can we capture a completely unnormalized
30 ;;; data structure to propose possible modeling approaches, and
31 ;;; propose appropriate models and inferential strategies?
32 ;;;
33 ;;; So we want a verb-driven API for data collection construction. We
34 ;;; should encode independence or lack of, as possible.
36 ;; Need to figure out typed vectors. We then map a series of typed
37 ;; vectors over to tables where columns are equal typed. In a sense,
38 ;; this is a relation (1-1) of equal-typed arrays. For the most part,
39 ;; this ends up making the R data.frame into a relational building
40 ;; block (considering 1-1 mappings using row ID as a relation).
41 ;; Is this a worthwhile generalization?
43 ;;; verbs vs semantics for DS conversion -- consider the possibility of
44 ;;; how adverbs and verbs relate, where to put which semantically to
45 ;;; allow for general approach.
47 ;;; eg. Kasper's talk on the FUSION collection of parsers.
49 ;;;
50 ;;; Need to consider modification APIs
51 ;;; actions are:
52 ;;; - import
53 ;;; - get/set row names (case names)
54 ;;; - column names (variable names)
55 ;;; - dataset values
56 ;;; - annotation/metadata
57 ;;; - make sure that we do coherency checking in the exported
58 ;;;   functions.
59 ;;; - ...
60 ;;; - reshapeData/reformat/reshape: produce a reformed version of the dataset (no
61 ;;;   additional input).
62 ;;; - either overwriting or not, i.e. with or without copy.
63 ;;; - check consistency of resulting data with metadata and related
64 ;;; data information.
65 ;;; -
(defclass dataframe-like (matrix-like)
  (;; STORE is the storage component.  It is deliberately NOT declared
   ;; in the DATAFRAME-LIKE class, as it is the primary differentiator
   ;; between implementations, driving how access (getting/setting) is
   ;; done.  Methods are created depending on the storage component,
   ;; which access data as appropriate.  See DATAFRAME-ARRAY for an
   ;; example implementation.
   ;;
   ;; The rest of this is metadata.  In particular, we should find a
   ;; more flexible, compact way to store this.
   (case-labels :initform nil
                :initarg :case-labels
                :type list
                :accessor case-labels
                :documentation "labels used for describing cases (doc
                                metadata), possibly used for merging.")
   (var-labels :initform nil
               :initarg :var-labels
               :type list
               :accessor var-labels
               :documentation "Variable names.")
   (var-types :initform nil
              :initarg :var-types
              :type list
              :accessor var-types
              :documentation "variable types to ensure fit")
   (documentation-string :initform nil
                         :initarg :doc
                         :accessor doc-string
                         :documentation "additional information,
                                         potentially uncomputable, about dataframe-like instance."))
  (:documentation "Abstract class for standard statistical analysis
                   dataset for independent data.  Rows are considered
                   to be independent, matching observations.  Columns
                   are considered to be type-consistent, match a
                   variable with distribution.  Inherits from
                   lisp-matrix base MATRIX-LIKE class.

                   DATAFRAME-LIKE is the basic cases by variables
                   framework.  Need to embed this within other
                   structures which allow for generalized relations.
                   Goal is to ensure that relations imply and drive
                   the potential for statistical relativeness such as
                   correlation, interference, and similar concepts."))
112 ;;; Access and Extraction
(defun dfref (df idx1 idx2 &key (type :scalar))
  "Return the element of DF at row/case IDX1 and column/variable IDX2,
in the same vein as aref, mref, vref, etc.

TYPE selects the shape of the result:
  :scalar    -- the raw element read from (dataset df) (default).
  :dataframe -- a fresh 1x1 DATAFRAME-ARRAY holding that element, with
                the documentation string copied and the case label,
                variable label and variable type narrowed to the
                selected row/column.
Any other TYPE yields NIL."
  (flet ((pick (position items)
           ;; Metadata slots are typed LIST, so wrap the selected
           ;; entry in a one-element list; absent metadata stays NIL.
           (when items
             (list (nth position items)))))
    (case type
      (:scalar (aref (dataset df) idx1 idx2))
      (:dataframe
       (make-instance 'dataframe-array
                      ;; :initial-contents of a 1x1 array must be a
                      ;; nested sequence ((x)), not the bare scalar.
                      :storage (make-array
                                (list 1 1)
                                :initial-contents
                                (list (list (dfref df idx1 idx2))))
                      :doc (doc-string df)
                      :case-labels (pick idx1 (caseNames df))
                      :var-labels (pick idx2 (varNames df))
                      ;; Should the type spec be assumed, as here, or
                      ;; should it be recomputed from the selected
                      ;; value?  For now it is inherited from DF.
                      :var-types (pick idx2 (var-types df)))))))
(defun gen-seq (n &optional (start 1))
  "Return the ascending list of numbers ending at N and not going
below START, in steps of 1.  START defaults to 1.  Returns NIL when N
is less than START.

  (gen-seq 4)   ; => (1 2 3 4)
  (gen-seq 0)   ; => NIL
  (gen-seq 5 3) ; => (3 4 5)"
  ;; Count down from N (exactly what the former recursive definition
  ;; did) and reverse, so the result is identical but is built in
  ;; linear time without deep recursion.
  (nreverse (loop for i downfrom n to start collect i)))
(defun dfref-var (df index &key (return-type :list))
  "Return the data of the single variable (column) INDEX of DF.

RETURN-TYPE selects the representation:
  :list -- a list with one element per case (row), in row order.
  :vector, :vector-like, :dataframe -- not implemented yet; these
           currently just return T.
Any other RETURN-TYPE signals an error (ECASE)."
  (ecase return-type
    (:list
     ;; Walk the rows (first array dimension, 0-based) while holding
     ;; the column fixed; DFREF takes (row, column).
     (loop for row from 0 below (array-dimension (dataset df) 0)
           collect (dfref df row index)))
    (:vector t)
    (:vector-like t)
    (:dataframe t)))
(defun dfref-obsn (df index)
  "Return observation (row/case) INDEX of DF as a list, with one
element per variable (column) in column order."
  ;; Walk the columns (second array dimension, 0-based) while holding
  ;; the row fixed; DFREF takes (row, column).  A list is returned
  ;; because 'SEQUENCE is not an instantiable result type for MAP.
  (loop for col from 0 below (array-dimension (dataset df) 1)
        collect (dfref df index col)))
160 ;; FIXME
(defun dfref-2indexlist (df indexlist1 indexlist2 &key (return-type :array))
  "Extract the sub-table of DF selected by the row indices INDEXLIST1
and the column indices INDEXLIST2 (both lists of 0-based indices).

RETURN-TYPE selects the representation:
  :array     -- a fresh (length INDEXLIST1) x (length INDEXLIST2) array.
  :dataframe -- a fresh DATAFRAME-ARRAY wrapping such an array, with the
                documentation string copied and the case labels,
                variable labels and variable types subset to match.
Any other RETURN-TYPE yields NIL."
  (flet ((selected-array ()
           ;; :initial-contents wants one nested list per row.
           (make-array (list (length indexlist1)
                             (length indexlist2))
                       :initial-contents
                       (mapcar #'(lambda (row)
                                   (mapcar #'(lambda (col)
                                               (dfref df row col))
                                           indexlist2))
                               indexlist1)))
         (subset (items indices)
           ;; Absent metadata (NIL) stays absent.
           (when items
             (mapcar #'(lambda (i) (nth i items)) indices))))
    (case return-type
      (:array (selected-array))
      (:dataframe
       (make-instance 'dataframe-array
                      :storage (selected-array)
                      :doc (doc-string df)
                      :case-labels (subset (case-labels df) indexlist1)
                      :var-labels (subset (var-labels df) indexlist2)
                      :var-types (subset (var-types df) indexlist2))))))
186 ;; Testing consistency/coherency.
(defgeneric consistent-dataframe-like-p (ds)
  (:documentation
   "Predicate protocol: methods check the dataset DS for consistency."))
(defmethod consistent-dataframe-like-p ((ds dataframe-like))
  "Test that dataframe-like DS is internally consistent with its metadata.

Returns true only when:
  1. the number of columns equals the number of variable labels and
     the number of rows equals the number of case labels, and
  2. if variable types were supplied, every stored value is of the
     type recorded for its column.
When no variable types are recorded (VAR-TYPES is NIL) the type check
is skipped.  A column without a recorded type in a non-empty VAR-TYPES
list fails the check, because no value is of type NIL."
  (and
   ;; ensure dimensionality
   (equal (list (ncols ds) (nrows ds))
          (list (length (var-labels ds))
                (length (case-labels ds))))
   ;; when dims sane, check the type of each value, column by column
   (let ((types (var-types ds)))
     (or (null types)
         (loop for col from 0 below (ncols ds)
               always (let ((col-type (nth col types)))
                        (loop for row from 0 below (nrows ds)
                              ;; storage is indexed (row, column)
                              always (typep (aref (dataset ds) row col)
                                            col-type))))))))
(defun ensure-consistent-datatable-type (dt lot)
  "Given a datatable DT (a two-dimensional array) and a list of types
LOT (one type specifier per column), ensure that every value in DT is
of the type given for its column.

Signals a TYPE-ERROR for the first offending value; returns NIL when
all values conform."
  (destructuring-bind (n p)
      (array-dimensions dt)
    (dotimes (i n)
      (dotimes (j p)
        ;; CHECK-TYPE does not evaluate its type argument, so the
        ;; per-column type has to be tested with TYPEP at run time.
        (let ((value (aref dt i j))
              (expected (elt lot j)))
          (unless (typep value expected)
            (error 'type-error :datum value :expected-type expected)))))))
222 ;;; Printing methods and support.
(defun print-as-row (seq)
  "Write the elements of the list SEQ to standard output as one table
row: each element is followed by a ~T tabulation directive.  No
newline is emitted.  Returns NIL."
  (let ((row-control "~{~D~T~}"))
    (format *standard-output* row-control seq)))
228 ;; (print-as-row (list 1 2 3))
(defun print-structure-table (ds)
  "Example of what we want the methods to look like.  Should be sort
of like a spreadsheet if the storage is a table.

Prints the variable labels of DS as a header row, then one line per
case: the case label followed by that case's values.  Output goes to
standard output.  Returns NIL."
  (print-as-row (var-labels ds))
  (terpri)
  (let ((row -1))
    (dolist (label (case-labels ds))
      ;; DFREF-OBSN takes the dataframe itself, not its raw storage.
      (print-as-row (cons label
                          (coerce (dfref-obsn ds (incf row)) 'list)))
      ;; PRINT-AS-ROW emits no newline, so end each row explicitly.
      (terpri))))
(defun print-structure-relational (ds)
  "Example of what we want the methods to look like.  Should be sort
of like a graph of spreadsheets if the storage is a relational
structure.

For every relation K in (relations ds), fetches the related dataset
with (getRelation ds k) and prints it as a table: a header row of
variable labels, then one line per case.  Output goes to standard
output.  Returns NIL.
NOTE(review): RELATIONS and GETRELATION are not defined in this part
of the file -- confirm they exist before relying on this function."
  (dolist (k (relations ds))
    (let ((currentRelationSet (getRelation ds k)))
      (print-as-row (var-labels currentRelationSet))
      (terpri)
      (let ((row -1))
        (dolist (label (case-labels currentRelationSet))
          ;; DFREF-OBSN takes the dataframe itself, not its raw storage.
          (print-as-row
           (cons label
                 (coerce (dfref-obsn currentRelationSet (incf row))
                         'list)))
          ;; PRINT-AS-ROW emits no newline, so end each row explicitly.
          (terpri))))))
(defun row-order-as-list (ary)
  "Pull out the data of the two-dimensional array ARY in row order
into a fresh flat list: all of row 0, then all of row 1, and so on."
  (let ((nrows (array-dimension ary 0))
        (ncols (array-dimension ary 1)))
    ;; Rows are the outer loop, so the column index varies fastest.
    (loop for i from 0 below nrows
          nconc (loop for j from 0 below ncols
                      collect (aref ary i j)))))
(defun col-order-as-list (ary)
  "Pull out the data of the two-dimensional array ARY in column order
into a fresh flat list: all of column 0, then all of column 1, and so
on."
  (let ((nrows (array-dimension ary 0))
        (ncols (array-dimension ary 1)))
    ;; Columns are the outer loop, so the row index varies fastest.
    (loop for j from 0 below ncols
          nconc (loop for i from 0 below nrows
                      collect (aref ary i j)))))
(defun transpose-array (ary)
  "Map the NxM two-dimensional array ARY to a fresh MxN array whose
element (j, i) is element (i, j) of ARY.  ARY is not modified."
  (destructuring-bind (nrows ncols)
      (array-dimensions ary)
    (let ((result (make-array (list ncols nrows))))
      ;; Fill the result element by element; a flat list cannot serve
      ;; as :initial-contents of a two-dimensional array.
      (dotimes (i nrows result)
        (dotimes (j ncols)
          (setf (aref result j i) (aref ary i j)))))))
281 ;;; Variable-name handling for Tables. Needs error checking.
(defun varNames (ds)
  "Return the variable (column) labels of DS, as read through its
VAR-LABELS accessor."
  (let ((labels-of-variables (var-labels ds)))
    labels-of-variables))
(defun set-varNames (ds vN)
  "Replace the variable labels of DS with VN and return VN.
Signals an error when VN does not have the same length as the labels
currently stored in DS."
  (let ((current-count (length (var-labels ds)))
        (new-count (length vN)))
    (when (/= current-count new-count)
      (error "wrong size."))
    (setf (var-labels ds) vN)))

;; Make (setf (varNames ds) new-names) expand to SET-VARNAMES.
(defsetf varNames set-varNames)
293 ;;; Case-name handling for Tables. Needs error checking.
(defun caseNames (ds)
  "Return the case (row) labels of DS, as read through its
CASE-LABELS accessor."
  (let ((labels-of-cases (case-labels ds)))
    labels-of-cases))
(defun set-caseNames (ds vN)
  "Replace the case labels of DS with VN and return VN.
Signals an error when VN does not have the same length as the labels
currently stored in DS."
  (let ((current-count (length (case-labels ds)))
        (new-count (length vN)))
    (when (/= current-count new-count)
      (error "wrong size."))
    (setf (case-labels ds) vN)))

;; Make (setf (caseNames ds) new-names) expand to SET-CASENAMES.
(defsetf caseNames set-caseNames)
305 ;;;;;;;;;;;; IMPLEMENTATIONS, with appropriate methods.
307 ;; (documentation 'dataframe-like 'type)
(defclass dataframe-array (dataframe-like)
  (;; The storage slot: initialised through :STORAGE and read or
   ;; written through the DATASET accessor.
   (store :documentation "Data storage: typed as array."
          :accessor dataset
          :initarg :storage
          :initform nil
          :type (array * *)))
  (:documentation "Example implementation of dataframe-like whose
   storage is a lisp array.  An obvious alternative could be a
   dataframe-matrix-like which uses the lisp-matrix classes."))
(defmethod nrows ((df dataframe-array))
  "Number of rows: the size of the first dimension of DF's storage array."
  (let ((storage (dataset df)))
    (array-dimension storage 0)))
(defmethod ncols ((df dataframe-array))
  "Number of columns: the size of the second dimension of DF's storage array."
  (let ((storage (dataset df)))
    (array-dimension storage 1)))
325 ;;; NEED TO FIGURE OUT HOW TO EXTEND THE MATRIX-LIKE CLASS PRINT
326 ;;; METHOD!
(defmethod print-object ((object dataframe-array) stream)
  "Print OBJECT unreadably on STREAM: the type and the rows x columns
size, then a tab-separated line of variable labels, then one line per
case holding the case label, a colon, and the tab-separated values."
  (print-unreadable-object (object stream :type t)
    (format stream " ~d x ~d" (nrows object) (ncols object))
    (terpri stream)
    ;; Header line: one tab-prefixed variable label per column.
    (loop for col from 0 below (ncols object)
          do (write-char #\tab stream)
             (format stream "~A~T" (nth col (var-labels object))))
    ;; Body: a new line per case, label first, then each value.
    (loop for row from 0 below (nrows object)
          do (terpri stream)
             (format stream "~A:~T" (nth row (case-labels object)))
             (loop for col from 0 below (ncols object)
                   do (write-char #\tab stream)
                      (write (dfref object row col) :stream stream)))))