cleanup of dataframe code. Still lots more work to go.
[CommonLispStat.git] / src / data / data-clos.lisp
blob159509f5ee182c5198233ff3a5a7aa31853dcf17
1 ;;; -*- mode: lisp -*-
3 ;;; Time-stamp: <2009-03-21 09:24:56 tony>
4 ;;; Creation: <2008-03-12 17:18:42 blindglobe@gmail.com>
5 ;;; File: data-clos.lisp
6 ;;; Author: AJ Rossini <blindglobe@gmail.com>
7 ;;; Copyright: (c)2008, AJ Rossini. BSD, LLGPL, or GPLv2, depending
8 ;;; on how it arrives.
10 ;;; Purpose: Data packaging and access for Common Lisp Statistics.
11 ;;; This redoes data storage structures in a CLOS based
12 ;;; framework.
13 ;;;
15 ;;; What is this talk of 'release'? Klingons do not make software
16 ;;; 'releases'. Our software 'escapes', leaving a bloody trail of
17 ;;; designers and quality assurance people in its wake.
19 (in-package :lisp-stat-data-clos)
21 ;;; No real basis for work, there is a bit of new-ness and R-ness to
22 ;;; this work. In particular, the notion of relation is key and
23 ;;; integral to the analysis. Tables are related and matched vectors,
24 ;;; for example. "column" vectors are related observations (by
25 ;;; measure/recording) while "row" vectors are related readings (by
26 ;;; case)
27 ;;;
29 ;;; Relational structure -- can we capture a completely unnormalized
30 ;;; data structure to propose possible modeling approaches, and
31 ;;; propose appropriate models and inferential strategies?
32 ;;;
33 ;;; So we want a verb-driven API for data collection construction. We
34 ;;; should encode independence or lack of, as possible.
36 ;; Need to figure out typed vectors. We then map a series of typed
37 ;; vectors over to tables where columns are equal typed. In a sense,
38 ;; this is a relation (1-1) of equal-typed arrays. For the most part,
39 ;; this ends up making the R data.frame into a relational building
40 ;; block (considering 1-1 mappings using row ID as a relation).
41 ;; Is this a worthwhile generalization?
43 ;;; verbs vs semantics for DS conversion -- consider the possibility of
44 ;;; how adverbs and verbs relate, where to put which semantically to
45 ;;; allow for general approach.
47 ;;; eg. Kasper's talk on the FUSION collection of parsers.
49 ;;;
50 ;;; Need to consider modification APIs
51 ;;; actions are:
52 ;;; - import
53 ;;; - get/set row names (case names)
54 ;;; - column names (variable names)
55 ;;; - dataset values
56 ;;; - annotation/metadata
57 ;;; - make sure that we do coherency checking in the exported
58 ;;; - functions.
59 ;;; - ...
60 ;;; - reshapeData/reformat/reshape: a reformed version of the dataset (no
61 ;;; additional input).
62 ;;; - either overwriting or not, i.e. with or without copy.
63 ;;; - check consistency of resulting data with metadata and related
64 ;;; data information.
65 ;;; -
(defclass dataframe-like (matrix-like)
  ;; No storage slot is declared at this level on purpose: how the data
  ;; is held is the primary differentiator between concrete subclasses,
  ;; and it drives how access (getting/setting) is done.  Methods are
  ;; written against the storage component of each subclass; see
  ;; DATAFRAME-ARRAY for an example implementation.
  ;;
  ;; Everything below is metadata.  A more flexible, compact way to
  ;; store it should be found.
  ((case-labels
    :accessor case-labels
    :initarg :case-labels
    :initform nil
    :type list
    :documentation "Labels describing the cases (documentation
    metadata); possibly used for merging.")
   (var-labels
    :accessor var-labels
    :initarg :var-labels
    :initform nil
    :type list
    :documentation "Variable names.")
   (var-types
    :accessor var-types
    :initarg :var-types
    :initform nil
    :type list
    :documentation "Variable types, used to ensure that values fit.")
   (documentation-string
    :accessor doc-string
    :initarg :doc
    :initform nil
    :documentation "Additional, potentially uncomputable, information
    about this dataframe-like instance."))
  (:documentation "Abstract class for the standard statistical analysis
  dataset for independent data.  Rows are considered to be independent,
  matching observations.  Columns are considered to be type-consistent
  and to match a variable with a distribution.  Inherits from the
  lisp-matrix base MATRIX-LIKE class.

  DATAFRAME-LIKE is the basic cases-by-variables framework.  It needs to
  be embedded within other structures which allow for generalized
  relations.  The goal is to ensure that relations imply and drive the
  potential for statistical relativeness such as correlation,
  interference, and similar concepts."))
112 ;;; Access and Extraction
(defun dfref (df idx1 idx2 &key (type :scalar))
  "Return the entry of DF at IDX1 (row/case) and IDX2 (column/variable),
in the same vein as AREF, MREF, VREF, etc.  Indices are passed straight
to AREF on (DATASET DF), so they are zero-based.

TYPE selects the shape of the result:
  :SCALAR     (default) the stored value itself.
  :DATAFRAME  a fresh 1x1 DATAFRAME-ARRAY holding that value, carrying
              the doc string of DF and the matching case label,
              variable label and variable type (each as a one-element
              list, or NIL when DF has no such metadata).
Any other TYPE returns NIL."
  (flet ((pick (idx items)
           ;; Metadata slots are typed LIST, so wrap the selected item.
           (and items (list (nth idx items)))))
    (case type
      (:scalar (aref (dataset df) idx1 idx2))
      (:dataframe
       (make-instance 'dataframe-array
                      :storage (make-array
                                (list 1 1)
                                ;; :INITIAL-CONTENTS of a rank-2 array
                                ;; must be a sequence of rows, so the
                                ;; single value is nested twice.
                                :initial-contents
                                (list (list (dfref df idx1 idx2))))
                      ;; ensure copy for this and following
                      :doc (doc-string df)
                      :case-labels (pick idx1 (caseNames df))
                      :var-labels (pick idx2 (varNames df))
                      ;; Open question kept from the original: should
                      ;; the type spec be assumed, as here, or should it
                      ;; be inherited from the dataframe we are
                      ;; selecting from?
                      :var-types (pick idx2 (var-types df)))))))
(defun gen-seq (n &optional (start 1))
  "Return the ascending list of numbers ending at N, stepping by 1, whose
elements are all >= START.  START defaults to 1.  Returns NIL when
N < START.

  (gen-seq 4)   ; => (1 2 3 4)
  (gen-seq 0)   ; => NIL
  (gen-seq 5 3) ; => (3 4 5)"
  ;; Count down from N and push, so the result comes out ascending
  ;; without any APPEND or recursion (linear time, constant stack).
  (do ((i n (- i 1))
       (result '() (cons i result)))
      ((< i start) result)))
(defun dfref-var (sds index &key (type :list))
  "Return the data of the variable (column) INDEX of SDS, as TYPE.

INDEX is a zero-based column index.  TYPE is one of:
  :LIST         (default) a list with one element per row, in row order.
  :VECTOR, :VECTOR-LIKE, :DATAFRAME
                not implemented yet; these placeholders return T.
Any other TYPE signals an error (ECASE)."
  (ecase type
    (:list
     ;; Walk the rows (dimension 0) of the backing array with zero-based
     ;; indices, holding the column fixed.
     (loop for row below (array-dimension (dataset sds) 0)
           collect (dfref sds row index)))
    (:vector t)
    (:vector-like t)
    (:dataframe t)))
(defun dfref-obsn (sds index)
  "Return the observation (row) INDEX of SDS as a list, one element per
variable, in column order.  INDEX is a zero-based row index."
  ;; DFREF is the scalar accessor defined in this file; the EXTRACT-1
  ;; that used to be called here has no definition in this file.
  ;; A list is built directly because SEQUENCE is not a result type
  ;; that MAP can instantiate.
  (loop for col below (array-dimension (dataset sds) 1)
        collect (dfref sds index col)))
;; FIXME: still needs tests.
(defun extract-idx (sds idx1Lst idx2Lst)
  "Return a fresh array of dimensions (length IDX1LST) x (length IDX2LST)
holding the entries of SDS selected by the row indices in IDX1LST and
the column indices in IDX2LST (both zero-based, in the order given)."
  (make-array (list (length idx1Lst) (length idx2Lst))
              ;; :INITIAL-CONTENTS of a rank-2 array is a list of rows,
              ;; each row being a list of values.
              :initial-contents
              (loop for row in idx1Lst
                    collect (loop for col in idx2Lst
                                  collect (dfref sds row col)))))
(defun extract-idx-sds (sds idx1Lst idx2Lst)
  "Return a fresh DATAFRAME-ARRAY holding the entries of SDS selected by
the row indices in IDX1LST and the column indices in IDX2LST (both
zero-based, in the order given).

The result carries the doc string of SDS, and the case labels, variable
labels and variable types that correspond to the selected rows and
columns (NIL for any metadata SDS does not have)."
  (flet ((pick (items indices)
           ;; Select the metadata entries matching the chosen indices.
           (and items
                (mapcar (lambda (i) (nth i items)) indices))))
    (make-instance 'dataframe-array
                   :storage (make-array
                             (list (length idx1Lst) (length idx2Lst))
                             ;; Contents are built as a list of rows so
                             ;; they match the requested dimensions.
                             :initial-contents
                             (loop for row in idx1Lst
                                   collect (loop for col in idx2Lst
                                                 collect (dfref sds row col))))
                   ;; ensure copy for this and following
                   :doc (doc-string sds)
                   :case-labels (pick (caseNames sds) idx1Lst)
                   :var-labels (pick (varNames sds) idx2Lst)
                   :var-types (pick (var-types sds) idx2Lst))))
(defgeneric extract (sds whatAndRange)
  (:documentation
   "Data extraction approach: generic entry point taking a dataset SDS
and a WHATANDRANGE argument describing what to pull out.  No methods
are defined next to this declaration."))
185 ;; Testing consistency/coherency.
(defgeneric consistent-dataframe-like-p (ds)
  (:documentation
   "Predicate protocol: methods check DS for consistency."))
(defmethod consistent-dataframe-like-p ((ds dataframe-like))
  "Test that DS is internally consistent with its metadata.

Returns T when all of the following hold, NIL otherwise:
  - the number of columns equals the number of variable labels;
  - the number of rows equals the number of case labels;
  - every stored value satisfies the type recorded for its column in
    VAR-TYPES.  A column with no recorded type (VAR-TYPES is NIL, too
    short, or has NIL at that position) is treated as unconstrained.

Values are read with AREF from (DATASET DS), indexed row then column."
  (let ((n-rows (nrows ds))
        (n-cols (ncols ds))
        (types (var-types ds)))
    (and
     ;; ensure dimensionality
     (= n-cols (length (var-labels ds)))
     (= n-rows (length (case-labels ds)))
     ;; when dims are sane, check the type of every value, column by
     ;; column; ALWAYS makes the result depend on the checks.
     (loop for col below n-cols
           always (let ((expected (or (nth col types) t)))
                    (loop for row below n-rows
                          always (typep (aref (dataset ds) row col)
                                        expected)))))))
(defun ensure-consistent-datatable-type (dt lot)
  "Given a datatable DT (a two-dimensional array) and a list of types
LOT, one type specifier per column, ensure that every value in column J
of DT is of type (ELT LOT J).

Returns T when all values conform.  Signals a TYPE-ERROR, carrying the
offending value and the expected type, at the first value that does
not."
  (destructuring-bind (n p)
      (array-dimensions dt)
    (dotimes (i n t)
      (dotimes (j p)
        ;; CHECK-TYPE cannot be used here: it takes a place and a
        ;; literal, unevaluated type specifier, while the expected type
        ;; is only known at run time.
        (let ((value (aref dt i j))
              (expected (elt lot j)))
          (unless (typep value expected)
            (error 'type-error :datum value :expected-type expected)))))))
221 ;;; Printing methods and support.
(defun print-as-row (seq)
  "Write the elements of the list SEQ to standard output as one table
row: each element is printed with ~D and followed by a ~T tabulation.
No newline is written.  Returns NIL."
  (dolist (item seq)
    (format t "~D~T" item)))
227 ;; (print-as-row (list 1 2 3))
(defun print-structure-table (ds)
  "Example of what we want the printing methods to look like: roughly a
spreadsheet, when the storage is a table.

Prints the variable labels of DS as a row, then one row per case label:
the label followed by whatever EXTRACT-ROW returns for the matching
zero-based row of (DATASET DS).  Returns NIL."
  (print-as-row (var-labels ds))
  (loop for case-label in (case-labels ds)
        for row-index from 0
        do (print-as-row (cons case-label
                               (extract-row (dataset ds) row-index)))))
(defun print-structure-relational (ds)
  "Example of what we want the printing methods to look like: roughly a
graph of spreadsheets, when the storage is a relational structure.

For every key in (RELATIONS DS), fetches the relation with GETRELATION
and prints it as a table: its variable labels as a row, then one row per
case label made of the label followed by what EXTRACT-ROW returns for
the matching zero-based row of the relation's dataset.  Returns NIL."
  (dolist (relation-key (relations ds))
    (let ((relation (getRelation ds relation-key)))
      (print-as-row (var-labels relation))
      (loop for case-label in (case-labels relation)
            for row-index from 0
            do (print-as-row
                (cons case-label
                      (extract-row (dataset relation) row-index)))))))
255 ;;; Shaping for computation
(defgeneric reshapeData (dataform into-form as-copy)
  (:documentation
   "Pull the data in DATAFORM into a new form.  Takes three required
arguments: DATAFORM, INTO-FORM and AS-COPY."))
(defmethod reshapeData ((sds dataframe-like) what into-form)
  "Placeholder method for DATAFRAME-LIKE datasets: does nothing and
returns NIL."
  (declare (ignore what into-form))
  nil)
(defmethod reshapeData ((ds array) (sp list) copy-p)
  "Array via specList specialization: similar to the common R
approaches to redistribution.

Not implemented yet; returns NIL."
  (declare (ignore copy-p))
  ;; An explicit body form is required: a string that is the only form
  ;; in the body is the method's return value, not its documentation.
  nil)
(defclass data-format ()
  ()
  (:documentation "Class with no superclasses and no slots."))
(defun row-order-as-list (ary)
  "Pull the data out of the array ARY into a fresh list, in row order
(row-major: all of row 0, then all of row 1, and so on).

  (row-order-as-list #2A((1 2 3) (4 5 6))) ; => (1 2 3 4 5 6)"
  ;; Row-major order is the array's native storage order, so
  ;; ROW-MAJOR-AREF walks it directly.
  (loop for i below (array-total-size ary)
        collect (row-major-aref ary i)))
(defun col-order-as-list (ary)
  "Pull the data out of the two-dimensional array ARY into a fresh list,
in column order (column-major: all of column 0, then all of column 1,
and so on).

  (col-order-as-list #2A((1 2 3) (4 5 6))) ; => (1 4 2 5 3 6)"
  (destructuring-bind (n-rows n-cols) (array-dimensions ary)
    (loop for col below n-cols
          nconc (loop for row below n-rows
                      collect (aref ary row col)))))
(defun transpose (ary)
  "Map the NxM two-dimensional array ARY to a fresh MxN array whose
element (j, i) is element (i, j) of ARY.  ARY is not modified."
  (destructuring-bind (n m) (array-dimensions ary)
    (let ((result (make-array (list m n)
                              :element-type (array-element-type ary))))
      ;; Fill element by element; a flat list cannot serve as
      ;; :INITIAL-CONTENTS for a rank-2 array.
      (dotimes (i n result)
        (dotimes (j m)
          (setf (aref result j i) (aref ary i j)))))))
292 ;;; Variable-name handling for Tables. Needs error checking.
(defun varNames (ds)
  "Return the variable labels of DS, as given by its VAR-LABELS
accessor."
  (var-labels ds))
(defun set-varNames (ds vN)
  "Replace the variable labels of DS with VN and return VN.  Signals an
error unless VN has the same length as the labels currently stored in
DS."
  (when (/= (length (var-labels ds))
            (length vN))
    (error "wrong size."))
  (setf (var-labels ds) vN))

;; Makes (SETF (VARNAMES DS) VN) expand into (SET-VARNAMES DS VN).
(defsetf varNames set-varNames)
304 ;;; Case-name handling for Tables. Needs error checking.
(defun caseNames (ds)
  "Return the case labels of DS, as given by its CASE-LABELS accessor."
  (case-labels ds))
(defun set-caseNames (ds vN)
  "Replace the case labels of DS with VN and return VN.  Signals an
error unless VN has the same length as the labels currently stored in
DS."
  (when (/= (length (case-labels ds))
            (length vN))
    (error "wrong size."))
  (setf (case-labels ds) vN))

;; Makes (SETF (CASENAMES DS) VN) expand into (SET-CASENAMES DS VN).
(defsetf caseNames set-caseNames)
316 ;;;;;;;;;;;; IMPLEMENTATIONS, with appropriate methods.
318 ;; (documentation 'dataframe-like 'type)
(defclass dataframe-array (dataframe-like)
  ((store :initform nil
          :initarg :storage
          ;; NIL is admitted because it is the :INITFORM; a slot value
          ;; that violates the declared :TYPE has undefined
          ;; consequences.
          :type (or null array)
          :accessor dataset
          :documentation "Data storage: a Lisp array, or NIL when no
          storage has been supplied."))
  (:documentation "Example implementation of DATAFRAME-LIKE using
  storage based on Lisp arrays.  An obvious alternative could be a
  dataframe-matrix-like which uses the lisp-matrix classes."))
(defmethod nrows ((df dataframe-array))
  "Number of rows of DF: the size of dimension 0 of its storage array."
  (let ((store (dataset df)))
    (array-dimension store 0)))
(defmethod ncols ((df dataframe-array))
  "Number of columns of DF: the size of dimension 1 of its storage
array."
  (let ((store (dataset df)))
    (array-dimension store 1)))
336 ;;; NEED TO FIGURE OUT HOW TO EXTEND THE MATRIX-LIKE CLASS PRINT
337 ;;; METHOD!
(defmethod print-object ((object dataframe-array) stream)
  "Print OBJECT unreadably to STREAM as a small table: the dimensions
\(rows x columns), a line with the variable labels, then one line per
row consisting of \" obs <case label>\" followed by the row's values."
  (print-unreadable-object (object stream :type t)
    (format stream " ~d x ~d" (nrows object) (ncols object))
    (terpri stream)
    ;; Separate the variable labels so adjacent names do not run
    ;; together.
    (format stream "~{ ~A~}" (var-labels object))
    (dotimes (i (nrows object))
      (terpri stream)
      ;; The case label is written once per row, ahead of the values,
      ;; rather than once per cell.
      (format stream " obs ~A" (nth i (case-labels object)))
      (dotimes (j (ncols object))
        (write-char #\space stream)
        (write (dfref object i j) :stream stream)))))