setting up dataframe-like and dataframe-array classes and structure.
[CommonLispStat.git] / src / data / data-clos.lisp
bloba6272000638f16574d5dc7c34108043a90025496
1 ;;; -*- mode: lisp -*-
3 ;;; Time-stamp: <2009-03-16 20:20:07 tony>
4 ;;; Creation: <2008-03-12 17:18:42 blindglobe@gmail.com>
5 ;;; File: data-clos.lisp
6 ;;; Author: AJ Rossini <blindglobe@gmail.com>
7 ;;; Copyright: (c)2008, AJ Rossini. BSD, LLGPL, or GPLv2, depending
8 ;;; on how it arrives.
10 ;;; Purpose: Data packaging and access for Common Lisp Statistics.
11 ;;; This redoes data storage structures in a CLOS based
12 ;;; framework.
13 ;;;
15 ;;; What is this talk of 'release'? Klingons do not make software
16 ;;; 'releases'. Our software 'escapes', leaving a bloody trail of
17 ;;; designers and quality assurance people in its wake.
20 (in-package :lisp-stat-data-clos)
22 ;;; No real basis for work, there is a bit of new-ness and R-ness to
23 ;;; this work. In particular, the notion of relation is key and
24 ;;; integral to the analysis. Tables are related and matched vectors,
25 ;;; for example. "column" vectors are related observations (by
26 ;;; measure/recording) while "row" vectors are related readings (by
27 ;;; case)
28 ;;;
30 ;;; Relational structure -- can we capture a completely unnormalized
31 ;;; data structure to propose possible modeling approaches, and
32 ;;; propose appropriate models and inferential strategies?
33 ;;;
34 ;;; So we want a verb-driven API for data collection construction. We
35 ;;; should encode independence or lack of, as possible.
37 #+nil(progn
38 (def-statschema MyDB
39 :tables (list (list t1 )
40 (list t2 )
41 (list t4 ))
42 :unique-key key
43 :stat-relation '(t1 (:nest-within t2) (:nest-within t3))))
45 ;; Need to figure out typed vectors. We then map a series of typed
46 ;; vectors over to tables where columns are equal typed. In a sense,
47 ;; this is a relation (1-1) of equal-typed arrays. For the most part,
48 ;; this ends up making the R data.frame into a relational building
49 ;; block (considering 1-1 mappings using row ID as a relation).
50 ;; Is this a worthwhile generalization?
52 ;;; verbs vs semantics for DS conversion -- consider the possibility of
53 ;;; how adverbs and verbs relate, where to put which semantically to
54 ;;; allow for general approach.
56 ;;; eg. Kasper's talk on the FUSION collection of parsers.
58 ;;;
59 ;;; Need to consider modification APIs
60 ;;; actions are:
61 ;;; - import
62 ;;; - get/set row names (case names)
63 ;;; - column names (variable names)
64 ;;; - dataset values
65 ;;; - annotation/metadata
66 ;;; - make sure that we do coherency checking in the exported
67 ;;; - functions.
68 ;;; - ...
69 ;;; - reshapeData/reformat/reshapr a reformed version of the dataset (no
70 ;;; additional input).
71 ;;; - either overwriting or not, i.e. with or without copy.
72 ;;; - check consistency of resulting data with metadata and related
73 ;;; data information.
74 ;;; -
(defclass dataframe-like (matrix-like)
  (
   ;; STORE is the storage component.  We ignore this in the -like
   ;; class, as it is the primary differentiator, driving how access
   ;; (getting/setting) is done.  We create methods depending on the
   ;; storage component, which access data as appropriate.

   ;; so: subclass this based on storage type, and ensure that generic
   ;; accessors have the right methods to do the right thing.

   (store :initform nil
          :initarg :storage
          :accessor dataset
          :documentation "Data storage: typed as table, array,
relation, or pointer/reference to such.")

   (documentation-string :initform nil
                         :initarg :doc
                         :accessor doc-string
                         :documentation "uncomputable information
about dataframe-like instance.")

   ;; the rest of this is metadata.  In particular, we should find a
   ;; more flexible, compact way to store this.
   (case-labels :initform nil
                :initarg :case-labels
                :accessor case-labels
                :documentation "labels used for describing cases (doc
metadata), possibly used for merging.")
   (var-labels :initform nil
               :initarg :var-labels
               :accessor var-labels
               :documentation "Variable names.")
   (var-types :initform nil
              :initarg :var-types
              :accessor var-types
              :documentation "variable types to ensure fit"))
  ;; The slot list above is explicitly opened and closed so that the
  ;; class option below is read as a class option, not as a slot.
  (:documentation "Abstract class for standard statistical analysis
dataset for independent data.  Rows are considered to be independent,
matching observations.  Columns are considered to be type-consistent,
matching a variable with a distribution.  Inherits from the
lisp-matrix base matrix-like class.

Slots (all default to NIL):
  STORE                 initarg :storage,     accessor DATASET
  DOCUMENTATION-STRING  initarg :doc,         accessor DOC-STRING
  CASE-LABELS           initarg :case-labels, accessor CASE-LABELS
  VAR-LABELS            initarg :var-labels,  accessor VAR-LABELS
  VAR-TYPES             initarg :var-types,   accessor VAR-TYPES"))
124 ;; dataframe-like is the basic cases by variables framework. Need to
125 ;; embed this within other structures which allow for generalized
126 ;; relations. Goal is to ensure that relations imply and drive the
127 ;; potential for statistical relativeness such as correlation,
128 ;; interference, and similar concepts.
(defclass dataframe-array (dataframe-like)
  ((store :initform nil
          :initarg :storage
          ;; The initform is NIL, so NIL has to be an admissible value:
          ;; storing a value that violates a slot's declared :type has
          ;; undefined consequences.  (array * *) is an array of any
          ;; element type and any dimensions.
          :type (or null (array * *))
          :accessor dataset
          :documentation "Data storage: a Lisp array, or NIL when no
storage has been supplied yet."))
  (:documentation "example implementation of dataframe-like using storage
based on lisp arrays.  Re-declares the STORE slot (initarg :storage,
accessor DATASET) with an array type."))
142 ;; Actions on a statistical data structure.
(defgeneric consistent-dataframe-like-p (ds)
  (:documentation "methods to check for consistency."))

(defmethod consistent-dataframe-like-p ((ds dataframe-like))
  "Test that dataframe-like is internally consistent with metadata.
Returns true when the stored data is an array whose dimensions equal
\(number of var-labels, number of case-labels), in that order.
Returns NIL when the dimensions differ, or when the store is not an
array at all (e.g. the default NIL store), instead of signalling an
error from ARRAY-DIMENSIONS."
  (let ((store (dataset ds)))
    (and (arrayp store)
         (equal (array-dimensions store)
                (list (length (var-labels ds))
                      (length (case-labels ds)))))))
;; FIXME: NEED TO CHECK TYPING AS WELL!
157 ;;; Extraction
;; Generic-function declarations for data extraction.  Only the
;; protocol is declared here; no methods for these three generics
;; appear in this part of the file.

;; ACCESS takes a dataframe-like object and a spec-list.
;; NOTE(review): the format of SPEC-LIST is not defined here -- confirm.
(defgeneric access (dataframe-like spec-list)
  (:documentation "access to array preserving type."))

;; GET-VARIABLE-MATRIX takes a dataframe-like object and a list of
;; variable names.
(defgeneric get-variable-matrix (dataframe-like-object list-of-variable-names)
  (:documentation "retrieves a matrix whose columns are the variable
names in same order specified."))

;; GET-VARIABLE-VECTOR takes a dataframe-like object and a single
;; variable name.  Presumably the one-variable analogue of
;; GET-VARIABLE-MATRIX -- TODO confirm once a method exists.
(defgeneric get-variable-vector (dataframe-like-object variable-name))
(defun extract-1 (sds idx1 idx2)
  "Return the single element of SDS's stored array found at subscripts
\(IDX1, IDX2).  The subscripts are handed to AREF unchanged, so they
are zero-based."
  (let ((store (dataset sds)))
    (aref store idx1 idx2)))
(defun extract-1-as-sds (sds idx1 idx2)
  "Return a new DATAFRAME-ARRAY holding only the element of SDS at
subscripts (IDX1, IDX2), stored in a fresh 1x1 array.

The doc string of SDS is carried over.  The labels are narrowed to the
one variable label at position IDX1 and the one case label at position
IDX2, matching the (var-labels, case-labels) axis order that
CONSISTENT-DATAFRAME-LIKE-P checks.  Labels are looked up with NTH, so
a missing label becomes NIL (assumes the label slots hold lists)."
  (make-instance 'dataframe-array
                 ;; :initial-contents would need a nested list for a
                 ;; 2-D array; a scalar is supplied via :initial-element.
                 :storage (make-array (list 1 1)
                                      :initial-element (extract-1 sds idx1 idx2))
                 :doc (doc-string sds)
                 :case-labels (list (nth idx2 (caseNames sds)))
                 :var-labels (list (nth idx1 (varNames sds)))))
(defun gen-seq (n &optional (start 1))
  "Return the list of integers START, START+1, ..., N (inclusive).
START defaults to 1.  Returns NIL when N is less than START.
Built iteratively, so it runs in linear time and does not recurse."
  (loop for i from start to n
        collect i))
;; (gen-seq 4)
;; => (1 2 3 4)
;; (gen-seq 0)
;; => nil
;; (gen-seq 5 3)
;; => (3 4 5)
(defun extract-col (sds index)
  "Returns data as sequence (a fresh list).
Collects the elements of SDS's stored 2-D array whose first subscript
is INDEX, walking the second subscript from 0 up to the end of the
second axis.  INDEX is zero-based, as it is passed straight to AREF."
  (let ((store (dataset sds)))
    ;; ARRAY-DIMENSION axis numbers and AREF subscripts are both
    ;; zero-based, so iterate 0 .. size-1 over axis 1.
    (loop for k from 0 below (array-dimension store 1)
          collect (aref store index k))))
(defun extract-col-as-sds (sds index)
  "Returns data as SDS, copied.
Builds a new DATAFRAME-ARRAY whose store is a fresh 1xN array holding
the elements of SDS's stored array that have first subscript INDEX
\(zero-based), where N is the size of the second axis.

The doc string is carried over, the case labels are copied with
COPY-SEQ, and the variable labels are narrowed to the single label at
position INDEX (looked up with NTH, so this assumes the var-labels
slot holds a list)."
  (let* ((store (dataset sds))
         (n (array-dimension store 1))
         (column (make-array (list 1 n)
                             :element-type (array-element-type store))))
    ;; Every cell of COLUMN is assigned below, so no initial value is
    ;; needed.
    (dotimes (k n)
      (setf (aref column 0 k) (aref store index k)))
    (make-instance 'dataframe-array
                   :storage column
                   :doc (doc-string sds)
                   :case-labels (copy-seq (caseNames sds))
                   :var-labels (list (nth index (varNames sds))))))
(defun extract-row (sds index)
  "Returns row as sequence (a fresh list).
Collects the elements of SDS's stored 2-D array whose second subscript
is INDEX, walking the first subscript from 0 up to the end of the
first axis.  INDEX is zero-based, as it is passed straight to AREF."
  (let ((store (dataset sds)))
    ;; The varying subscript is the first one, so its bound must come
    ;; from axis 0.
    (loop for k from 0 below (array-dimension store 0)
          collect (aref store k index))))
(defun extract-idx (sds idx1Lst idx2Lst)
  "Return a fresh 2-D array of dimensions
\(length IDX1LST) x (length IDX2LST).  Element (i, j) of the result is
the element of SDS at subscripts (nth i IDX1LST, nth j IDX2LST), read
through EXTRACT-1.  Both arguments must be lists of zero-based
subscripts."
  (make-array (list (length idx1Lst) (length idx2Lst))
              ;; :initial-contents for a 2-D array must be a sequence of
              ;; rows, so build one inner list per first subscript.
              :initial-contents
              (mapcar #'(lambda (x)
                          (mapcar #'(lambda (y) (extract-1 sds x y))
                                  idx2Lst))
                      idx1Lst)))
(defun extract-idx-sds (sds idx1Lst idx2Lst)
  "Return a dataset-encapsulated selection from SDS: a new
DATAFRAME-ARRAY whose store is a fresh
\(length IDX1LST) x (length IDX2LST) array in which element (i, j) is
the element of SDS at subscripts (nth i IDX1LST, nth j IDX2LST).

The doc string is carried over.  Variable labels are narrowed to the
positions in IDX1LST and case labels to the positions in IDX2LST,
matching the (var-labels, case-labels) axis order that
CONSISTENT-DATAFRAME-LIKE-P checks.  Labels are looked up with NTH
\(assumes the label slots hold lists; a missing label becomes NIL)."
  (let ((var-names (varNames sds))
        (case-names (caseNames sds)))
    (make-instance 'dataframe-array
                   :storage (make-array
                             (list (length idx1Lst) (length idx2Lst))
                             ;; One inner list per selected first
                             ;; subscript, as :initial-contents requires
                             ;; for a 2-D array.
                             :initial-contents
                             (mapcar #'(lambda (x)
                                         (mapcar #'(lambda (y)
                                                     (extract-1 sds x y))
                                                 idx2Lst))
                                     idx1Lst))
                   :doc (doc-string sds)
                   :case-labels (mapcar #'(lambda (y) (nth y case-names))
                                        idx2Lst)
                   :var-labels (mapcar #'(lambda (x) (nth x var-names))
                                       idx1Lst))))
;; Generic entry point for extraction, taking a dataset and a second
;; argument WHATANDRANGE.  No methods are defined in this part of the
;; file.
;; NOTE(review): the expected format of WHATANDRANGE is not specified
;; here -- confirm before adding methods.
(defgeneric extract (sds whatAndRange)
  (:documentation "data extraction approach"))
238 ;;; Printing methods and support.
(defun print-as-row (seq)
  "Write the elements of SEQ to standard output as one table row.
Each element is printed with ~D and followed by a ~T tabulation.  SEQ
must be a list, since it is consumed by the ~{ ... ~} iteration
directive.  No newline is written.  Returns NIL."
  (let ((row-control "~{~D~T~}"))
    (format t row-control seq)))
244 ;; (print-as-row (list 1 2 3))
(defun print-structure-table (ds)
  "example of what we want the methods to look like.  Should be sort
of like a spreadsheet if the storage is a table.

Prints the variable labels of DS as a header line, then one line per
case label: the label followed by the values EXTRACT-ROW returns for
that case's zero-based position.  Each row is terminated with a
newline.  Returns NIL."
  (print-as-row (var-labels ds))
  (terpri)
  (loop for label in (case-labels ds)
        for j from 0
        ;; EXTRACT-ROW calls DATASET on its argument itself, so it must
        ;; be given the dataset object, not the raw array.  COERCE keeps
        ;; PRINT-AS-ROW's list requirement satisfied whatever sequence
        ;; type EXTRACT-ROW returns.
        do (print-as-row (cons label (coerce (extract-row ds j) 'list)))
           (terpri)))
(defun print-structure-relational (ds)
  "example of what we want the methods to look like.  Should be sort
of like a graph of spreadsheets if the storage is a relational
structure.

For every K in (RELATIONS DS), fetches (GETRELATION DS K) and prints
it as a table: a header line of variable labels, then one line per
case label followed by the values EXTRACT-ROW returns for that case's
zero-based position.  Each row is terminated with a newline.

NOTE(review): RELATIONS and GETRELATION are not defined in this part
of the file -- confirm they exist before relying on this function."
  (dolist (k (relations ds))
    (let ((currentRelationSet (getRelation ds k)))
      (print-as-row (var-labels currentRelationSet))
      (terpri)
      (loop for label in (case-labels currentRelationSet)
            for j from 0
            ;; EXTRACT-ROW calls DATASET on its argument itself, so it
            ;; must be given the relation object, not its raw array.
            do (print-as-row
                (cons label
                      (coerce (extract-row currentRelationSet j) 'list)))
               (terpri)))))
272 ;;; Shaping for computation
(defgeneric reshapeData (dataform into-form as-copy)
  (:documentation "pulling data into a new form"))

;; Placeholder method for dataframe-like objects: it accepts the three
;; arguments, does nothing with them, and returns NIL.
(defmethod reshapeData ((sds dataframe-like) what into-form)
  (declare (ignorable sds what into-form))
  nil)
(defmethod reshapeData ((ds array) (sp list) copy-p)
  "Array via specList specialization: similar to the common R
approaches to redistribution.

Not implemented yet: the arguments are ignored and NIL is returned."
  (declare (ignorable ds sp copy-p))
  ;; An explicit body form is required after the string above:
  ;; a string that is the only form in the body is the method's
  ;; return value, not its documentation.
  nil)
;; Empty class: no superclasses and no slots are declared.
;; NOTE(review): presumably a marker/base class for the target forms
;; handled by reshapeData -- nothing in this part of the file uses it;
;; confirm intent.
(defclass data-format () ())
(defun row-order-as-list (ary)
  "Pull out data in row order into a list.
Returns a fresh list of every element of the array ARY in row-major
order (last subscript varying fastest).  Returns NIL for an array with
no elements."
  ;; ROW-MAJOR-AREF walks the array in exactly row-major order, so a
  ;; single counter over the total size replaces the nested loops.
  (loop for k from 0 below (array-total-size ary)
        collect (row-major-aref ary k)))
(defun col-order-as-list (ary)
  "Pull out data in column order into a list.
ARY must be a 2-D array.  Returns a fresh list of its elements taken
column by column (first subscript varying fastest).  Returns NIL for
an array with no elements."
  (destructuring-bind (nrows ncols) (array-dimensions ary)
    (loop for j from 0 below ncols
          ;; NCONC is safe here: each inner list is freshly consed.
          nconc (loop for i from 0 below nrows
                      collect (aref ary i j)))))
(defun transpose (ary)
  "map NxM to MxN.
ARY must be a 2-D array.  Returns a fresh array with the dimensions
reversed and the same element type, in which element (j, i) equals
element (i, j) of ARY.  ARY itself is not modified."
  (destructuring-bind (nrows ncols) (array-dimensions ary)
    (let ((result (make-array (list ncols nrows)
                              :element-type (array-element-type ary))))
      ;; Every cell of RESULT is assigned exactly once below.
      (dotimes (i nrows result)
        (dotimes (j ncols)
          (setf (aref result j i) (aref ary i j)))))))
310 ;;; Variable-name handling for Tables. Needs error checking.
(defun varNames (ds)
  "Return the variable labels of DS, as read through VAR-LABELS."
  (var-labels ds))

(defun set-varNames (ds vN)
  "Replace the variable labels of DS with VN and return VN.
VN must have the same length as the labels currently stored;
otherwise an error is signalled and DS is left unchanged."
  (when (/= (length (var-labels ds))
            (length vN))
    (error "wrong size."))
  (setf (var-labels ds) vN))

;; Makes (setf (varNames ds) new-names) expand to a SET-VARNAMES call.
(defsetf varNames set-varNames)
322 ;;; Case-name handling for Tables. Needs error checking.
(defun caseNames (ds)
  "Return the case labels of DS, as read through CASE-LABELS."
  (case-labels ds))

(defun set-caseNames (ds vN)
  "Replace the case labels of DS with VN and return VN.
VN must have the same length as the labels currently stored;
otherwise an error is signalled and DS is left unchanged."
  (when (/= (length (case-labels ds))
            (length vN))
    (error "wrong size."))
  (setf (case-labels ds) vN))

;; Makes (setf (caseNames ds) new-names) expand to a SET-CASENAMES call.
(defsetf caseNames set-caseNames)