3 ;;; Time-stamp: <2009-03-16 20:20:07 tony>
4 ;;; Creation: <2008-03-12 17:18:42 blindglobe@gmail.com>
5 ;;; File: data-clos.lisp
6 ;;; Author: AJ Rossini <blindglobe@gmail.com>
7 ;;; Copyright: (c)2008, AJ Rossini. BSD, LLGPL, or GPLv2, depending
10 ;;; Purpose: Data packaging and access for Common Lisp Statistics.
11 ;;; This redoes data storage structures in a CLOS based
15 ;;; What is this talk of 'release'? Klingons do not make software
16 ;;; 'releases'. Our software 'escapes', leaving a bloody trail of
17 ;;; designers and quality assurance people in its wake.
;; Everything defined in this file lives in the LISP-STAT-DATA-CLOS package.
(in-package :lisp-stat-data-clos)
22 ;;; No real basis for work, there is a bit of new-ness and R-ness to
23 ;;; this work. In particular, the notion of relation is key and
24 ;;; integral to the analysis. Tables are related and matched vectors,
25 ;;; for example. "column" vectors are related observations (by
26 ;;; measure/recording) while "row" vectors are related readings (by
;;; Relational structure -- can we capture a completely unnormalized
;;; data structure to propose possible modeling approaches, and
;;; propose appropriate models and inferential strategies?
34 ;;; So we want a verb-driven API for data collection construction. We
35 ;;; should encode independence or lack of, as possible.
39 :tables
(list (list t1
)
43 :stat-relation
'(t1 (:nest-within t2
) (:nest-within t3
))))
45 ;; Need to figure out typed vectors. We then map a series of typed
46 ;; vectors over to tables where columns are equal typed. In a sense,
47 ;; this is a relation (1-1) of equal-typed arrays. For the most part,
48 ;; this ends up making the R data.frame into a relational building
49 ;; block (considering 1-1 mappings using row ID as a relation).
50 ;; Is this a worthwhile generalization?
;;; verbs vs semantics for DS conversion -- consider the possibility of
;;; how adverbs and verbs relate, where to put which semantically to
;;; allow for general approach.
56 ;;; eg. Kasper's talk on the FUSION collection of parsers.
59 ;;; Need to consider modification APIs
62 ;;; - get/set row names (case names)
63 ;;; - column names (variable names)
65 ;;; - annotation/metadata
66 ;;; - make sure that we do coherency checking in the exported
69 ;;; - reshapeData/reformat/reshapr a reformed version of the dataset (no
70 ;;; additional input).
71 ;;; - either overwriting or not, i.e. with or without copy.
72 ;;; - check consistency of resulting data with metadata and related
;; DATAFRAME-LIKE: abstract cases-by-variables container derived from
;; MATRIX-LIKE.  It carries the descriptive metadata (documentation
;; string, case labels, variable labels, variable types); how the data
;; are actually stored is left to subclasses, as the STORE comment
;; below explains.
76 (defclass dataframe-like
(matrix-like)
79 ;; STORE is the storage component. We ignore this in the -like ;
80 ;; class, as it is the primary differentiator, driving how access
81 ;; (getting/setting) is done. We create methods depending on the
82 ;; storage component, which access data as appropriate.
84 ;; so: subclass this based on storage type, and ensure that generic
85 ;; accessors have the right methods to do the right thing.
90 :documentation
"Data storage: typed as table, array,
91 relation, or pointer/reference to such.")
93 (documentation-string :initform nil
96 :documentation
"uncomputable information
100 ;; the rest of this is metadata. In particular, we should find a
101 ;; more flexible, compact way to store this.
102 (case-labels :initform nil
103 :initarg
:case-labels
104 :accessor case-labels
105 :documentation
"labels used for describing cases (doc
106 metadata), possibly used for merging.")
107 (var-labels :initform nil
110 :documentation
"Variable names.")
111 (var-types :initform nil
114 :documentation
"variable types to ensure fit"
116 (:documentation
"Abstract class for standard statistical analysis
117 dataset for independent data. Rows are considered
118 to be independent, matching observations. Columns
119 are considered to be type-consistent, match a
120 variable with distribution. inherits from
121 lisp-matrix base matrix-like class. "))
124 ;; dataframe-like is the basic cases by variables framework. Need to
125 ;; embed this within other structures which allow for generalized
126 ;; relations. Goal is to ensure that relations imply and drive the
127 ;; potential for statistical relativeness such as correlation,
128 ;; interference, and similar concepts.
(defclass dataframe-array (dataframe-like)
  ((store
    :documentation "Data storage: a table, array, relation, or a
pointer/reference to one."
    :initform nil))
  (:documentation
   "Example implementation of DATAFRAME-LIKE whose storage is based on
Lisp arrays."))
142 ;; Actions on a statistical data structure.
(defgeneric consistent-dataframe-like-p (ds)
  (:documentation
   "Consistency check for the dataset DS; the methods carry out the
actual test."))
(defmethod consistent-dataframe-like-p ((ds dataframe-like))
  "Return true when DS's metadata agrees with its stored data, i.e. when
the dimensions of (dataset ds) equal the list
(number-of-variable-labels number-of-case-labels), in that order."
  (let* ((stored-dims (array-dimensions (dataset ds)))
         (label-dims (list (length (var-labels ds))
                           (length (case-labels ds)))))
    (equal stored-dims label-dims)))
154 ;; FIXME: NEED TO CHECK TYPING AS WELL!
(defgeneric access (dataframe-like spec-list)
  (:documentation
   "Access to the array, preserving type.  Takes a DATAFRAME-LIKE
object and a SPEC-LIST."))
(defgeneric get-variable-matrix (dataframe-like-object list-of-variable-names)
  (:documentation
   "Retrieve a matrix whose columns are the variables named in
LIST-OF-VARIABLE-NAMES, in the same order as specified there."))
;; No documentation was supplied for this generic.  Judging by its name
;; and by GET-VARIABLE-MATRIX, it presumably returns the data of the
;; single variable VARIABLE-NAME as a vector -- TODO confirm once
;; methods exist.
(defgeneric get-variable-vector (dataframe-like-object variable-name))
(defun extract-1 (sds idx1 idx2)
  "Return the single element found at subscripts IDX1, IDX2 of the array
obtained from (dataset SDS)."
  (let ((storage (dataset sds)))
    (aref storage idx1 idx2)))
;; Builds a DATAFRAME-ARRAY instance around the single value returned by
;; EXTRACT-1 for subscripts IDX1, IDX2.
;; NOTE(review): the documentation string and the case/variable labels
;; are taken from SDS unchanged, so they still describe the whole of SDS
;; rather than the one selected cell -- confirm this is intended.
172 (defun extract-1-as-sds (sds idx1 idx2
)
173 "Need a version which returns a dataset."
174 (make-instance 'dataframe-array
177 :initial-contents
(extract-1 sds idx1 idx2
))
178 ;; ensure copy for this and following
179 :doc
(doc-string sds
)
180 :case-labels
(caseNames sds
)
181 :var-labels
(varNames sds
)))
;; Recursive sequence builder: the list for N is the list for N-1 with
;; (list n) appended; START (default 1) is handed on to the recursive
;; call unchanged.
;; NOTE(review): APPEND copies the accumulated list on every step, so
;; the work grows quadratically with N.  Something like
;; (loop for i from start to n collect i) would presumably give the
;; same list in one pass -- confirm the intended bounds first.
183 (defun gen-seq (n &optional
(start 1))
184 "There has to be a better way -- I'm sure of it! Always count from 1."
186 (append (gen-seq (- n
1) start
) (list n
))))
;; Pairs a function calling (extract-1 sds index x) with a GEN-SEQ index
;; sequence: INDEX is the first array subscript, X the second.
;; NOTE(review): NTH is zero-based, so (nth 2 ...) applied to the
;; dimension list of a rank-2 array yields NIL; (nth 1 ...) was
;; presumably intended -- confirm.
;; NOTE(review): GEN-SEQ counts from 1 by default whereas AREF
;; subscripts start at 0 -- check for an off-by-one.
195 (defun extract-col (sds index
)
196 "Returns data as sequence."
198 #'(lambda (x) (extract-1 sds index x
))
199 (gen-seq (nth 2 (array-dimensions (dataset sds
))))))
;; Same visible body as EXTRACT-COL: a function calling
;; (extract-1 sds index x) paired with a GEN-SEQ index sequence.
;; NOTE(review): nothing visible here wraps the result in a dataset
;; object or copies it, although the docstring promises an SDS copy --
;; confirm.
;; NOTE(review): NTH is zero-based, so (nth 2 ...) on the dimension list
;; of a rank-2 array yields NIL; (nth 1 ...) was presumably intended.
201 (defun extract-col-as-sds (sds index
)
202 "Returns data as SDS, copied."
204 #'(lambda (x) (extract-1 sds index x
))
205 (gen-seq (nth 2 (array-dimensions (dataset sds
))))))
;; Pairs a function calling (extract-1 sds x index) with a GEN-SEQ index
;; sequence: X is the first array subscript, INDEX the second.
;; NOTE(review): X is used as the FIRST subscript, yet its range comes
;; from (nth 1 ...), which is the extent of the SECOND dimension because
;; NTH is zero-based; (nth 0 ...) was presumably intended -- confirm.
;; NOTE(review): GEN-SEQ counts from 1 by default whereas AREF
;; subscripts start at 0 -- check for an off-by-one.
207 (defun extract-row (sds index
)
208 "Returns row as sequence."
210 #'(lambda (x) (extract-1 sds x index
))
211 (gen-seq (nth 1 (array-dimensions (dataset sds
))))))
;; Meant to gather the elements selected by IDX1LST x IDX2LST into a new
;; array of dimensions (length idx1Lst) by (length idx2Lst).
;; NOTE(review): APPEND returns a new list and leaves MY-PRE-ARRAY
;; untouched; its value is not stored in the code shown, so
;; MY-PRE-ARRAY appears to still be the empty list when it reaches
;; :initial-contents -- confirm.
;; NOTE(review): for a rank-2 array :initial-contents must be a sequence
;; of row sequences, not one flat list of elements.
213 (defun extract-idx (sds idx1Lst idx2Lst
)
214 "return an array, row X col dims. FIXME TESTME"
215 (let ((my-pre-array (list)))
218 (append my-pre-array
(extract-1 sds x y
))))
219 (make-array (list (length idx1Lst
) (length idx2Lst
))
220 :initial-contents my-pre-array
)))
;; Builds a DATAFRAME-ARRAY instance sized (length idx1Lst) by
;; (length idx2Lst), carrying over the doc string and labels of SDS.
;; NOTE(review): the initial contents are the whole of (dataset sds),
;; not the selection computed by EXTRACT-IDX, so they presumably only
;; fit the requested dimensions when the index lists span the full
;; array -- confirm.
;; NOTE(review): the labels are likewise passed through unsubsetted.
223 (defun extract-idx-sds (sds idx1Lst idx2Lst
)
224 "return a dataset encapsulated version of extract-idx."
225 (make-instance 'dataframe-array
227 (list (length idx1Lst
) (length idx2Lst
))
228 :initial-contents
(dataset sds
))
229 ;; ensure copy for this and following
230 :doc
(doc-string sds
)
231 :case-labels
(caseNames sds
)
232 :var-labels
(varNames sds
)))
(defgeneric extract (sds whatAndRange)
  (:documentation
   "Data extraction approach; takes the dataset SDS and a WHATANDRANGE
argument."))
238 ;;; Printing methods and support.
(defun print-as-row (seq)
  "Write the elements of SEQ to standard output formatted as one row of
a table: every element is printed and followed by a tabulation
directive.  SEQ has to be a list, because the iteration directive used
walks a list.  Returns NIL."
  (format *standard-output* "~{~D~T~}" seq))
244 ;; (print-as-row (list 1 2 3))
;; Prints the variable labels of DS as a header row, then walks the case
;; labels and prints, for each one, the label followed by the result of
;; EXTRACT-ROW for the counter J.
;; NOTE(review): EXTRACT-ROW is handed (dataset ds), but EXTRACT-ROW
;; itself applies DATASET to its first argument; passing DS was
;; presumably intended -- confirm.
246 (defun print-structure-table (ds)
247 "example of what we want the methods to look like. Should be sort
248 of like a spreadsheet if the storage is a table."
249 (print-as-row (var-labels ds
))
251 (dolist (i (case-labels ds
))
252 (print-as-row (append (list i
)
253 (extract-row (dataset ds
) (incf j
)))))))
;; For every relation K listed by (relations ds): fetches it with
;; getRelation, prints its variable labels as a header row, then walks
;; its case labels using EXTRACT-ROW.
;; NOTE(review): as in PRINT-STRUCTURE-TABLE, EXTRACT-ROW receives the
;; raw (dataset ...) value although it applies DATASET to its first
;; argument itself -- confirm which object should be passed.
256 (defun print-structure-relational (ds)
257 "example of what we want the methods to look like. Should be sort
258 of like a graph of spreadsheets if the storage is a relational
260 (dolist (k (relations ds
))
261 (let ((currentRelationSet (getRelation ds k
)))
262 (print-as-row (var-labels currentRelationSet
))
264 (dolist (i (case-labels currentRelationSet
))
267 (extract-row (dataset currentRelationSet
)
272 ;;; Shaping for computation
(defgeneric reshapeData (dataform into-form as-copy)
  (:documentation
   "Pull the data held in DATAFORM into a new form."))
(defmethod reshapeData ((sds dataframe-like) what into-form)
  ;; Placeholder method for DATAFRAME-LIKE objects: it does no work and
  ;; returns NIL, exactly like an empty method body.
  (declare (ignorable what into-form))
  nil)
(defmethod reshapeData ((ds array) (sp list) copy-p)
  "Array via specList specialization: similar to the common R
approaches to redistribution.

Not implemented yet: DS, SP and COPY-P are ignored and NIL is returned."
  (declare (ignorable copy-p))
  ;; The explicit NIL below is required.  A string that is the only form
  ;; of a body is evaluated as that body instead of being stored as
  ;; documentation, so without it the method would return its own
  ;; docstring and carry no documentation at all.
  nil)
;; Empty placeholder class: no superclasses and no slots.  Presumably
;; meant to describe target formats for reshaping -- TODO confirm.
(defclass data-format ()
  ())
;; Meant to flatten the rank-2 array ARY into a list; NROWS and NCOLS
;; are the extents of its first and second dimension.
;; NOTE(review): APPEND returns a new list and leaves RESULT untouched;
;; its value is not stored in the code shown, so RESULT may never grow
;; -- confirm.  APPEND with a non-list last argument also yields a
;; dotted list; PUSH plus NREVERSE, or LOOP ... COLLECT, is the usual
;; way to accumulate elements.
285 (defun row-order-as-list (ary)
286 "Pull out data in row order into a list."
287 (let ((result (list))
288 (nrows (nth 0 (array-dimensions ary
)))
289 (ncols (nth 1 (array-dimensions ary
))))
292 (append result
(aref ary i j
))))))
;; Meant to flatten the rank-2 array ARY into a list; NROWS and NCOLS
;; are the extents of its first and second dimension.
;; NOTE(review): the docstring says "row order" although the function
;; is named COL-ORDER-AS-LIST, and the element access (aref ary i j) is
;; the same as in ROW-ORDER-AS-LIST -- confirm which order is produced.
;; NOTE(review): APPEND returns a new list and leaves RESULT untouched;
;; its value is not stored in the code shown, so RESULT may never grow
;; -- confirm.
294 (defun col-order-as-list (ary)
295 "Pull out data in row order into a list."
296 (let ((result (list))
297 (nrows (nth 0 (array-dimensions ary
)))
298 (ncols (nth 1 (array-dimensions ary
))))
301 (append result
(aref ary i j
))))))
(defun transpose (ary)
  "Return a freshly allocated transpose of the two-dimensional array ARY.

An array of dimensions (R C) yields an array of dimensions (C R) whose
element (j, i) is element (i, j) of ARY.  The result has the same
element type as ARY and shares no storage with it.

Signals a TYPE-ERROR (via CHECK-TYPE) when ARY is not a rank-2 array."
  (check-type ary (array * (* *)))
  (destructuring-bind (nrows ncols) (array-dimensions ary)
    (let ((result (make-array (list ncols nrows)
                              :element-type (array-element-type ary))))
      ;; Fill element by element.  MAKE-ARRAY's :initial-contents needs
      ;; nested row sequences for a rank-2 array, so a flat list of
      ;; elements cannot be used to initialise the result.
      (dotimes (i nrows result)
        (dotimes (j ncols)
          (setf (aref result j i) (aref ary i j)))))))
310 ;;; Variable-name handling for Tables. Needs error checking.
;; Update function behind (setf (varNames ds) ...): stores vN as the
;; variable labels of DS after a test involving the length of the
;; current labels, and signals ERROR with "wrong size." otherwise.
314 (defun set-varNames (ds vN
)
315 (if (= (length (var-labels ds
))
317 (setf (var-labels ds
) vN
)
318 (error "wrong size.")))
;; Short-form DEFSETF: (setf (varNames ds) new-names) expands into
;; (set-varNames ds new-names).
(defsetf varNames set-varNames)
322 ;;; Case-name handling for Tables. Needs error checking.
;; caseNames: reader counterpart for the case labels of DS.
323 (defun caseNames (ds)
;; Update function behind (setf (caseNames ds) ...): stores vN as the
;; case labels of DS after a test involving the length of the current
;; labels, and signals ERROR with "wrong size." otherwise.
326 (defun set-caseNames (ds vN
)
327 (if (= (length (case-labels ds
))
329 (setf (case-labels ds
) vN
)
330 (error "wrong size.")))
;; Short-form DEFSETF: (setf (caseNames ds) new-names) expands into
;; (set-caseNames ds new-names).
(defsetf caseNames set-caseNames)