better documentation.
[CommonLispStat.git] / src / data / data-clos.lisp
blobdc296d630a1a5424d638b98ba954c88b9c7c9019
1 ;;; -*- mode: lisp -*-
3 ;;; Time-stamp: <2009-03-16 21:03:58 tony>
4 ;;; Creation: <2008-03-12 17:18:42 blindglobe@gmail.com>
5 ;;; File: data-clos.lisp
6 ;;; Author: AJ Rossini <blindglobe@gmail.com>
7 ;;; Copyright: (c)2008, AJ Rossini. BSD, LLGPL, or GPLv2, depending
8 ;;; on how it arrives.
10 ;;; Purpose: Data packaging and access for Common Lisp Statistics.
11 ;;; This redoes data storage structures in a CLOS based
12 ;;; framework.
13 ;;;
15 ;;; What is this talk of 'release'? Klingons do not make software
16 ;;; 'releases'. Our software 'escapes', leaving a bloody trail of
17 ;;; designers and quality assurance people in its wake.
19 (in-package :lisp-stat-data-clos)
21 ;;; No real basis for work, there is a bit of new-ness and R-ness to
22 ;;; this work. In particular, the notion of relation is key and
23 ;;; integral to the analysis. Tables are related and matched vectors,
24 ;;; for example. "column" vectors are related observations (by
25 ;;; measure/recording) while "row" vectors are related readings (by
26 ;;; case)
27 ;;;
29 ;;; Relational structure -- can we capture a completely unnormalized
30 ;;; data structure to propose possible modeling approaches, and
31 ;;; propose appropriate models and inferential strategies?
32 ;;;
33 ;;; So we want a verb-driven API for data collection construction. We
34 ;;; should encode independence or lack of, as possible.
36 ;; Need to figure out typed vectors. We then map a series of typed
37 ;; vectors over to tables where columns are equal typed. In a sense,
38 ;; this is a relation (1-1) of equal-typed arrays. For the most part,
39 ;; this ends up making the R data.frame into a relational building
40 ;; block (considering 1-1 mappings using row ID as a relation).
41 ;; Is this a worthwhile generalization?
43 ;;; verbs vs semantics for DS conversion -- consider the possibility of
44 ;;; how adverbs and verbs relate, where to put which semantically to
45 ;;; allow for general approach.
47 ;;; eg. Kasper's talk on the FUSION collection of parsers.
49 ;;;
50 ;;; Need to consider modification APIs
51 ;;; actions are:
52 ;;; - import
53 ;;; - get/set row names (case names)
54 ;;; - column names (variable names)
55 ;;; - dataset values
56 ;;; - annotation/metadata
57 ;;; - make sure that we do coherency checking in the exported
58 ;;; - functions.
59 ;;; - ...
60 ;;; - reshapeData/reformat/reshapr a reformed version of the dataset (no
61 ;;; additional input).
62 ;;; - either overwriting or not, i.e. with or without copy.
63 ;;; - check consistency of resulting data with metadata and related
64 ;;; data information.
65 ;;; -
(defclass dataframe-like (matrix-like)
  (
   ;; STORE is the storage component.  We ignore this in the -like
   ;; class, as it is the primary differentiator, driving how access
   ;; (getting/setting) is done.  We create methods depending on the
   ;; storage component, which access data as appropriate.
   ;;
   ;; So: subclass this based on storage type, and ensure that generic
   ;; accessors have the right methods to do the right thing.
   (store :initform nil
          :initarg :storage
          :accessor dataset
          :documentation "Data storage: typed as table, array,
relation, or pointer/reference to such.")

   (documentation-string :initform nil
                         :initarg :doc
                         :accessor doc-string
                         :documentation "uncomputable information about
dataframe-like instance.")

   ;; The rest of this is metadata.  In particular, we should find a
   ;; more flexible, compact way to store this.
   (case-labels :initform nil
                :initarg :case-labels
                :accessor case-labels
                :documentation "labels used for describing cases (doc
metadata), possibly used for merging.")
   (var-labels :initform nil
               :initarg :var-labels
               :accessor var-labels
               :documentation "Variable names.")
   (var-types :initform nil
              :initarg :var-types
              :accessor var-types
              :documentation "variable types to ensure fit"))
  (:documentation "Abstract class for standard statistical analysis
dataset for independent data.  Rows are considered to be independent,
matching observations.  Columns are considered to be type-consistent,
match a variable with distribution.  Inherits from lisp-matrix base
matrix-like class.

dataframe-like is the basic cases by variables framework.  Need to
embed this within other structures which allow for generalized
relations.  Goal is to ensure that relations imply and drive the
potential for statistical relativeness such as correlation,
interference, and similar concepts."))
123 ;; (documentation 'dataframe-like 'type)
(defclass dataframe-array (dataframe-like)
  ((store :initform nil
          :initarg :storage
          ;; The default initform is NIL, so NIL has to be an admissible
          ;; value; storing a value that violates a slot's :type has
          ;; undefined consequences.
          :type (or null (array * *))
          :accessor dataset
          :documentation "Data storage: typed as array (or NIL when no
storage has been supplied)."))
  (:documentation "example implementation of dataframe-like using storage
based on lisp arrays."))
;; Example (kept as a comment so that loading this file does not run it):
;; create an empty array-backed dataframe.  MAKE-INSTANCE is the standard
;; CLOS constructor; MAKE-NEW is not defined anywhere in this file.
;; (make-instance 'dataframe-array)
139 ;; Actions on a statistical data structure.
(defgeneric consistent-dataframe-like-p (ds)
  (:documentation "Predicate protocol: true when DS is internally
consistent."))

(defmethod consistent-dataframe-like-p ((ds dataframe-like))
  "Return true when the dimensions of the stored data agree with the
label metadata of DS.

The dimension list of the storage (as reported by ARRAY-DIMENSIONS) must
be EQUAL to the two-element list
  (number-of-var-labels number-of-case-labels).
Only sizes are compared; variable types are not checked.

NOTE(review): this pairs axis 0 with the variable labels and axis 1 with
the case labels -- confirm that this is the intended storage layout."
  (let ((stored-dims (array-dimensions (dataset ds)))
        (label-dims (list (length (var-labels ds))
                          (length (case-labels ds)))))
    (equal stored-dims label-dims)))
151 ;; FIXME: NEED TO CHECK TYPING AS WELL!
154 ;;; Extraction
;; Generic function taking a dataframe-like object and a SPEC-LIST.
;; Only the protocol is declared here; no methods are defined in the
;; visible part of this file.
(defgeneric access (dataframe-like spec-list)
  (:documentation "access to array preserving type."))

;; Generic function taking a dataframe-like object and a list of variable
;; names.  Declaration only; no methods are defined in the visible part
;; of this file.
(defgeneric get-variable-matrix (dataframe-like-object list-of-variable-names)
  (:documentation "retrieves a matrix whose columns are the variable
names in same order specified."))

;; Generic function taking a dataframe-like object and a single variable
;; name.  Declaration only; it carries no documentation option and no
;; methods are defined in the visible part of this file.
;; NOTE(review): presumably the single-variable counterpart of
;; GET-VARIABLE-MATRIX -- confirm the intended return type.
(defgeneric get-variable-vector (dataframe-like-object variable-name))
(defun extract-1 (sds idx1 idx2)
  "Return the single element stored in SDS at subscripts (IDX1, IDX2).
The subscripts are handed to AREF unchanged, so they are zero-based."
  (let ((store (dataset sds)))
    (aref store idx1 idx2)))
(defun extract-1-as-sds (sds idx1 idx2)
  "Return a new DATAFRAME-ARRAY holding only the element of SDS found at
subscripts (IDX1, IDX2), stored as a 1x1 array.

The documentation string of SDS is carried over.  When SDS has labels,
only the label matching each subscript is kept, so the result has one
variable label and one case label.

NOTE(review): IDX1 is matched against the variable labels and IDX2
against the case labels, following the axis pairing used by
CONSISTENT-DATAFRAME-LIKE-P -- confirm."
  (flet ((pick-label (names idx)
           ;; Label slots default to NIL; only subset when a label exists
           ;; at position IDX.
           (when (< -1 idx (length names))
             (list (elt names idx)))))
    (make-instance 'dataframe-array
                   ;; A scalar must be supplied via :INITIAL-ELEMENT;
                   ;; :INITIAL-CONTENTS would require the nested list ((x)).
                   :storage (make-array (list 1 1)
                                        :initial-element
                                        (extract-1 sds idx1 idx2))
                   :doc (doc-string sds)
                   :case-labels (pick-label (caseNames sds) idx2)
                   :var-labels (pick-label (varNames sds) idx1))))
(defun gen-seq (n &optional (start 1))
  "Return the ascending list START, START+1, ..., N, or NIL when N < START.
START defaults to 1.

The list is built from N downwards by consing onto the front, which takes
linear time and constant stack space."
  (do ((i n (- i 1))
       ;; DO steps its variables in parallel, so this conses the value I
       ;; held *before* it was decremented.
       (acc '() (cons i acc)))
      ((< i start) acc)))
;; (gen-seq 4)
;; => (1 2 3 4)
;; (gen-seq 0)
;; => nil
;; (gen-seq 5 3)
;; => (3 4 5)
(defun extract-col (sds index)
  "Return, as a list, every element of SDS whose first subscript is INDEX.

INDEX is zero-based.  The second subscript runs over the whole of axis 1
of the storage array, i.e. the result is
  ((aref store INDEX 0) (aref store INDEX 1) ...).

NOTE(review): whether a fixed first subscript selects a variable or a
case depends on the storage layout -- confirm."
  (let ((store (dataset sds)))
    ;; The varying subscript is the second AREF argument, so its bound is
    ;; the size of axis 1; AREF subscripts start at 0.
    (loop for k from 0 below (array-dimension store 1)
          collect (extract-1 sds index k))))
(defun extract-col-as-sds (sds index)
  "Return a new DATAFRAME-ARRAY holding a copy of the elements of SDS
whose first subscript is INDEX (zero-based).

The copy is stored as a 1xN array, N being the size of axis 1 of the
original storage.  The documentation string is carried over, the case
labels are copied, and only the variable label at position INDEX is kept.

NOTE(review): axis 0 is matched with the variable labels and axis 1 with
the case labels, following CONSISTENT-DATAFRAME-LIKE-P -- confirm."
  (let* ((store (dataset sds))
         (n (array-dimension store 1))
         (column (make-array (list 1 n)))
         (var-names (varNames sds)))
    ;; Copy element by element so the result shares no storage with SDS.
    (dotimes (k n)
      (setf (aref column 0 k) (aref store index k)))
    (make-instance 'dataframe-array
                   :storage column
                   :doc (doc-string sds)
                   :case-labels (copy-seq (caseNames sds))
                   ;; Label slots default to NIL; only subset when a label
                   ;; exists at position INDEX.
                   :var-labels (when (< -1 index (length var-names))
                                 (list (elt var-names index))))))
(defun extract-row (sds index)
  "Return, as a list, every element of SDS whose second subscript is INDEX.

INDEX is zero-based.  The first subscript runs over the whole of axis 0
of the storage array, i.e. the result is
  ((aref store 0 INDEX) (aref store 1 INDEX) ...).

NOTE(review): whether a fixed second subscript selects a case or a
variable depends on the storage layout -- confirm."
  (let ((store (dataset sds)))
    ;; The varying subscript is the first AREF argument, so its bound is
    ;; the size of axis 0; AREF subscripts start at 0.
    (loop for k from 0 below (array-dimension store 0)
          collect (extract-1 sds k index))))
(defun extract-idx (sds idx1Lst idx2Lst)
  "Return a fresh array of dimensions (length IDX1LST) x (length IDX2LST).

Element (i, j) of the result is the element of SDS at subscripts
 ((nth i IDX1LST), (nth j IDX2LST)).  Both arguments are lists of
zero-based subscripts; an empty list yields an array with a zero-sized
axis."
  (let ((result (make-array (list (length idx1Lst) (length idx2Lst)))))
    ;; Fill the result in place: APPEND is non-destructive, so
    ;; accumulating with it without storing the result collects nothing.
    (loop for x in idx1Lst
          for i from 0
          do (loop for y in idx2Lst
                   for j from 0
                   do (setf (aref result i j) (extract-1 sds x y))))
    result))
(defun extract-idx-sds (sds idx1Lst idx2Lst)
  "Return a new DATAFRAME-ARRAY holding the sub-array of SDS selected by
the subscript lists IDX1LST (axis 0) and IDX2LST (axis 1).

Element (i, j) of the new storage is the element of SDS at subscripts
 ((nth i IDX1LST), (nth j IDX2LST)).  The documentation string is carried
over, and when SDS has labels they are subset to the selected positions.

NOTE(review): IDX1LST is matched against the variable labels and IDX2LST
against the case labels, following CONSISTENT-DATAFRAME-LIKE-P --
confirm."
  (flet ((pick-labels (names idxs)
           ;; Label slots default to NIL; only subset when labels exist.
           (when (plusp (length names))
             (mapcar #'(lambda (i) (elt names i)) idxs))))
    (let ((sub (make-array (list (length idx1Lst) (length idx2Lst)))))
      ;; Copy just the selected cells; the full store cannot serve as
      ;; :INITIAL-CONTENTS for a smaller array.
      (loop for x in idx1Lst
            for i from 0
            do (loop for y in idx2Lst
                     for j from 0
                     do (setf (aref sub i j) (extract-1 sds x y))))
      (make-instance 'dataframe-array
                     :storage sub
                     :doc (doc-string sds)
                     :case-labels (pick-labels (caseNames sds) idx2Lst)
                     :var-labels (pick-labels (varNames sds) idx1Lst)))))
;; Generic function taking a dataset SDS and a WHATANDRANGE argument.
;; Declaration only; no methods are defined in the visible part of this
;; file.
;; NOTE(review): presumably meant as the general front end to the
;; EXTRACT-* functions above -- confirm the shape of WHATANDRANGE.
(defgeneric extract (sds whatAndRange)
  (:documentation "data extraction approach"))
235 ;;; Printing methods and support.
(defun print-as-row (seq)
  "Print the elements of the list SEQ to standard output as one table row.

Each element is printed with ~D and followed by a ~T tabulation; the row
is ended with a newline so that consecutive rows land on separate lines.
Returns NIL."
  (format t "~{~D~T~}~%" seq))
241 ;; (print-as-row (list 1 2 3))
(defun print-structure-table (ds)
  "Print DS to standard output roughly like a spreadsheet.

The variable labels are printed first as a header row.  Then, for the
k-th case label (k counted from 0), one row is printed consisting of
that label followed by the elements returned by (extract-row DS k).
Returns NIL."
  (print-as-row (var-labels ds))
  (loop for label in (case-labels ds)
        for row from 0
        ;; EXTRACT-ROW takes the dataframe object itself (it calls
        ;; DATASET internally), not the raw storage array.
        do (print-as-row (cons label
                               (coerce (extract-row ds row) 'list)))))
(defun print-structure-relational (ds)
  "Print DS to standard output as a series of tables, one per relation.

For every K in (relations DS), the dataframe returned by
 (getRelation DS K) is printed: first its variable labels as a header
row, then one row per case label consisting of that label followed by
the elements returned by EXTRACT-ROW for that position (counted from 0).
Returns NIL.

NOTE(review): RELATIONS and GETRELATION are not defined in the visible
part of this file -- confirm where they come from."
  (dolist (k (relations ds))
    (let ((current-relation-set (getRelation ds k)))
      (print-as-row (var-labels current-relation-set))
      (loop for label in (case-labels current-relation-set)
            for row from 0
            ;; EXTRACT-ROW takes the dataframe object itself (it calls
            ;; DATASET internally), not the raw storage array.
            do (print-as-row
                (cons label
                      (coerce (extract-row current-relation-set row)
                              'list)))))))
269 ;;; Shaping for computation
;; Generic function for reshaping data; takes three required arguments.
;; Both methods below are unimplemented stubs.
(defgeneric reshapeData (dataform into-form as-copy)
  (:documentation "pulling data into a new form"))

;; Stub method specialised on DATAFRAME-LIKE in the first argument.  The
;; body is empty, so a call returns NIL.  The parameter names differ from
;; those of the generic function (WHAT and INTO-FORM occupy the second
;; and third positions).
(defmethod reshapeData ((sds dataframe-like) what into-form))

;; Stub method specialised on (ARRAY, LIST).  The string below is the
;; only form in the body, so it is the value the method returns rather
;; than a documentation string.
(defmethod reshapeData ((ds array) (sp list) copy-p)
  "Array via specList specialization: similar to the common R
approaches to redistribution.")
;; Empty class: no superclasses and no slots.  Nothing in the visible part
;; of this file subclasses or instantiates it.
;; NOTE(review): presumably a placeholder for describing target data
;; layouts (cf. the INTO-FORM argument of reshapeData) -- confirm.
(defclass data-format () ())
(defun row-order-as-list (ary)
  "Return the elements of the two-dimensional array ARY as a fresh list
in row-major order: all of row 0 from left to right, then row 1, and so
on.  An array with a zero-sized axis yields NIL."
  (destructuring-bind (nrows ncols) (array-dimensions ary)
    ;; The inner lists are freshly consed by COLLECT, so splicing them
    ;; together with NCONC is safe.
    (loop for i from 0 below nrows
          nconc (loop for j from 0 below ncols
                      collect (aref ary i j)))))
(defun col-order-as-list (ary)
  "Return the elements of the two-dimensional array ARY as a fresh list
in column-major order: all of column 0 from top to bottom, then column 1,
and so on.  An array with a zero-sized axis yields NIL."
  (destructuring-bind (nrows ncols) (array-dimensions ary)
    ;; Outer loop over columns, inner loop over rows gives column order.
    ;; The inner lists are freshly consed, so NCONC is safe.
    (loop for j from 0 below ncols
          nconc (loop for i from 0 below nrows
                      collect (aref ary i j)))))
(defun transpose (ary)
  "Return a fresh MxN array that is the transpose of the NxM
two-dimensional array ARY: element (j, i) of the result is element
 (i, j) of ARY.  ARY itself is not modified."
  (destructuring-bind (nrows ncols) (array-dimensions ary)
    (let ((result (make-array (list ncols nrows))))
      ;; Fill cell by cell; a flat element list is not valid
      ;; :INITIAL-CONTENTS for a rank-2 array.
      (dotimes (i nrows result)
        (dotimes (j ncols)
          (setf (aref result j i) (aref ary i j)))))))
306 ;;; Variable-name handling for Tables. Needs error checking.
(defun varNames (ds)
  "Return the variable labels of DS (reads its VAR-LABELS)."
  (var-labels ds))

(defun set-varNames (ds vN)
  "Replace the variable labels of DS with VN and return VN.
Signals an error unless VN has the same length as the labels currently
stored in DS."
  (let ((current-count (length (var-labels ds)))
        (new-count (length vN)))
    (unless (= current-count new-count)
      (error "wrong size."))
    (setf (var-labels ds) vN)))

;; Makes (setf (varNames ds) new-names) expand to (set-varNames ds new-names).
(defsetf varNames set-varNames)
318 ;;; Case-name handling for Tables. Needs error checking.
(defun caseNames (ds)
  "Return the case labels of DS (reads its CASE-LABELS)."
  (case-labels ds))

(defun set-caseNames (ds vN)
  "Replace the case labels of DS with VN and return VN.
Signals an error unless VN has the same length as the labels currently
stored in DS."
  (let ((current-count (length (case-labels ds)))
        (new-count (length vN)))
    (unless (= current-count new-count)
      (error "wrong size."))
    (setf (case-labels ds) vN)))

;; Makes (setf (caseNames ds) new-names) expand to (set-caseNames ds new-names).
(defsetf caseNames set-caseNames)