more framework for data structures -- but nothing which works!
[CommonLispStat.git] / src / data / data-clos.lisp
blob251903c76edd0bfdb4d06dfb294577157f736c2a
1 ;;; -*- mode: lisp -*-
3 ;;; File: data-clos.lisp
4 ;;; Author: AJ Rossini <blindglobe@gmail.com>
5 ;;; Copyright: (c)2008, AJ Rossini. BSD, LLGPL, or GPLv2, depending
6 ;;; on how it arrives.
7 ;;; Purpose: data package for lispstat
8 ;;; Time-stamp: <2008-09-02 18:40:53 tony>
9 ;;; Creation: <2008-03-12 17:18:42 user>
11 ;;; What is this talk of 'release'? Klingons do not make software
12 ;;; 'releases'. Our software 'escapes', leaving a bloody trail of
13 ;;; designers and quality assurance people in its wake.
15 ;;; This organization and structure is new to the 21st Century
16 ;;; version.
18 ;;; data-clos.lisp
19 ;;;
20 ;;; redoing data structures in a CLOS based framework.
21 ;;;
22 ;;; No real basis for work, there is a bit of new-ness and R-ness to
23 ;;; this work. In particular, the notion of relation is key and
24 ;;; integral to the analysis. Tables are related and matched
25 ;;; vectors, for example. "column" vectors are related observations
26 ;;; (by measure/recording) while "row" vectors are related readings
27 ;;; (by case)
28 ;;;
30 ;;; Relational structure -- can we capture a completely unnormalized
31 ;;; data structure to propose possible modeling approaches, and
32 ;;; propose appropriate models and inferential strategies?
33 ;;;
35 ;; verb-driven schema for data collection. Should encode independence
36 ;; or lack of when possible.
;; Design sketch only: DEF-STATSCHEMA is not defined in the visible part
;; of this file, so the whole form is hidden from the reader.  #+(or) is
;; used rather than #+nil because (or) is false in every implementation.
;; NOTE(review): T3 is named in :stat-relation but :tables lists T4 --
;; confirm which table was intended.
#+(or)
(progn
  (def-statschema MyDB
      :tables (list (list t1)
                    (list t2)
                    (list t4))
      :unique-key key
      :stat-relation '(t1 (:nest-within t2) (:nest-within t3))))
(in-package :cl-user)

;; Package holding the CLOS-based data structures.  Uninterned symbol
;; designators (#:name) are used so that reading this form does not
;; intern the exported names into the current package.
(defpackage :lisp-stat-data-clos
  (:use :common-lisp
        ;;:clem
        )
  (:export #:statistical-dataset ;; primary class for working.

           #:modifyData  ;; metadata mods
           #:importData  ;; get it in
           #:reshapeData ;; data mods

           #:consistent-statistical-dataset-p
           #:varNames #:caseNames ;; metadata explicit modifiers

           #:extract
           ;; and later, we remove the following, exposing only
           ;; through the above method.
           #:extract-1 #:extract-row #:extract-col #:extract-idx))

(in-package :lisp-stat-data-clos)
72 ;; Need to figure out typed vectors. We then map a series of typed
73 ;; vectors over to tables where columns are equal typed. In a sense,
74 ;; this is a relation (1-1) of equal-typed arrays. For the most part,
75 ;; this ends up making the R data.frame into a relational building
76 ;; block (considering 1-1 mappings using row ID as a relation).
77 ;; Is this a worthwhile generalization?
(defclass statistical-dataset-metadata (rdf-type)
  ()
  (:documentation "Placeholder class for metadata describing a
statistical dataset.  It declares no slots of its own.
NOTE(review): the superclass RDF-TYPE is not defined in the visible
part of this file; it must be defined before instances can be made."))
(defclass statistical-dataset ()
  ((store
    :accessor dataset
    :initarg :storage
    :initform nil
    :documentation "Data storage: typed as table, array, relation, or
pointer/reference to such.")
   (documentation-string
    :accessor doc-string
    :initarg :doc
    :initform nil
    :documentation "Uncomputable information about this
statistical-dataset instance.")

   ;; Everything below is metadata.  In particular, we should find a
   ;; more flexible, compact way to store it.
   (case-labels
    :accessor case-labels
    :initarg :case-labels
    :initform nil
    :documentation "Labels used for describing cases (doc metadata),
possibly used for merging.")
   (var-labels
    :accessor var-labels
    :initarg :var-labels
    :initform nil
    :documentation "Variable names."))
  (:documentation "Standard Cases by Variables Statistical-Dataset,
i.e. an S data.frame."))
113 ;; statistical-dataset is the basic cases by variables framework.
114 ;; Need to embed this within other structures which allow for
115 ;; generalized relations. Goal is to ensure that relations imply and
116 ;; drive the potential for statistical relativeness such as
117 ;; correlation, interference, and similar concepts.
119 ;; Actions on a statistical data structure.
(defgeneric consistent-statistical-dataset-p (ds)
  (:documentation "Predicate protocol: methods check DS for consistency."))

(defmethod consistent-statistical-dataset-p ((ds statistical-dataset))
  "Return true when DS is internally consistent with its metadata.
The dimensions of the stored array must equal the list
(number-of-variable-labels number-of-case-labels), in that order."
  (let* ((actual-dims (array-dimensions (dataset ds)))
         (n-vars (length (var-labels ds)))
         (n-cases (length (case-labels ds))))
    (equal actual-dims (list n-vars n-cases))))
132 ;;; Extraction
(defun extract-1 (sds idx1 idx2)
  "Return the single element stored in SDS at subscripts (IDX1, IDX2).
The subscripts are passed straight to AREF on the stored array."
  (let ((storage (dataset sds)))
    (aref storage idx1 idx2)))
(defun extract-1-as-sds (sds idx1 idx2)
  "Return a new statistical-dataset holding only the element of SDS at
subscripts (IDX1, IDX2).

The result stores a fresh 1x1 array.  Its variable label is the one at
position IDX1 and its case label the one at position IDX2, matching the
(variables cases) dimension order checked by
CONSISTENT-STATISTICAL-DATASET-P.  A label sequence too short to contain
the index yields no label.  The documentation string is shared with SDS."
  (flet ((label-at (names idx)
           ;; Unlabelled datasets keep an empty label list.
           (when (< idx (length names))
             (list (elt names idx)))))
    (make-instance 'statistical-dataset
                   ;; :initial-element, not :initial-contents: the value
                   ;; is a scalar, not a nested sequence.
                   :storage (make-array (list 1 1)
                                        :initial-element
                                        (extract-1 sds idx1 idx2))
                   :doc (doc-string sds)
                   :case-labels (label-at (caseNames sds) idx2)
                   :var-labels (label-at (varNames sds) idx1))))
(defun gen-seq (n &optional (start 1))
  "Return the list of integers START, START+1, ..., N (inclusive).
START defaults to 1.  Returns NIL when N is less than START."
  ;; Iterative collection: linear time and constant stack depth.
  (loop for i from start to n
        collect i))
;; (gen-seq 4)
;; => (1 2 3 4)
;; (gen-seq 0)
;; => nil
;; (gen-seq 5 3)
;; => (3 4 5)
(defun extract-col (sds index)
  "Return, as a fresh list, the elements of SDS's stored array whose
first subscript is INDEX, in order of increasing second subscript.
INDEX is zero-based, as for AREF."
  (let ((data (dataset sds)))
    ;; Walk the whole second dimension, starting from subscript 0.
    (loop for j from 0 below (array-dimension data 1)
          collect (aref data index j))))
(defun extract-col-as-sds (sds index)
  "Return a new statistical-dataset holding a copy of the elements of
SDS whose first subscript is INDEX (zero-based).

The result stores a fresh 1xN array, where N is the size of the second
dimension of SDS's stored array.  Its variable labels are reduced to
the label at position INDEX (none if SDS has no such label), its case
labels are a copy of those of SDS, and the documentation string is
shared with SDS."
  (let* ((data (dataset sds))
         (n (array-dimension data 1))
         (storage (make-array (list 1 n)))
         (names (varNames sds)))
    (dotimes (j n)
      (setf (aref storage 0 j) (aref data index j)))
    (make-instance 'statistical-dataset
                   :storage storage
                   :doc (doc-string sds)
                   :case-labels (copy-seq (caseNames sds))
                   :var-labels (when (< index (length names))
                                 (list (elt names index))))))
(defun extract-row (sds index)
  "Return, as a fresh list, the elements of SDS's stored array whose
second subscript is INDEX, in order of increasing first subscript.
INDEX is zero-based, as for AREF."
  (let ((data (dataset sds)))
    ;; The varying subscript is the first one, so its bound comes from
    ;; dimension 0.
    (loop for i from 0 below (array-dimension data 0)
          collect (aref data i index))))
(defun extract-idx (sds idx1Lst idx2Lst)
  "Return a fresh two-dimensional array of the elements of SDS selected
by the subscript lists IDX1LST (first subscript) and IDX2LST (second
subscript).  The result has dimensions
((length IDX1LST) (length IDX2LST)); element (i, j) is the element of
SDS at the i-th entry of IDX1LST and the j-th entry of IDX2LST."
  (make-array (list (length idx1Lst) (length idx2Lst))
              ;; :initial-contents wants one inner list per first
              ;; subscript, each holding the values for every second
              ;; subscript.
              :initial-contents
              (loop for x in idx1Lst
                    collect (loop for y in idx2Lst
                                  collect (extract-1 sds x y)))))
(defun extract-idx-sds (sds idx1Lst idx2Lst)
  "Return a new statistical-dataset holding the elements of SDS selected
by the subscript lists IDX1LST (first subscript) and IDX2LST (second
subscript).

The stored array is freshly built with dimensions
((length IDX1LST) (length IDX2LST)).  Variable labels are picked by
IDX1LST and case labels by IDX2LST, matching the (variables cases)
dimension order checked by CONSISTENT-STATISTICAL-DATASET-P; an empty
label sequence stays empty.  The documentation string is shared with
SDS."
  (flet ((pick (names indices)
           ;; Leave unlabelled datasets unlabelled.
           (when (plusp (length names))
             (loop for i in indices
                   collect (elt names i)))))
    (make-instance 'statistical-dataset
                   :storage (make-array
                             (list (length idx1Lst) (length idx2Lst))
                             :initial-contents
                             (loop for x in idx1Lst
                                   collect (loop for y in idx2Lst
                                                 collect (extract-1 sds x y))))
                   :doc (doc-string sds)
                   :case-labels (pick (caseNames sds) idx2Lst)
                   :var-labels (pick (varNames sds) idx1Lst))))
(defgeneric extract (sds whatAndRange)
  (:documentation
   "Data extraction protocol: pull from SDS whatever WHATANDRANGE
describes.  No methods are defined in the visible part of this file."))
203 ;;; Printing methods and support.
(defun print-as-row (seq)
  "Write the elements of the list SEQ to standard output, formatted as
one row of a table: each element is printed and followed by a
tabulation directive.  No newline is written.  Returns NIL."
  (let ((row-control "~{~D~T~}"))
    (format t row-control seq)))

;; Example: (print-as-row (list 1 2 3))
(defun print-structure-table (ds)
  "Print DS to standard output in a spreadsheet-like layout, as an
example of what the printing methods should look like when the storage
is a table.

The variable labels are printed first with PRINT-AS-ROW.  Then, for each
case label in turn, the label is printed followed by the data returned
by EXTRACT-ROW for that case's zero-based position.  Returns NIL."
  (print-as-row (var-labels ds))
  (loop for case-label in (case-labels ds)
        for j from 0
        ;; EXTRACT-ROW takes the dataset object itself, not its raw
        ;; storage array.
        do (print-as-row (cons case-label
                               (coerce (extract-row ds j) 'list)))))
(defun print-structure-relational (ds)
  "Print DS to standard output as a series of spreadsheet-like tables,
as an example of what the printing methods should look like when the
storage is a relational structure.

For every key returned by (RELATIONS DS), the relation obtained with
(GETRELATION DS key) is printed: first its variable labels, then, for
each case label, the label followed by the data returned by EXTRACT-ROW
for that case's zero-based position.
NOTE(review): RELATIONS and GETRELATION are not defined in the visible
part of this file."
  (dolist (k (relations ds))
    (let ((relation-set (getRelation ds k)))
      (print-as-row (var-labels relation-set))
      (loop for case-label in (case-labels relation-set)
            for j from 0
            ;; EXTRACT-ROW takes the dataset object itself, not its raw
            ;; storage array.
            do (print-as-row
                (cons case-label
                      (coerce (extract-row relation-set j) 'list)))))))
237 ;;; Shaping for computation
(defgeneric reshapeData (dataform into-form as-copy)
  (:documentation "Pull the data in DATAFORM into a new form described
by INTO-FORM.  AS-COPY is the third required argument; the visible
methods do not use it yet."))

(defmethod reshapeData ((dataform statistical-dataset) into-form as-copy)
  "Placeholder: reshaping a statistical-dataset is not implemented yet.
Accepts any INTO-FORM and AS-COPY and returns NIL."
  (declare (ignorable dataform into-form as-copy))
  nil)

(defmethod reshapeData ((ds array) (sp list) copy-p)
  "Array via specList specialization: similar to the common R
approaches to redistribution.
Placeholder: not implemented yet, returns NIL."
  ;; The declaration makes the string above a real docstring instead of
  ;; the method's return value.
  (declare (ignorable ds sp copy-p))
  nil)
(defclass data-format ()
  ()
  (:documentation "Class with no superclasses and no slots."))
(defun row-order-as-list (ary)
  "Return the elements of the two-dimensional array ARY as a fresh list
in row order: every element of row 0 from left to right, then every
element of row 1, and so on."
  (destructuring-bind (nrows ncols) (array-dimensions ary)
    (loop for i from 0 below nrows
          nconc (loop for j from 0 below ncols
                      collect (aref ary i j)))))
(defun col-order-as-list (ary)
  "Return the elements of the two-dimensional array ARY as a fresh list
in column order: every element of column 0 from top to bottom, then
every element of column 1, and so on."
  (destructuring-bind (nrows ncols) (array-dimensions ary)
    (loop for j from 0 below ncols
          nconc (loop for i from 0 below nrows
                      collect (aref ary i j)))))
(defun transpose (ary)
  "Map the NxM array ARY to a fresh MxN array whose element (j, i) is
element (i, j) of ARY.  The result has the same element type as ARY;
ARY itself is not modified."
  (destructuring-bind (nrows ncols) (array-dimensions ary)
    (let ((result (make-array (list ncols nrows)
                              :element-type (array-element-type ary))))
      ;; Fill element by element; a flat list cannot serve as
      ;; :initial-contents for a two-dimensional array.
      (dotimes (i nrows result)
        (dotimes (j ncols)
          (setf (aref result j i) (aref ary i j)))))))
274 ;;; verbs vs semantics for dt conversion -- consider the possibility of
275 ;;; how adverbs and verbs relate, where to put which semantically to
276 ;;; allow for general approach.
278 ;;; eg. Kasper's talk on the FUSION collection of parsers.
285 ;;; Need to consider modification APIs
286 ;;; actions are:
287 ;;; - import
288 ;;; - get/set row names (case names)
289 ;;; - column names (variable names)
290 ;;; - dataset values
291 ;;; - annotation/metadata
292 ;;; - make sure that we do coherency checking in the exported
293 ;;; - functions.
294 ;;; - ...
295 ;;; - reshapeData/reformat/reshape a reformed version of the dataset (no
296 ;;; additional input).
297 ;;; - either overwriting or not, i.e. with or without copy.
298 ;;; - check consistency of resulting data with metadata and related
299 ;;; data information.
300 ;;; -
302 ;;; Variable-name handling for Tables. Needs error checking.
(defun varNames (ds)
  "Return the variable labels of DS."
  (var-labels ds))

(defun set-varNames (ds vN)
  "Replace the variable labels of DS with VN and return VN.
Signals an error unless VN has the same length as the current labels."
  (unless (= (length (var-labels ds))
             (length vN))
    (error "wrong size."))
  (setf (var-labels ds) vN))

;; Makes (setf (varNames ds) new-names) call SET-VARNAMES.
(defsetf varNames set-varNames)
314 ;;; Case-name handling for Tables. Needs error checking.
(defun caseNames (ds)
  "Return the case labels of DS."
  (case-labels ds))

(defun set-caseNames (ds vN)
  "Replace the case labels of DS with VN and return VN.
Signals an error unless VN has the same length as the current labels."
  (unless (= (length (case-labels ds))
             (length vN))
    (error "wrong size."))
  (setf (case-labels ds) vN))

;; Makes (setf (caseNames ds) new-names) call SET-CASENAMES.
(defsetf caseNames set-caseNames)
326 ;;; General modification approaches.
(defgeneric importData (source featureList)
  (:documentation "Command to get data into CLS.  Specific methods
will need to handle pathnames, internal data structures, and external
services such as DBMS's.  We would like to be able to do things like:

  (importData MyPathName '(:formattype 'csvString))
  (importData '(sqlConnection :server host.domain.net :port 666)
              '(:formattype 'table))

and so on."))
(defun pathname-example (name)
  "Parse the namestring NAME and return two values: the name component
of the resulting pathname in :COMMON case, then in :LOCAL case."
  (let ((parsed (parse-namestring name)))
    (values-list
     (mapcar (lambda (case-mode)
               (pathname-name parsed :case case-mode))
             '(:common :local)))))
(defvar sourceTypes
  (list 'csv 'lisp 'tsv 'special)
  "List of possible symbols.

These are used to specify source formats that might be supported for
input.  CSV and TSV are standard, LISP refers to forms, and SPECIAL
refers to a FUNCTION which parses as appropriate.")
351 ;;; WRONG LOGIC.
(defmethod importData ((fileHandle pathname)
                       (fmt list)) ;sourceTypes))
  "File-based input for data.

FMT is a property list.  Its :FORMAT entry names the source format:
CSV, TSV, LISP or SPECIAL.  The name is compared by symbol name,
ignoring case, so the symbol may come from any package.  For SPECIAL
the :SPECIAL-PARSER entry is read as well.

Usually used by:
  (importData (parse-namestring 'path/to/file')
              (list :format 'csv))

  (importData myPathName (list :format 'lisp))

Signals an error when :FORMAT names none of the known formats.
NOTE(review): the per-format conversion is still unwritten, so every
recognised format currently returns NIL after calling getDataAsLists,
which is not defined in the visible part of this file."
  (let* ((fmtType (getf fmt :format))
         (newData (getDataAsLists fileHandle fmtType)))
    (declare (ignorable newData))
    (flet ((format-is (name)
             (and (symbolp fmtType)
                  (string-equal (symbol-name fmtType) name))))
      (cond ((format-is "CSV") nil)     ; TODO: build dataset from NEWDATA
            ((format-is "TSV") nil)     ; TODO: build dataset from NEWDATA
            ((format-is "LISP") nil)    ; TODO: build dataset from NEWDATA
            ((format-is "SPECIAL")
             (let ((parserFcn (getf fmt :special-parser)))
               ;; TODO: apply PARSERFCN to the input.
               (declare (ignorable parserFcn))
               nil))
            (t (error "no standard default importData format"))))))
(defmethod importData ((ds array) (fmt list))
  "Mapping arrays into CLS data.
Placeholder: not implemented yet, returns NIL."
  ;; The declaration makes the string above a real docstring instead of
  ;; the method's return value.
  (declare (ignorable ds fmt))
  nil)
(defmethod importData ((dsSpec DBMSandSQLextract)
                       (fmt mappingTypes))
  "Mapping DBMS into CLS data.
Placeholder: not implemented yet, returns NIL.
NOTE(review): the classes DBMSANDSQLEXTRACT and MAPPINGTYPES are not
defined in the visible part of this file."
  ;; The declaration makes the string above a real docstring instead of
  ;; the method's return value.
  (declare (ignorable dsSpec fmt))
  nil)
379 ;;(defmacro with-dataframe (env &rest progn)
380 ;; "Compute using variable names with with.data.frame type semantics.")