first pass at extraction.
[CommonLispStat.git] / data-clos.lisp
blob6d90e80b2347858f33284b229927ad95f070e810
1 ;;; -*- mode: lisp -*-
3 ;;; File: data-clos.lisp
4 ;;; Author: AJ Rossini <blindglobe@gmail.com>
5 ;;; Copyright: (c)2008, AJ Rossini. BSD, LLGPL, or GPLv2, depending
6 ;;; on how it arrives.
7 ;;; Purpose: data package for lispstat
8 ;;; Time-stamp: <2008-03-12 17:18:42 user>
9 ;;; Creation: <2008-03-12 17:18:42 user>
11 ;;; What is this talk of 'release'? Klingons do not make software
12 ;;; 'releases'. Our software 'escapes', leaving a bloody trail of
13 ;;; designers and quality assurance people in its wake.
15 ;;; This organization and structure is new to the 21st Century
16 ;;; version.
18 ;;; data-clos.lisp
19 ;;;
20 ;;; redoing data structures in a CLOS based framework.
21 ;;;
22 ;;; No real basis for work, there is a bit of new-ness and R-ness to
23 ;;; this work. In particular, the notion of relation is key and
24 ;;; integral to the analysis. Tables are related and matched
25 ;;; vectors, for example. "column" vectors are related observations
26 ;;; (by measure/recording) while "row" vectors are related readings
27 ;;; (by case)
28 ;;;
30 ;;; Relational structure -- can we capture a completely unnormalized
31 ;;; data structure to propose possible modeling approaches, and
32 ;;; propose appropriate models and inferential strategies?
33 ;;;
35 ;; verb-driven schema for data collection. Should encode independence
36 ;; or lack of when possible.
;; Design sketch only: DEF-STATSCHEMA is not defined anywhere in this
;; file, so the whole form is kept away from the reader.  #+(or) is
;; used instead of #+nil because it can never be enabled by a stray
;; :NIL entry in *FEATURES*.  The dangling `:' token that preceded the
;; closing parentheses has been removed.
;; NOTE(review): :stat-relation mentions T3 while :tables lists T4 --
;; confirm which one was intended.
#+(or)
(progn
  (def-statschema MyDB
      :tables (list (list t1)
                    (list t2)
                    (list t4))
      :unique-key key
      :stat-relation '(t1 (:nest-within t2) (:nest-within t3))))
(in-package :cl-user)

;; Package holding the CLOS-based dataset structures.  The :use and
;; :export clauses are explicitly closed here; in the text as received
;; both were left open, so the form could not be read.
(defpackage :lisp-stat-data-clos
  (:use :common-lisp
        ;;:clem
        )
  (:export statistical-dataset ;; primary class for working.

           modifyData  ;; metadata mods
           importData  ;; get it in
           reshapeData ;; data mods

           consistent-statistical-dataset-p
           varNames caseNames ;; metadata explicit modifiers
           ))

(in-package :lisp-stat-data-clos)
67 ;; Need to figure out typed vectors. We then map a series of typed
68 ;; vectors over to tables where columns are equal typed. In a sense,
69 ;; this is a relation (1-1) of equal-typed arrays. For the most part,
70 ;; this ends up making the R data.frame into a relational building
71 ;; block (considering 1-1 mappings using row ID as a relation).
72 ;; Is this a worthwhile generalization?
;; Cases-by-variables container: one storage slot plus three metadata
;; slots (free-form description, case labels, variable labels).  Every
;; slot defaults to NIL.
(defclass statistical-dataset ()
  ((store
    :accessor dataset
    :initarg :storage
    :initform nil
    :documentation "Data storage slot.  Should be an array or a
relation,")
   (documentation-string
    :accessor doc-string
    :initarg :doc
    :initform nil
    :documentation "Information about statistical-dataset.")
   (case-labels
    :accessor case-labels
    :initarg :case-labels
    :initform nil
    :documentation "labels used for describing cases (doc
metadata), possibly used for merging.")
   (var-labels
    :accessor var-labels
    :initarg :var-labels
    :initform nil
    :documentation "Variable names."))
  (:documentation "Standard Cases by Variables Statistical-Dataset."))
96 ;; statistical-dataset is the basic cases by variables framework.
97 ;; Need to embed this within other structures which allow for
98 ;; generalized relations. Goal is to ensure that relations imply and
99 ;; drive the potential for statistical relativeness such as
100 ;; correlation, interference, and similar concepts.
102 ;; Actions on a statistical data structure.
(defgeneric consistent-statistical-dataset-p (ds)
  (:documentation "methods to check for consistency."))

(defmethod consistent-statistical-dataset-p ((ds statistical-dataset))
  "Test that statistical-dataset is internally consistent with metadata.
Returns true when the store is an array whose dimensions equal
\(number of variable labels, number of case labels), in that order.
Returns NIL otherwise, including when the store is not an array (for
example the default NIL store), instead of signalling a type error
from ARRAY-DIMENSIONS."
  (let ((store (dataset ds)))
    (and (arrayp store)
         (equal (array-dimensions store)
                (list (length (var-labels ds))
                      (length (case-labels ds)))))))
115 ;;; Extraction
(defun extract-1 (sds index1 index2)
  "Return the element of SDS's storage found at subscripts INDEX1, INDEX2."
  (let ((store (dataset sds)))
    (aref store index1 index2)))
(defun extract-column (sds index1)
  "Return, as a list, every element of SDS's storage whose first
subscript is INDEX1, in increasing order of the second subscript.

The earlier body could not run: it mapped over the result of an
undefined DOLOOP operator and asked MAP for the abstract result type
SEQUENCE.  The second subscript now runs over the storage array's
second dimension."
  (loop for index2 from 0 below (array-dimension (dataset sds) 1)
        collect (extract-1 sds index1 index2)))
(defun extract-row (sds row-index1)
  "Return, as a list, every element of SDS's storage whose second
subscript is ROW-INDEX1, in increasing order of the first subscript.

This was an empty stub that always returned NIL.  It is filled in as
the mirror image of EXTRACT-COLUMN (which fixes the first subscript).
NOTE(review): confirm that a `row' is meant to be a slice along the
second array dimension."
  (let ((store (dataset sds)))
    (loop for index1 from 0 below (array-dimension store 0)
          collect (aref store index1 row-index1))))
(defun extract-range (sds rowIdxLst colIdxLst)
  "Placeholder for extracting a rectangular row-by-column selection.
Not implemented yet: none of the arguments is used."
  (declare (ignorable sds rowIdxLst colIdxLst))
  ;; The definition was never closed in the text as received.  A string
  ;; that is the only body form is the function's value rather than
  ;; its docstring, so the same string is still what gets returned.
  "return a rectangular structure of row X col dims.")
;; Generic entry point for extraction; no methods are defined for it
;; in the visible part of this file.
(defgeneric extract (sds whatAndRange)
  (:documentation
   "data extraction approach"))
(defun print-structure-table (ds)
  "example of what we want the methods to look like.  Should be sort
of like a spreadsheet if the storage is a table.

Prints the variable labels with PRINT-AS-ROW, then one line per case
label: the label followed by the matching row of the storage, taken
with ROW-EXTRACT using a running zero-based index.  PRINT-AS-ROW and
ROW-EXTRACT are not defined in the visible part of this file."
  (print-as-row (var-labels ds))
  (let ((j -1))
    (dolist (i (case-labels ds))
      ;; FORMAT takes a destination as its first argument and uses
      ;; tilde directives; the C-style \"%i %v\" control string passed
      ;; as the destination could not work.  INCF replaces INCR, which
      ;; is not a Common Lisp operator.
      (format t "~A ~A~%" i (row-extract (dataset ds) (incf j))))))
(defun print-structure-relational (ds)
  "example of what we want the methods to look like.  Should be sort
of like a graph of spreadsheets if the storage is a relational
structure.

For every element returned by RELATIONS, prints the variable labels
and then one line per case label, exactly as the table printer does.
RELATIONS, PRINT-AS-ROW and ROW-EXTRACT are not defined in the visible
part of this file."
  (dolist (k (relations ds))
    ;; NOTE(review): K is not used yet; each pass prints the same table.
    (declare (ignorable k))
    (print-as-row (var-labels ds))
    (let ((j -1))
      (dolist (i (case-labels ds))
        ;; PRINC accepts one object and an optional stream, so the
        ;; four-argument call could not run; FORMAT does the printing.
        ;; INCF replaces INCR, which is not a Common Lisp operator.
        (format t "~A ~A~%" i (row-extract (dataset ds) (incf j)))))))
(defgeneric reshapeData (dataform into-form as-copy)
  (:documentation "pulling data into a new form"))

(defmethod reshapeData ((ds statistical-dataset) what into-form)
  "Reshape the part of DS named by WHAT into INTO-FORM.
WHAT is read as a slot name of DS.  The previous body called GET,
which only works on symbol property lists and cannot be applied to a
CLOS instance.  RESHAPE is not defined in the visible part of this
file.
NOTE(review): confirm that slot access is what WHAT was meant to
select."
  (reshape (slot-value ds what) into-form))

(defmethod reshapeData ((ds array) (sp list) copy-p)
  "Array via specList specialization: similar to the common R
approaches to redistribution.
Reads :toWide and :primaryKey from the property list SP.  Nothing is
done with them yet, so the method returns NIL."
  (declare (ignorable copy-p))
  ;; The LET was left unclosed in the text as received; it is closed
  ;; here with an explicit NIL result.
  (let ((widep (getf sp :toWide))
        (primaryKey (getf sp :primaryKey)))
    (declare (ignorable widep primaryKey))
    nil))
;; Empty marker class: no superclasses and no slots.
(defclass data-format ()
  ())
(defun transpose (x)
  "map NxM to MxN.
X must be a two-dimensional array.  Returns a fresh array with the same
element type in which element (j, i) is element (i, j) of X.  X itself
is not modified.

Previously the documentation string was the only body form, so the
function just returned that string."
  (destructuring-bind (n m) (array-dimensions x)
    (let ((result (make-array (list m n)
                              :element-type (array-element-type x))))
      (dotimes (i n result)
        (dotimes (j m)
          (setf (aref result j i) (aref x i j)))))))
(defun reorder-by-rank (x order &key (by-row t))
  "Placeholder: reordering by rank is not implemented.
The arguments are accepted but not used."
  (declare (ignorable x order by-row))
  ;; The placeholder string is the function's value, as before.
  " .")
(defun reorder-by-permutation (x perm &key (by-row t))
  "Placeholder: reordering by permutation is not implemented.
The arguments are accepted but not used."
  (declare (ignorable x perm by-row))
  ;; The placeholder string is the function's value, as before.
  " .")
183 ;;; verbs vs semantics for dt conversion -- consider the possibility of
184 ;;; how adverbs and verbs relate, where to put which semantically to
185 ;;; allow for general approach.
187 ;;; eg. Kasper's talk on the FUSION collection of parsers.
201 ;;; Need to consider modification APIs
202 ;;; actions are:
203 ;;; - import
204 ;;; - get/set row names (case names)
205 ;;; - column names (variable names)
206 ;;; - dataset values
207 ;;; - annotation/metadata
208 ;;; - make sure that we do coherency checking in the exported
209 ;;; - functions.
210 ;;; - ...
211 ;;; - reshapeData/reformat/reshape a reformed version of the dataset (no
212 ;;; additional input).
213 ;;; - either overwriting or not, i.e. with or without copy.
214 ;;; - check consistency of resulting data with metadata and related
215 ;;; data information.
216 ;;; -
218 ;;; Variable-name handling for Tables. Needs error checking.
(defun varNames (ds)
  "Return the variable labels stored in DS."
  (var-labels ds))

(defun set-varNames (ds vN)
  "Replace the variable labels of DS with VN and return VN.
Signals an error when VN does not have the same length as the labels
currently stored."
  (unless (= (length (var-labels ds))
             (length vN))
    (error "wrong size."))
  (setf (var-labels ds) vN))

;; Makes (setf (varNames ds) new-names) call SET-VARNAMES.
(defsetf varNames set-varNames)
230 ;;; Case-name handling for Tables. Needs error checking.
(defun caseNames (ds)
  "Return the case labels stored in DS."
  (case-labels ds))

(defun set-caseNames (ds vN)
  "Replace the case labels of DS with VN and return VN.
Signals an error when VN does not have the same length as the labels
currently stored."
  (unless (= (length (case-labels ds))
             (length vN))
    (error "wrong size."))
  (setf (case-labels ds) vN))

;; Makes (setf (caseNames ds) new-names) call SET-CASENAMES.
(defsetf caseNames set-caseNames)
242 ;;; General modification approaches.
;; The documentation string and the form itself were unterminated in
;; the text as received, and whatever followed \"able to do:\" is not
;; available.  The example below is the usage shown in the docstring
;; of the pathname method of this generic function.
(defgeneric importData (source featureList)
  (:documentation "command to get data into CLS.  Specific methods
will need to handle files, internal data structures, and DBMS's.  We
would like to be able to do:

 (importData (parse-namestring \"path/to/file\")
             (list :format 'csv))
"))
(defun pathname-example (name)
  "Parse NAME as a namestring and return two values: the name
component of the resulting pathname in :COMMON case, then the same
component in :LOCAL case."
  (let* ((parsed (parse-namestring name))
         (common-name (pathname-name parsed :case :common))
         (local-name (pathname-name parsed :case :local)))
    (values common-name local-name)))
(defvar sourceTypes
  (list 'csv 'lisp 'tsv 'special)
  "Symbols that may be used to name an input source format.  CSV and
TSV are standard, LISP refers to forms, and SPECIAL refers to a
FUNCTION which parses as appropriately.")
265 ;;; WRONG LOGIC.
(defmethod importData ((fileHandle pathname)
                       (fmt list)) ;sourceTypes))
  "File-based input for data.
Usually used by:
 (importData (parse-namestring 'path/to/file')
             (list :format 'csv))

 (importData myPathName (list :format 'lisp))

FMT is a property list.  :FORMAT selects the parser and must be one of
the symbols CSV, TSV, LISP or SPECIAL; with SPECIAL the parser is read
from :SPECIAL-PARSER.  None of the per-format branches does any work
yet, so a recognised format returns NIL.  Any other :FORMAT value
signals an error.  GETDATAASLISTS is not defined in the visible part
of this file."
  (let* ((fmtType (getf fmt :format))
         (newData (getDataAsLists fileHandle fmtType)))
    (declare (ignorable newData))
    ;; CASE keys are not evaluated.  Writing 'csv as a key builds the
    ;; key list (QUOTE CSV), which also matches the symbol QUOTE, and a
    ;; (:default ...) clause only matches the keyword :DEFAULT.  Bare
    ;; symbols and OTHERWISE give the dispatch that was intended.
    ;; NOTE(review): the keys are compared with EQL, so callers must
    ;; pass symbols from this package.
    (case fmtType
      (csv nil)
      (tsv nil)
      (lisp nil)
      (special (let ((parserFcn (getf fmt :special-parser)))
                 (declare (ignorable parserFcn))
                 nil))
      (otherwise (error "no standard default importData format")))))
(defmethod importData ((ds array) (fmt list))
  "Placeholder for turning an array into CLS data; not implemented."
  (declare (ignorable ds fmt))
  ;; The description string is the method's value, as before.
  "mapping arrays into CLS data.")
;; NOTE(review): the classes DBMSandSQLextract and mappingTypes are not
;; defined in the visible part of this file.
(defmethod importData ((dsSpec DBMSandSQLextract)
                       (fmt mappingTypes))
  "Placeholder for turning a DBMS extract into CLS data; not implemented."
  (declare (ignorable dsSpec fmt))
  ;; The description string is the method's value, as before.
  "mapping DBMS into CLS data.")
294 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
295 ;;; EXPERIMENT
296 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(in-package :cl-user)

;; Load LIFT first if it is not already present; the ASDF search path
;; has to be configured before this will work.
;; (asdf:oos 'asdf:load-op :lift)

;; Scratch package for the experiments below.  It sees only what
;; LISP-STAT-DATA-CLOS exports, plus LIFT and LISP-STAT-UNITTESTS.
(defpackage :lisp-stat-data-clos-example
  (:use :common-lisp
        :lift
        :lisp-stat-unittests
        :lisp-stat-data-clos))

(in-package :lisp-stat-data-clos-example)
312 ;;; Use of this package: To see what gets exported for use in others,
313 ;;; and how much corruption can be done to objects within a package.
;; Test suite for the dataset class, a child of the LISP-STAT suite.
;; The empty slot list after the superclass list is written out here;
;; it was missing from the text as received, which left the (:tests ...)
;; option in the position where the slot list belongs.
;; NOTE(review): ENSURE-TRUE is called with no form to check in
;; INITDATA -- confirm what it was meant to test.
(deftestsuite lisp-stat-dataclos (lisp-stat)
  ()
  (:tests
   (initdata (ensure-true ))))
;; Elements of a #2A literal are not evaluated, so writing 'a inside
;; it stores the list (QUOTE A) rather than the symbol A; the quotes
;; are dropped.  EQL is never true for two distinct arrays, so the
;; comparison uses EQUALP, which compares arrays element by element.
;; NOTE(review): DATASET is the slot accessor of the dataset class and
;; is not exported; the call below, with a list and a :FORM keyword,
;; looks like a constructor that is not defined in the visible part of
;; this file -- confirm the intended function.
(addtest (lisp-stat-dataclos) testnameData
  (ensure-same
   (dataset (list 'a 'b 'c 'd) :form (list 2 2))
   #2A((a b) (c d))
   :test 'equalp))
;; DEFPARAMETER always assigns, which is what the former DEFVAR-to-NIL
;; followed by SETF amounted to.
(defparameter my-ds-1 (make-instance 'statistical-dataset)
  "test ds for experiment.")
my-ds-1
;; A 2 x 5 store with two variable labels and five case labels.
;; DEFPARAMETER always assigns, which is what the former DEFVAR-to-NIL
;; followed by SETF amounted to.
(defparameter my-ds-2
  (make-instance 'statistical-dataset
                 :storage #2A((1 2 3 4 5) (10 20 30 40 50))
                 :doc "This is an interesting statistical-dataset"
                 :case-labels (list "a" "b" "c" "d" "e")
                 :var-labels (list "x" "y"))
  "test ds for experiment.")
my-ds-2

;; Scratch forms kept from interactive exploration.
(make-array (list 3 5))

(array-dimensions (lisp-stat-data-clos::dataset my-ds-2))
;; MY-DS-2 as built above must pass the consistency check.  The form
;; was left unclosed in the text as received; the closing parentheses
;; are restored.
(addtest (lisp-stat-dataclos) consData
  (ensure-true
   (consistent-statistical-dataset-p my-ds-2)))
;; STORE read in this package is not the slot-name symbol of the
;; defining package, so SLOT-VALUE is expected to signal.
(addtest (lisp-stat-dataclos) badAccess1
  (ensure-error
    (slot-value my-ds-2 'store)))

;; Same check as badAccess1.
(addtest (lisp-stat-dataclos) badAccess2
  (ensure-error
    (slot-value my-ds-2 'store)))

;; DATASET is not exported, so the unqualified call is expected to
;; signal.
(addtest (lisp-stat-dataclos) badAccess3
  (ensure-error
    (dataset my-ds-2)))

;; With package-qualified names, the slot and its accessor give the
;; same value.
(addtest (lisp-stat-dataclos) badAccess4
  (ensure-equal
    (slot-value my-ds-2 'lisp-stat-data-clos::store)
    (lisp-stat-data-clos::dataset my-ds-2)))
;; Each test in this group used to be called badAccess5.  Adding a test
;; under a name the suite already has replaces the earlier one, so only
;; the last definition would remain; every test now has its own name.

;; The accessor and the slot hold the very same object.
(addtest (lisp-stat-dataclos) badAccess5
  (ensure-true
   (eq (lisp-stat-data-clos::dataset my-ds-2)
       (slot-value my-ds-2 'lisp-stat-data-clos::store))))

;; NEVER DO THE FOLLOWING, UNLESS YOU WANT TO MUCK UP STRUCTURES...
(addtest (lisp-stat-dataclos) badAccessDocString
  (ensure-true
   (lisp-stat-data-clos::doc-string my-ds-2)))

(addtest (lisp-stat-dataclos) badAccessCaseLabels
  (ensure-true
   (lisp-stat-data-clos::case-labels my-ds-2)))

(addtest (lisp-stat-dataclos) badAccessVarLabels
  (ensure-true
   (lisp-stat-data-clos::var-labels my-ds-2)))
394 ;; need to ensure that for things like the following, that we protect
395 ;; this a bit more so that the results are not going to be wrong.
396 ;; That would be a bit nasty if the statistical-dataset becomes
397 ;; inconsistent.
;; These tests all used to be called badAccess5.  Adding a test under a
;; name the suite already has replaces the earlier one, so each test
;; now has its own name.  They change MY-DS-2 and depend on running in
;; the order written.

(addtest (lisp-stat-dataclos) setVarLabelsSameLength
  (ensure-true
   (setf (lisp-stat-data-clos::var-labels my-ds-2)
         (list "a" "b"))))

;; NOTE(review): the raw slot accessor does no length checking, so no
;; error is signalled here today; the test records the behaviour that
;; is wanted.  The labels are left with three entries afterwards.
(addtest (lisp-stat-dataclos) setVarLabelsWrongLength
  (ensure-error
   (setf (lisp-stat-data-clos::var-labels my-ds-2)
         (list "a" "b" "c")))) ;; Should error!

;; The consistency check returns NIL for the mismatched labels rather
;; than signalling, so the expectation is ENSURE-NULL, not ENSURE-ERROR.
(addtest (lisp-stat-dataclos) inconsistentAfterWrongLength
  (ensure-null
   (consistent-statistical-dataset-p my-ds-2))) ;; Nil

(addtest (lisp-stat-dataclos) restoreVarLabels
  (ensure
   (setf (lisp-stat-data-clos::var-labels my-ds-2)
         (list "a" "b"))))

(addtest (lisp-stat-dataclos) consistentAfterRestore
  (ensure
   (consistent-statistical-dataset-p my-ds-2))) ;; T

;; This is now done by:

(addtest (lisp-stat-dataclos) varNamesSetter
  (ensure-true
   (progn
     (varNames my-ds-2)
     (setf (varNames my-ds-2) (list "a" "b"))
     (varNames my-ds-2))))
432 ;; break this up.
;; Holds the case labels of MY-DS-2 while the test below replaces them.
(defvar origCaseNames nil)

;; Renamed from badAccess5: adding a test under a name the suite
;; already has replaces the earlier test of that name.
;; Saves the case labels, installs five new ones, tries a four-element
;; list (the resulting error is swallowed by IGNORE-ERRORS), then puts
;; the saved labels back.
(addtest (lisp-stat-dataclos) caseNamesRoundTrip
  (ensure
   (progn
     (setf origCaseNames (caseNames my-ds-2))
     (setf (caseNames my-ds-2) (list "a" "b" "c" 4 5))
     (caseNames my-ds-2)
     (ignore-errors (setf (caseNames my-ds-2) (list "a" "b" 4 5)))
     (setf (caseNames my-ds-2) origCaseNames))))
444 ;; (run-tests)
446 ;; (describe (run-tests))