3 ;;; Time-stamp: <2009-03-10 21:36:40 tony>
4 ;;; Creation: <2008-03-12 17:18:42 blindglobe@gmail.com>
5 ;;; File: data-clos.lisp
6 ;;; Author: AJ Rossini <blindglobe@gmail.com>
7 ;;; Copyright: (c)2008, AJ Rossini. BSD, LLGPL, or GPLv2, depending
10 ;;; Purpose: Data packaging and access for Common Lisp Statistics.
11 ;;; This redoes data storage structures in a CLOS based
15 ;;; What is this talk of 'release'? Klingons do not make software
16 ;;; 'releases'. Our software 'escapes', leaving a bloody trail of
17 ;;; designers and quality assurance people in its wake.
;;; Everything read after this form is interned in LISP-STAT-DATA-CLOS.
(in-package :lisp-stat-data-clos)
22 ;;; No real basis for work, there is a bit of new-ness and R-ness to
23 ;;; this work. In particular, the notion of relation is key and
24 ;;; integral to the analysis. Tables are related and matched vectors,
25 ;;; for example. "column" vectors are related observations (by
26 ;;; measure/recording) while "row" vectors are related readings (by
;;; Relational structure -- can we capture a completely unnormalized
;;; data structure to propose possible modeling approaches, and
;;; propose appropriate models and inferential strategies?
34 ;;; So we want a verb-driven API for data collection construction. We
35 ;;; should encode independence or lack of, as possible.
39 :tables
(list (list t1
)
43 :stat-relation
'(t1 (:nest-within t2
) (:nest-within t3
))))
45 ;; Need to figure out typed vectors. We then map a series of typed
46 ;; vectors over to tables where columns are equal typed. In a sense,
47 ;; this is a relation (1-1) of equal-typed arrays. For the most part,
48 ;; this ends up making the R data.frame into a relational building
49 ;; block (considering 1-1 mappings using row ID as a relation).
50 ;; Is this a worthwhile generalization?
;;; verbs vs semantics for DS conversion -- consider the possibility of
;;; how adverbs and verbs relate, where to put which semantically to
;;; allow for general approach.
56 ;;; eg. Kasper's talk on the FUSION collection of parsers.
59 ;;; Need to consider modification APIs
62 ;;; - get/set row names (case names)
63 ;;; - column names (variable names)
65 ;;; - annotation/metadata
66 ;;; - make sure that we do coherency checking in the exported
;;; - reshapeData/reformat/reshape: a reformed version of the dataset (no
;;;   additional input).
71 ;;; - either overwriting or not, i.e. with or without copy.
72 ;;; - check consistency of resulting data with metadata and related
;;; DataFrame-like -- class deriving from MATRIX-LIKE that carries a
;;; dataset together with descriptive metadata.  Slots whose heads are
;;; visible in this form:
;;;   documentation-string  (initform NIL)
;;;   case-labels           (initform NIL, initarg :case-labels,
;;;                          accessor CASE-LABELS)
;;;   var-labels            (initform NIL)
;;; NOTE(review): the text of this form looks incomplete -- the opening
;;; of the slot list and the head of the storage slot (the one the STORE
;;; comment and the first :documentation string describe) are not
;;; visible, and the docstring of the documentation-string slot has no
;;; closing quote.  Confirm against the upstream file before changing
;;; any slot option here.
76 (defclass DataFrame-like
(matrix-like)
79 ;; STORE is the storage component. We ignore this in the -like ;
80 ;; class, as it is the primary differentiator, driving how access
81 ;; (getting/setting) is done. We create methods depending on the
82 ;; storage component, which access data as appropriate.
87 :documentation
"Data storage: typed as table, array,
88 relation, or pointer/reference to such.")
90 (documentation-string :initform nil
93 :documentation
"uncomputable information
94 about statistical-dataset
97 ;; the rest of this is metadata. In particular, we should find a
98 ;; more flexible, compact way to store this.
99 (case-labels :initform nil
100 :initarg
:case-labels
101 :accessor case-labels
102 :documentation
"labels used for describing cases (doc
103 metadata), possibly used for merging.")
104 (var-labels :initform nil
107 :documentation
"Variable names."))
108 (:documentation
"Abstract class for standard statistical analysis
109 dataset for independent data. Rows are considered
110 to be independent, matching observations. Columns
111 are considered to be type-consistent, match a
112 varioable with distribution."))
;;; Generic function declaration only; no methods are defined in this
;;; form.  Takes a dataset designator and a list of variable names.
;;; The docstring previously carried a stray line-number token ("117")
;;; in its second line, and a bare integer preceded the form; both were
;;; listing residue and have been removed.
(defgeneric get-variable-matrix (dataset-pointer list-of-variable-names)
  (:documentation "retrieves a matrix whose columns are the variable
names in same order specified."))
;;; Generic function declaration only; no methods are defined in this
;;; form.  Single-variable counterpart of GET-VARIABLE-MATRIX.
;;; NOTE(review): the docstring wording mirrors GET-VARIABLE-MATRIX; no
;;; method is visible here to confirm the exact representation of the
;;; returned vector.
(defgeneric get-variable-vector (dataset-pointer variable-name)
  (:documentation "retrieves a vector holding the variable named by
variable-name."))
121 ;; statistical-dataset is the basic cases by variables framework.
122 ;; Need to embed this within other structures which allow for
123 ;; generalized relations. Goal is to ensure that relations imply and
124 ;; drive the potential for statistical relativeness such as
125 ;; correlation, interference, and similar concepts.
127 ;; Actions on a statistical data structure.