3 ;;; Time-stamp: <2009-04-02 15:48:29 tony>
4 ;;; Creation: <2008-03-12 17:18:42 blindglobe@gmail.com>
5 ;;; File: dataframe.lisp
6 ;;; Author: AJ Rossini <blindglobe@gmail.com>
7 ;;; Copyright: (c)2008, AJ Rossini. BSD, LLGPL, or GPLv2, depending
10 ;;; Purpose: Data packaging and access for Common Lisp Statistics.
11 ;;; This redoes dataframe structures in a CLOS based
12 ;;; framework. Currently contains the virtual class
13 ;;; DATAFRAME-LIKE as well as the actual classes
14 ;;; DATAFRAME-ARRAY and DATAFRAME-MATRIXLIKE
16 ;;; What is this talk of 'release'? Klingons do not make software
17 ;;; 'releases'. Our software 'escapes', leaving a bloody trail of
18 ;;; designers and quality assurance people in its wake.
20 (in-package :lisp-stat-dataframe
)
22 ;;; No real basis for work, there is a bit of new-ness and R-ness to
23 ;;; this work. In particular, the notion of relation is key and
24 ;;; integral to the analysis. Tables are related and matched vectors,
25 ;;; for example. "column" vectors are related observations (by
26 ;;; measure/recording) while "row" vectors are related readings (by
27 ;;; case, independence). This does mean that we are placing
28 ;;; statistical semantics into the computational data object -- and
29 ;;; that it is a violation of use to consider rows which are not at
30 ;;; the least conditionally independent (though the conditioning
31 ;;; should be outside the data set, not internally specified).
33 ;;; So we want a verb-driven API for data collection construction. We
34 ;;; should encode independence or lack of, as possible.
36 ;;; Need to figure out statistically-typed vectors. We then map a
37 ;;; series of typed vectors over to tables where columns are equal
38 ;;; typed. In a sense, this is a relation (1-1) of equal-typed
39 ;;; arrays. For the most part, this ends up making the R data.frame
40 ;;; into a relational building block (considering 1-1 mappings using
41 ;;; row ID as a relation). Is this a worthwhile generalization or
42 ;;; communicable analogy?
;;; Verbs vs. semantics for DF construction -- consider the possibility
;;; of how adverbs and verbs relate, and where to put each semantically
;;; to allow for a general approach.
48 ;;; Need to consider modification APIs
51 ;;; - get/set row names (case names)
52 ;;; - column names (variable names)
54 ;;; - annotation/metadata
55 ;;; - make sure that we do coherency checking in the exported
58 ;;; - reshapeData/reformat/reshapr a reformed version of the dataset (no
59 ;;; additional input).
60 ;;; - either overwriting or not, i.e. with or without copy.
61 ;;; - check consistency of resulting data with metadata and related
65 ;;; Misc Functions (to move into a lisp data manipulation support package)
67 ;; the next two should be merged into a general replicator pattern.
(defun gen-seq (n &optional (start 1))
  "Return the list of consecutive integers START, START+1, ..., N.
Returns NIL when N is less than START.  With the default START of 1
the result has length N.  Used for generating index and label
sequences.

  (gen-seq 4)   ; => (1 2 3 4)
  (gen-seq 4 2) ; => (2 3 4)
  (gen-seq 0)   ; => NIL"
  ;; Iterative collection replaces the APPEND-per-element recursion,
  ;; which was quadratic and could exhaust the stack for large N.
  (loop for i from start to n collect i))
(defun repeat-seq (n item)
  "Return a fresh list holding ITEM repeated N times.
Every element is the very same object ITEM (no copying).  Returns NIL
when N is zero or negative.

  (repeat-seq 3 \"d\") ; => (\"d\" \"d\" \"d\")
  (repeat-seq 3 'd)    ; => (D D D)"
  ;; MAKE-LIST signals an error for a negative length, so clamp to keep
  ;; the \"nothing to repeat\" case returning NIL.
  (make-list (max 0 n) :initial-element item))
(defun strsym->indexnum (df strsym)
  "Return the zero-based position of the DF column labelled by STRSYM,
or NIL when no variable label matches.  STRSYM may be a string or a
symbol.  Probably should be a method dispatching on DATAFRAME-LIKE
type."
  ;; POSITION defaults to EQL, which never matches two distinct string
  ;; objects with the same characters; EQUAL handles both strings and
  ;; symbols.
  (position strsym (varlabels df) :test #'equal))
(defun string->number (str)
  "Convert the string STR, representing a number, to a number.
Returns two values: the object read from STR, and a boolean that is
true only when that object is a number.  When STR cannot be read at
all (empty string, unbalanced syntax, a #. form) both values are NIL
instead of an error being signalled.

  (string->number \"123\")  ; => 123 t
  (string->number \"1.23\") ; => 1.23 t
  (string->number \"\")     ; => NIL NIL"
  ;; *READ-EVAL* is bound to NIL so that untrusted text cannot execute
  ;; code through the #. reader macro.
  (let ((*read-eval* nil))
    (let ((num (handler-case (read-from-string str)
                 ;; END-OF-FILE and READER-ERROR are both ERRORs;
                 ;; report them through the second value.
                 (error () nil))))
      (values num (numberp num)))))
#|
;; Scratch REPL examples exploring label lookup.  Kept inside a block
;; comment so that loading the file neither evaluates them nor defines
;; the throw-away special variable *TEST-POS*.

(equal 'testme 'testme)                               ; => T
(defparameter *test-pos* 'testme)
(position *test-pos* (list 'a 'b 'testme 'c))         ; => 2

;; Searching with a predicate requires POSITION-IF.  Plain POSITION
;; would look for the function object itself and always return NIL.
(position-if #'(lambda (x) (equal x "testme"))
             (list "a" "b" "testme" "c"))             ; => 2
(position-if #'(lambda (x) (equal x 1))
             (list 2 1 3 4))                          ; => 1
|#
104 ;;; abstract dataframe class
(defclass dataframe-like (matrix-like)
  ;; The initargs and accessors below are the ones the rest of this
  ;; file relies on (:CASE-LABELS, :VAR-LABELS, :VAR-TYPES and the
  ;; CASE-LABELS / VAR-LABELS / VAR-TYPES readers).
  ((case-labels :initform nil
                :initarg :case-labels
                :accessor case-labels
                :documentation "labels used for describing cases (doc
                                metadata), possibly used for merging.")
   (var-labels :initform nil
               :initarg :var-labels
               :accessor var-labels
               :documentation "Variable names.")
   (var-types :initform nil
              :initarg :var-types
              :accessor var-types
              :documentation "variable types to ensure fit")
   (doc-string :initform nil
               :initarg :doc
               :accessor doc-string
               :documentation "additional information, potentially
                               uncomputable, possibly metadata, about
                               dataframe-like instance."))
  (:documentation "Abstract class for standard statistical analysis
                   dataset for independent data.  Rows are considered
                   to be independent, matching observations.  Columns
                   are considered to be type-consistent, match a
                   variable with distribution.  inherits from
                   lisp-matrix base MATRIX-LIKE class.
                   MATRIX-LIKE (from lisp-matrix) is basically a
                   rectangular table without storage.  We emulate
                   that, and add storage, row/column labels, and
                   within-column-typing.

                   DATAFRAME-LIKE is the basic cases by variables
                   framework.  Need to embed this within other
                   structures which allow for generalized relations.
                   Goal is to ensure that relations imply and drive
                   the potential for statistical relativeness such as
                   correlation, interference, and similar concepts.

                   STORE is the storage component.  We ignore this in
                   the DATAFRAME-LIKE class, as it is the primary
                   differentiator, spec'ing the structure used for
                   storing the actual data.  We create methods which
                   depend on STORE for access.  See DATAFRAME-ARRAY
                   and DATAFRAME-MATRIXLIKE for examples.  The rest of
                   the slots hold metadata."))
155 ;;; Generics specialized above matrix-like, particularly for
156 ;;; dataframe-like objects. Need implementation of methods which
157 ;;; depend on storage form.
(defgeneric dataframe-dimensions (df)
  (:documentation "Return the dimensions of dataframe DF as a sequence
indexable with ELT (see DATAFRAME-DIMENSION).  Storage-specific
subclasses must supply a method.")
  (:method ((df dataframe-like))
    ;; CLASS-OF maps an instance to its class.  FIND-CLASS expects a
    ;; class *name* and would itself signal an unrelated error when
    ;; handed an instance.
    (error "Dispatch on virtual class, Method needed for ~
            DATAFRAME-DIMENSIONS with class ~A."
           (class-of df))))
(defgeneric dataframe-dimension (df index)
  (:documentation "Return the extent of dataframe DF along the axis
numbered INDEX, i.e. element INDEX of (DATAFRAME-DIMENSIONS DF).")
  (:method ((df dataframe-like) index)
    (let ((dims (dataframe-dimensions df)))
      (elt dims index))))
(defgeneric dfref (df index1 index2)
  (:documentation "Scalar access to entries in dataframe.  INDEX1
selects the case (row) and INDEX2 the variable (column).")
  (:method ((df dataframe-like) index1 index2)
    (declare (ignorable index1 index2))
    ;; Reached only when no storage-specific method applies.  Report
    ;; the instance's class with CLASS-OF; FIND-CLASS takes a class
    ;; name, not an instance.
    (error "Dispatch on virtual class, Method needed for DFREF with ~
            class ~A."
           (class-of df))))
(defgeneric set-dfref (df index1 index2 val)
  (:documentation "setter for dfref: store VAL at case INDEX1,
variable INDEX2 of dataframe DF.")
  (:method ((df dataframe-like) index1 index2 val)
    (declare (ignorable index1 index2 val))
    ;; Reached only when no storage-specific method applies.  Report
    ;; the instance's class with CLASS-OF; FIND-CLASS takes a class
    ;; name, not an instance.
    (error "Dispatch on virtual class, Method needed for SET-DFREF ~
            with class ~A."
           (class-of df))))

;; Makes (setf (dfref df i j) val) expand into (set-dfref df i j val).
(defsetf dfref set-dfref)
(defgeneric dfselect (df &optional cases vars indices)
  (:documentation "access to sub-dataframes.  Always returns a
dataframe.  CASES and VARS choose the rows and columns to keep;
INDICES is reserved and currently rejected.")
  (:method ((df dataframe-like) &optional cases vars indices)
    (declare (ignorable cases vars))
    (if indices (error "Indices not used yet"))
    ;; Reached only when no storage-specific method applies.  Report
    ;; the instance's class with CLASS-OF; FIND-CLASS takes a class
    ;; name, not an instance.
    (error "Dispatch on virtual class, Method needed for DFSELECT with ~
            class ~A."
           (class-of df))))
192 ;;; Specializing on superclasses...
194 ;;; Access and Extraction: implementations needed for any storage
195 ;;; type. But here, just to point out that we've got a specializing
196 ;;; virtual subclass (DATAFRAME-LIKE specializing MATRIX-LIKE).
(defmethod nrows ((df dataframe-like))
  "Placeholder on the virtual class DATAFRAME-LIKE: always signals an
error, because the row count depends on the storage used by a concrete
subclass."
  (declare (ignorable df))
  (error "Need implementation; can't dispatch on virtual class DATAFRAME-LIKE."))
(defmethod ncols ((df dataframe-like))
  "Placeholder on the virtual class DATAFRAME-LIKE: always signals an
error, because the column count depends on the storage used by a
concrete subclass."
  (declare (ignorable df))
  (error "Need implementation; can't dispatch on virtual class DATAFRAME-LIKE."))
206 ;; Testing consistency/coherency.
(defgeneric consistent-dataframe-p (df)
  (:documentation "methods to check for consistency.  Returns true when
the label counts agree with the dimensions of DF and every entry
satisfies the type declared for its variable; NIL otherwise.")
  (:method ((df dataframe-like))
    (let ((nr (nrows df))
          (nc (ncols df))
          (types (var-types df)))
      (and
       ;; ensure dimensionality
       (= (length (var-labels df)) nc)
       (= (length (case-labels df)) nr)
       ;; When dims are sane, check the type of each entry.  DFREF
       ;; bombs if DF is not a df-like subclass, so we don't worry
       ;; about specialization here.
       ;; NOTE(review): an empty VAR-TYPES is treated as "no type
       ;; constraints" -- confirm this is the intended meaning.
       (or (null types)
           (and (= (length types) nc)
                ;; The result of TYPEP must feed the answer;
                ;; discarding it made the check a no-op.
                (loop for i below nr
                      always (loop for j below nc
                                   always (typep (dfref df i j)
                                                 (elt types j))))))))))
226 ;;; FUNCTIONS WHICH DISPATCH ON INTERNAL METHODS OR ARGS
228 ;;; Q: change the following to generic functions and dispatch on
229 ;;; array, matrix, and dataframe? Others?
(defun make-labels (initstr num)
  "Generate a list of NUM strings which can be used as labels: INITSTR
followed by the decimal numbers 1 through NUM.  Returns NIL when NUM
is less than 1.  Signals a TYPE-ERROR (correctable, via CHECK-TYPE)
when INITSTR is not a string.

  (make-labels \"a\" 3) ; => (\"a1\" \"a2\" \"a3\")"
  (check-type initstr string)
  ;; One pass, no helper lists: the previous version built a list of
  ;; prefixes, a list of numbers and a list of number strings before
  ;; concatenating them pairwise.
  (loop for i from 1 to num
        collect (format nil "~A~D" initstr i)))
(defun ncase-store (store)
  "Return the number of cases (rows) held by the storage object STORE.
STORE must be a Lisp array (dimension 0 is used) or a MATRIX-LIKE
(NROWS is used); anything else signals a TYPE-ERROR."
  ;; The clauses only make sense as a type dispatch; without the
  ;; ETYPECASE head they would be evaluated as function calls.
  (etypecase store
    (array (array-dimension store 0))
    (matrix-like (nrows store))))
(defun nvars-store (store)
  "Return the number of variables (columns) held by the storage object
STORE.  STORE must be a Lisp array (dimension 1 is used) or a
MATRIX-LIKE (NCOLS is used); anything else signals a TYPE-ERROR."
  ;; The clauses only make sense as a type dispatch; without the
  ;; ETYPECASE head they would be evaluated as function calls.
  (etypecase store
    (array (array-dimension store 1))
    (matrix-like (ncols store))))
(defun make-dataframe (newdata
                       &key (vartypes nil)
                         (caselabels nil) (varlabels nil)
                         (doc "no docs"))
  "Helper function to use instead of make-instance to assure
construction of proper DF-array.

NEWDATA is the storage: a Lisp array yields a DATAFRAME-ARRAY, a
MATRIX-LIKE yields a DATAFRAME-MATRIXLIKE.  CASELABELS and VARLABELS,
when given, must be sequences whose lengths match the number of rows
and columns of NEWDATA; when omitted, labels C1, C2, ... and V1, V2,
... are generated.  VARTYPES is passed through as the variable types.
DOC must be a string."
  (check-type newdata (or matrix-like array))
  (check-type caselabels sequence)
  (check-type varlabels sequence)
  ;; NOTE(review): DOC is validated but not handed to MAKE-INSTANCE
  ;; below -- confirm which initarg should receive it.
  (check-type doc string)
  (let ((ncases (ncase-store newdata))
        (nvars (nvars-store newdata)))
    (if caselabels (assert (= ncases (length caselabels))))
    (if varlabels (assert (= nvars (length varlabels))))
    ;; Caller-supplied labels win; otherwise synthesize defaults.
    (let ((newcaselabels (if caselabels
                             caselabels
                             (make-labels "C" ncases)))
          (newvarlabels (if varlabels
                            varlabels
                            (make-labels "V" nvars))))
      ;; Choose the concrete dataframe class from the storage type.
      (etypecase newdata
        (array
         (make-instance 'dataframe-array
                        :storage newdata
                        :nrows (length newcaselabels)
                        :ncols (length newvarlabels)
                        :case-labels newcaselabels
                        :var-labels newvarlabels
                        :var-types vartypes))
        (matrix-like
         (make-instance 'dataframe-matrixlike
                        :storage newdata
                        :nrows (length newcaselabels)
                        :ncols (length newvarlabels)
                        :case-labels newcaselabels
                        :var-labels newvarlabels
                        :var-types vartypes))))))
#|
;; Scratch REPL examples for MAKE-DATAFRAME.  Kept inside a block
;; comment: evaluated at load time they would run before the concrete
;; dataframe classes are defined, and two of them are expected to
;; signal errors.

(make-dataframe #2A((1.2d0 1.3d0) (2.0d0 4.0d0)))
(make-dataframe #2A(('a 1) ('b 2)))
(dfref (make-dataframe #2A(('a 1) ('b 2))) 0 1)
(dfref (make-dataframe #2A(('a 1) ('b 2))) 1 0)
(make-dataframe 4)           ; ERROR, should we allow?
(make-dataframe #2A((4)))
(make-dataframe (rand 10 5)) ;; ERROR, but should work!
|#
(defun row-order-as-list (ary)
  "Pull out the data of the two-dimensional array ARY in row order
into a fresh list: all of row 0, then all of row 1, and so on.

  (row-order-as-list #2A((1 2) (3 4))) ; => (1 2 3 4)"
  (let ((nrows (array-dimension ary 0))
        (ncols (array-dimension ary 1)))
    ;; Collect explicitly: APPEND is non-destructive, so calling it for
    ;; effect (as before) left the result list empty.
    (loop for i below nrows
          nconc (loop for j below ncols
                      collect (aref ary i j)))))
(defun col-order-as-list (ary)
  "Pull out the data of the two-dimensional array ARY in column order
into a fresh list: all of column 0, then all of column 1, and so on.

  (col-order-as-list #2A((1 2) (3 4))) ; => (1 3 2 4)"
  (let ((nrows (array-dimension ary 0))
        (ncols (array-dimension ary 1)))
    ;; Collect explicitly: APPEND is non-destructive, so calling it for
    ;; effect (as before) left the result list empty.
    (loop for j below ncols
          nconc (loop for i below nrows
                      collect (aref ary i j)))))
(defun transpose-array (ary)
  "Return a fresh array that is the transpose of the two-dimensional
array ARY: element (j, i) of the result is element (i, j) of ARY.
The result has the same element type as ARY.

  (transpose-array #2A((1 2 3) (4 5 6))) ; => #2A((1 4) (2 5) (3 6))"
  (destructuring-bind (nrows ncols) (array-dimensions ary)
    ;; Fill the result directly.  :INITIAL-CONTENTS for a rank-2 array
    ;; needs a nested sequence, so a flat column-order list cannot be
    ;; used for it.
    (let ((result (make-array (list ncols nrows)
                              :element-type (array-element-type ary))))
      (dotimes (i nrows result)
        (dotimes (j ncols)
          (setf (aref result j i) (aref ary i j)))))))
321 ;;; THE FOLLOWING 2 dual-sets done to provide error checking
322 ;;; possibilities on top of the generic function structure. Not
323 ;;; intended as make-work!
(defun varlabels (df)
  "Variable-name handling for DATAFRAME-LIKE: return the variable
labels of DF.  Needs error checking."
  (var-labels df))
(defun set-varlabels (df vl)
  "Variable-name handling for DATAFRAME-LIKE: replace the variable
labels of DF with VL and return VL.  Signals an error when VL does not
have the same length as the current labels.  Needs error checking."
  ;; Both operands of the comparison are required: the new labels must
  ;; be exactly as numerous as the ones they replace.
  (if (= (length (var-labels df))
         (length vl))
      (setf (var-labels df) vl)
      (error "wrong size.")))

;; Makes (setf (varlabels df) vl) expand into (set-varlabels df vl).
(defsetf varlabels set-varlabels)
338 ;;; Case-name handling for Tables. Needs error checking.
(defun caselabels (df)
  "Case-name handling for DATAFRAME-LIKE: return the case labels of
DF.  Needs error checking."
  (case-labels df))
(defun set-caselabels (df cl)
  "Case-name handling for DATAFRAME-LIKE: replace the case labels of
DF with CL and return CL.  Signals an error when CL does not have the
same length as the current labels.  Needs error checking."
  ;; Both operands of the comparison are required: the new labels must
  ;; be exactly as numerous as the ones they replace.
  (if (= (length (case-labels df))
         (length cl))
      (setf (case-labels df) cl)
      (error "wrong size.")))

;; Makes (setf (caselabels df) cl) expand into (set-caselabels df cl).
(defsetf caselabels set-caselabels)
352 ;;;;;;;;;;;; IMPLEMENTATIONS, with appropriate methods.
354 ;; (documentation 'dataframe-like 'type)
356 ;;;;; DATAFRAME-ARRAY
(defclass dataframe-array (dataframe-like)
  ;; The :STORAGE initarg and the DATASET accessor are the names used
  ;; by the constructors and methods elsewhere in this file.
  ((store :initform nil
          :initarg :storage
          :accessor dataset
          :documentation "Data storage: typed as array."))
  (:documentation "example implementation of dataframe-like using storage
  based on lisp arrays.  An obvious alternative could be a
  dataframe-matrix-like which uses the lisp-matrix classes."))
(defmethod nrows ((df dataframe-array))
  "Number of cases in DF: the extent of dimension 0 of its array store."
  (let ((store (dataset df)))
    (array-dimension store 0)))
(defmethod ncols ((df dataframe-array))
  "Number of variables in DF: the extent of dimension 1 of its array
store."
  (let ((store (dataset df)))
    (array-dimension store 1)))
(defmethod dfref ((df dataframe-array)
                  (index1 number) (index2 number))
  "Return the scalar stored at case INDEX1, variable INDEX2 of DF, read
from the array store with AREF (compare aref, mref, vref, etc.)."
  (let ((store (dataset df)))
    (aref store index1 index2)))
(defmethod set-dfref ((df dataframe-array)
                      (index1 number) (index2 number) val)
  "Store VAL at case INDEX1, variable INDEX2 of the array store of DF
and return VAL."
  ;; TODO: check VAL against the declared type of variable INDEX2.
  (let ((store (dataset df)))
    (setf (aref store index1 index2) val)))
(defparameter *default-dataframe-class* 'dataframe-array
  "Name of the class instantiated when a new dataframe has to be
created without an explicit class, e.g. by DFSELECT.")
(defmethod dfselect ((df dataframe-array)
                     &optional cases vars indices)
  "Extract the sub-dataframe of DF made of the cases labelled by CASES
and the variables labelled by VARS, and return it as a new instance of
*DEFAULT-DATAFRAME-CLASS*.  CASES and VARS are sequences of labels,
matched against the labels of DF with EQUAL.  An unknown label signals
an error.  INDICES is reserved and currently rejected.

NOTE(review): an omitted (NIL) CASES or VARS is taken to mean \"keep
all of them\" -- confirm against callers."
  (if indices (error "Indices not used yet"))
  (flet ((label-index (label labels kind)
           ;; EQUAL, not the default EQL, so that string labels match.
           (or (position label labels :test #'equal)
               (error "Unknown ~A label ~S." kind label))))
    (let* ((cases (or cases (case-labels df)))
           (vars (or vars (var-labels df)))
           ;; Resolve each label to its index once, up front.
           (rows (map 'list
                      #'(lambda (c) (label-index c (case-labels df) "case"))
                      cases))
           (cols (map 'list
                      #'(lambda (v) (label-index v (var-labels df) "variable"))
                      vars))
           (types (var-types df))
           (newdf (make-instance *default-dataframe-class*
                                 :storage (make-array (list (length rows)
                                                            (length cols)))
                                 :nrows (length rows)
                                 :ncols (length cols)
                                 :case-labels (copy-seq cases)
                                 :var-labels (copy-seq vars)
                                 :var-types (when types
                                              (mapcar #'(lambda (c)
                                                          (elt types c))
                                                      cols)))))
      ;; Copy the selected cells into the new storage.
      (loop for i from 0
            for r in rows
            do (loop for j from 0
                     for c in cols
                     do (setf (dfref newdf i j) (dfref df r c))))
      ;; Return the new dataframe, not the NIL that the loops yield.
      newdf)))
410 ;;;;; DATAFRAME-MATRIXLIKE
(defclass dataframe-matrixlike (dataframe-like)
  ;; The :STORAGE initarg and the DATASET accessor are the names used
  ;; by the constructors and methods elsewhere in this file.
  ((store :initform nil
          :initarg :storage
          :accessor dataset
          :documentation "Data storage: typed as matrix-like."))
  (:documentation "example implementation of dataframe-like using storage
  based on lisp-matrix structures."))
(defmethod nrows ((df dataframe-matrixlike))
  "Number of cases in DF: MATRIX-DIMENSION 0 of its matrix-like store."
  (let ((store (dataset df)))
    (matrix-dimension store 0)))
(defmethod ncols ((df dataframe-matrixlike))
  "Number of variables in DF: MATRIX-DIMENSION 1 of its matrix-like
store."
  (let ((store (dataset df)))
    (matrix-dimension store 1)))
(defmethod dfref ((df dataframe-matrixlike)
                  (index1 number) (index2 number))
  "Return the scalar stored at case INDEX1, variable INDEX2 of DF, read
from the matrix-like store with MREF (compare aref, mref, vref, etc.)."
  (let ((store (dataset df)))
    (mref store index1 index2)))
(defmethod set-dfref ((df dataframe-matrixlike)
                      (index1 number) (index2 number) val)
  "Store VAL at case INDEX1, variable INDEX2 of the matrix-like store
of DF and return the result of the SETF of MREF."
  ;; TODO: check VAL against the declared type of variable INDEX2
  ;; before storing it.
  (let ((store (dataset df)))
    (setf (mref store index1 index2) val)))
445 ;;;;;; IMPLEMENTATION INDEPENDENT FUNCTIONS AND METHODS
446 ;;;;;; (use only dfref, nrows, ncols and similar dataframe-like
447 ;;;;;; components as core).
(defun dfref-var (df index return-type)
  "Return the data of the single variable (column) INDEX of DF, one
entry per case, in case order.

RETURN-TYPE selects the container: :LIST (also LIST, :SEQUENCE or
SEQUENCE) gives a list, :VECTOR (or VECTOR) gives a simple vector.
Any other value signals an error; vector-like and dataframe results
are not implemented yet."
  ;; Uses only NROWS and DFREF so it works for every storage type.
  ;; Cases are indexed from zero, and the variable index is the
  ;; *second* argument of DFREF.
  (let ((column (loop for i below (nrows df)
                      collect (dfref df i index))))
    (ecase return-type
      ((:list list :sequence sequence) column)
      ((:vector vector) (coerce column 'vector)))))
(defun dfref-case (df index return-type)
  "Return the data of the single case (row) INDEX of DF, one entry per
variable, in variable order.

RETURN-TYPE selects the container: :LIST (also LIST, :SEQUENCE or
SEQUENCE) gives a list, :VECTOR (or VECTOR) gives a simple vector.
Any other value signals an error."
  ;; Uses only NCOLS and DFREF so it works for every storage type.
  ;; Variables are indexed from zero, and the case index is the
  ;; *first* argument of DFREF.
  (let ((row (loop for j below (ncols df)
                   collect (dfref df index j))))
    (ecase return-type
      ((:list list :sequence sequence) row)
      ((:vector vector) (coerce row 'vector)))))
(defun dfref-2indexlist (df indexlist1 indexlist2 &key (return-type :array))
  "Return the entries of DF at the cases listed in INDEXLIST1 crossed
with the variables listed in INDEXLIST2, row X col dims.  Both index
lists are lists of zero-based indices.

RETURN-TYPE :ARRAY (the default) returns a fresh two-dimensional
array.  :DATAFRAME returns a fresh DATAFRAME-ARRAY whose case labels,
variable labels and variable types are the corresponding subsets of
those of DF.  Any other value signals an error.  FIXME TESTME"
  (let* ((nr (length indexlist1))
         (nc (length indexlist2))
         (data (make-array (list nr nc))))
    ;; Fill the result cell by cell.  The data is copied, so the result
    ;; never shares storage with DF.
    (loop for i from 0
          for x in indexlist1
          do (loop for j from 0
                   for y in indexlist2
                   do (setf (aref data i j) (dfref df x y))))
    (ecase return-type
      (:array data)
      (:dataframe
       (flet ((pick (seq indices)
                ;; Subset SEQ by INDICES; absent metadata stays NIL.
                (when seq
                  (mapcar #'(lambda (k) (elt seq k)) indices))))
         (make-instance 'dataframe-array
                        :storage data
                        :nrows nr
                        :ncols nc
                        :case-labels (pick (case-labels df) indexlist1)
                        :var-labels (pick (var-labels df) indexlist2)
                        :var-types (pick (var-types df) indexlist2)))))))
499 ;;; Do we establish methods for dataframe-like, which specialize to
500 ;;; particular instances of storage?
(defmethod print-object ((object dataframe-like) stream)
  "Print OBJECT unreadably to STREAM: its type and dimensions, then a
tab-separated header of variable labels, then one line per case with
the case label followed by that case's values."
  (print-unreadable-object (object stream :type t)
    (format stream " ~d x ~d" (nrows object) (ncols object))
    (terpri stream)
    ;; (format stream "~T ~{~S ~T~}" (var-labels object))
    (dotimes (j (ncols object)) ; print labels
      (write-char #\tab stream)
      (write-char #\tab stream)
      ;; ELT rather than NTH: labels are only required to be sequences.
      (format stream "~T~A~T" (elt (var-labels object) j)))
    (dotimes (i (nrows object)) ; print obs row
      (terpri stream)
      (format stream "~A:~T" (elt (case-labels object) i))
      (dotimes (j (ncols object))
        (write-char #\tab stream) ; (write-char #\space stream)
        ;; (write (dfref object i j) :stream stream)
        ;; If this works, a general output mechanism control is still
        ;; needed.
        (format stream "~7,3E" (dfref object i j))))))
521 (defun print-structure-relational (ds)
522 "example of what we want the methods to look like. Should be sort
523 of like a graph of spreadsheets if the storage is a relational
525 (dolist (k (relations ds
))
526 (let ((currentRelationSet (getRelation ds k
)))
527 (print-as-row (var-labels currentRelationSet
))
529 (dolist (i (case-labels currentRelationSet
))
532 (dfref-obsn (dataset currentRelationSet
)