docs and description improved.
[CommonLispStat.git] / src / data / data.lisp
blobac39d097a8f4131e9ccd27587cdbe905d5c9c5f3
1 ;;; -*- mode: lisp -*-
2 ;;; Copyright (c) 2005--2007, by A.J. Rossini <blindglobe@gmail.com>
3 ;;; See COPYRIGHT file for any additional restrictions (BSD license).
4 ;;; Since 1991, ANSI was finally finished. Edited for ANSI Common Lisp.
6 ;;; File: data.lisp
7 ;;; Author: AJ Rossini <blindglobe@gmail.com>
8 ;;; Copyright: (c)2007, AJ Rossini. BSD, LLGPL, or GPLv2, depending
9 ;;; on how it arrives.
10 ;;; Purpose: data package for lispstat
11 ;;; Time-stamp: <2006-05-19 12:33:41 rossini>
12 ;;; Creation: <2006-05-17 21:34:07 rossini>
14 ;;; What is this talk of 'release'? Klingons do not make software
15 ;;; 'releases'. Our software 'escapes', leaving a bloody trail of
16 ;;; designers and quality assurance people in its wake.
18 ;;; This organization and structure is new to the 21st Century
19 ;;; version.
;;; Consider that data has 3 genotypic characteristics.  The first
;;; would be form -- scalar, vector, array.  The second would be datarep
;;; (the "computer science simplistic data" type), in particular integer,
;;; real, string, symbol.  The last would be statistical type
;;; ("usually handled by computer science approaches via metadata"),
;;; augmenting datarep type with use in a statistical context,
;;; i.e. that would include nominal, ordinal, integer, continuous,
;;; interval (orderable subtypes).  Clearly, the statistical type can
;;; be inherited, likewise the numerical type as well.  The form can
;;; be pushed up or simplified as necessary, but this can be
;;; challenging.
33 ;;; The first approach we will take for CLS is to handle this as
34 ;;; lisp-only structures. At the time of realization
35 ;;; (?instantiation?) of an "abstract" model, the data should be
36 ;;; pushed into an appropriate form (either "en masse", or
37 ;;; "on-demand") into a linear algebra framework.
39 ;;; There is some excellent material on this by John Chambers in one
40 ;;; of his earlier books. Reference is being ignored to encourage
41 ;;; people to read them all. With all due respect to John, they've
42 ;;; lasted quite well, but need to be updated.
;;; Define the data-management package from CL-USER, then switch into it
;;; so that every definition below is interned in LISP-STAT-DATA.
(in-package :cl-user)

(defpackage :lisp-stat-data
  (:nicknames :ls-data)
  (:documentation
   "Data management, integration, I/O, and other data technologies.")
  (:use :common-lisp
        :lisp-stat-object-system
        :lisp-stat-config
        :lisp-stat-types
        :lisp-stat-compound-data)
  ;; Both COMMON-LISP and the LispStat object system are used; for these
  ;; three names the object system's symbols are the ones made present here.
  (:shadowing-import-from :lisp-stat-object-system
                          #:slot-value #:call-method #:call-next-method)
  (:export
   ;; data file reading and loading
   #:open-file-dialog #:read-data-file #:read-data-columns
   #:load-data #:load-example
   ;; bookkeeping for def'ed variables
   #:*variables* #:*ask-on-redefine*
   #:def #:variables #:savevar #:undef))

(in-package :lisp-stat-data)
62 ;;; The purpose of this package is to manage data which will be
63 ;;; processed by LispStat. In particular, it will be important to
64 ;;; register variables, datasets, relational structures, and other
65 ;;; objects which could be the target for statistical modeling and
66 ;;; inference.
(defvar *lisp-stat-data-table* (make-hash-table :test 'eql)
  "Hash table (EQL test, initially empty) set aside for recording data.
NOTE(review): nothing in the visible part of this file reads or writes it,
and the original docstring was cut short -- confirm the intended key/value
layout before relying on it.")

(defvar *lisp-stat-data-count* 0
  "Number of items currently recorded; starts at 0.")
74 ;;; Data (storage) Types, dt-{.*}
75 ;;;
;;; Data types are the representation of data from a computer-science
;;; perspective, i.e. what it is that they contain.  These types
;;; include particular forms of compound types (i.e. a dataframe is
;;; array-like but its types differ, the difference being row-wise,
;;; while an array is a compound of elements of the same type).
81 ;;;
82 ;;; This is completely subject to change.
84 ;;Examples:
85 ;; (defun equidimensional (a)
86 ;; (or (< (array-rank a) 2)
87 ;; (apply #'= (array-dimensions a)))) => EQUIDIMENSIONAL
88 ;; (deftype square-matrix (&optional type size)
89 ;; `(and (array ,type (,size ,size))
90 ;; (satisfies equidimensional))) => SQUARE-MATRIX
(defun array-of-equal-dt-scalar-type (x)
  "Placeholder predicate: returns the symbol INTEGER for any non-NIL X and
NIL when X is NIL.  The contents of X are never inspected."
  ;; TODO (original author's intent): return the dt-scalar type that
  ;; actually fits X, ideally the most precise one that works.
  (and x 'integer))
(defun array-of-equal-dt-scalar-type-within-column (x)
  "Placeholder predicate: returns the symbol INTEGER for any non-NIL X and
NIL when X is NIL.  The contents of X are never inspected."
  ;; TODO (original author's intent): return the dt-scalar type that
  ;; actually fits X, ideally the most precise one that works.
  (when x
    'integer))
(deftype dt-scalar (&optional type)
  "A scalar datum: an INTEGER, a DOUBLE-FLOAT, a COMPLEX number or a SYMBOL.
TYPE is accepted so existing (dt-scalar <type>) specifiers keep working,
but it does not yet narrow the type."
  (declare (ignore type))
  ;; DOUBLE is not a standard type specifier; the standard name for
  ;; double-precision floats is DOUBLE-FLOAT.
  '(or integer double-float complex symbol))
(deftype dt-array (&optional ndim dimlist)
  "Objects accepted by the predicate ARRAY-OF-EQUAL-DT-SCALAR-TYPE.
NDIM and DIMLIST are accepted but do not affect the expansion."
  (declare (ignore ndim dimlist))
  '(satisfies array-of-equal-dt-scalar-type))
(deftype dt-dataframe ()
  "Objects accepted by the predicate
ARRAY-OF-EQUAL-DT-SCALAR-TYPE-WITHIN-COLUMN."
  '(satisfies array-of-equal-dt-scalar-type-within-column))
115 ;(deftype dt-relationaldata ()
116 ; `(satisfies (foreach unit in relationalUnit
117 ; (typep unit 'dt-dataframe))))
120 ;;; Statistical Variable Types, sv-{.*}
121 ;;;
;;; Statistical variable types work to represent the statistical
;;; category represented by the variable, i.e. nominal, ordinal,
;;; integral, continuous, ratio.  This metadata can be used to hint at
;;; appropriate analysis methods -- or perhaps more critically, to
;;; define how these methods will fail in the final interpretation.
128 (deftype sv-nominal (&optional n)
131 (deftype sv-ordinal (ordering &optional n)
(deftype sv-categorical ()
  "A categorical statistical variable: either SV-NOMINAL or SV-ORDINAL."
  ;; SATISFIES accepts only a symbol naming a one-argument predicate, so a
  ;; union of types has to be written as a plain OR type specifier.
  ;; SV-ORDINAL has a required ORDERING parameter; * leaves it unspecified.
  ;; NOTE(review): confirm that SV-ORDINAL's expansion tolerates * here.
  '(or sv-nominal (sv-ordinal *)))
136 ;;(deftype sv-integer )
137 ;;(deftype sv-real ) ;; precision could be a secondary component of real, rational, complex.
138 ;;(deftype sv-rational )
139 ;;(deftype sv-complex )
140 ;;(deftype sv-continuous (or 'sv-integer 'sv-real 'sv-rational 'sv-complex)) ;; perhaps, call it "mostly contin..."
143 ;;; Data I/O
145 ;; We can read 2 types of data -- those which are pure data, and those
146 ;; which are impure (lisp-enabled, data as program as data thingy's).
(defparameter *lisp-stat-data-formats*
  (list 'csv 'tsv)
  "List of symbols naming data file formats: CSV and TSV.")
151 ;; (defgeneric data-read (srce frmt)
152 ;; "read data from stream srce, in format frmt.")
;; (defgeneric data-write (srce frmt)
;; "write data to stream srce, in format frmt.")
157 ;; (defmacro with-data (body)
158 ;; "Stream-handling, maintaining I/O through object typing.")
160 ;; design-wise should these be replaced with a "with-data" form?
162 ;;; These need to be elsewhere...!
163 ;; DSV processing
164 ;; XML processing
166 ;;; DM operations should be somewhere else as well.
167 ;;; Data Management
169 ;; the goal is to have 2 operations which can be used to create new
170 ;; data formats out of old ones.
172 ;; (defgeneric data-subset (ds description)
173 ;; "Take a dataset and make it smaller.")
175 ;; (defgeneric data-relate (ds description)
176 ;; "Take 2 or more datasets, and grow them into a bigger one through
177 ;; relating them (i.e. merge is one example).")
179 ;;; What should we be able to do?
181 ;;; Actions on a single dataset
182 ;;* subset-dataset original-set
183 ;; :list-of-columns :list-of-rows :list-of-rows-and-columns
184 ;; :list-of-indices
185 ;;* resample-dataset original-set ;
186 ;; :by-rows :by-columns :row-weights :column-weights
187 ;; :new-number-of-columns :new-number-of-rows
190 ;;; Actions based on 2 or more datasets
191 ;;* concat-dataset set1 set2 ; no matching
192 ;; :by-row :by-column :kronecker-product
193 ;;* merge-dataset set1 set2
194 ;; :match-on-column :match-on-row
197 ;;; Data tools from "statistics.lsp"
199 ;;;;
200 ;;;; Data File Reading
201 ;;;;
(defun count-file-columns (fname)
  "Args: (fname)
Returns the number of lisp items on the first nonblank line of file FNAME.
A line is blank when it is empty or holds only whitespace.  Returns NIL
when the file has no nonblank line.  Signals an error if FNAME cannot be
opened for input."
  (flet ((blank-line-p (line)
           (every (lambda (ch)
                    (member ch '(#\Space #\Tab #\Return #\Page)))
                  line)))
    (with-open-file (f fname)
      ;; READ-LINE is given EOF-ERROR-P = NIL so that running off the end
      ;; of the file yields NIL instead of signalling END-OF-FILE.
      (let ((line (loop for candidate = (read-line f nil nil)
                        while (and candidate (blank-line-p candidate))
                        finally (return candidate))))
        (when line
          ;; NOTE(review): READ honours *READ-EVAL*, so a hostile data file
          ;; could run code via #. -- bind *READ-EVAL* to NIL around calls
          ;; that read untrusted files.
          (with-input-from-string (s line)
            (loop with eof = (gensym)
                  until (eq eof (read s nil eof))
                  count t)))))))
;; Only read when the XLISP feature is present: keeps a reference to the
;; readtable that is current while this file is being loaded.
#+xlisp
(defvar *xlisptable* *readtable*)
;; Install a fallback only when nothing has defined OPEN-FILE-DIALOG yet.
;; The callers in this file invoke it as (open-file-dialog t), so the
;; fallback must accept one optional argument; a zero-argument definition
;; made every such call fail with an argument-count error.
(unless (fboundp 'open-file-dialog)
  #+dialogs
  (defun open-file-dialog (&optional set)
    "Args: (&optional set)
Asks for a data file name through GET-STRING-DIALOG and returns its result.
SET is accepted for compatibility with existing callers and is ignored."
    (declare (ignore set))
    (get-string-dialog "Enter a data file name:"))
  #-dialogs
  (defun open-file-dialog (&optional set)
    "Args: (&optional set)
No dialog support is available, so this always signals an error asking
for an explicit file name.  SET is accepted for compatibility with
existing callers and is ignored."
    (declare (ignore set))
    (error "You must provide a file name explicitly")))
(defun read-data-file (&optional (file (open-file-dialog t)))
  "Args: (file)
Returns a list of all lisp objects in FILE, in the order they are read.
Returns NIL when FILE is NIL.  FILE is handed to WITH-OPEN-FILE unchanged.
NOTE(review): earlier documentation said a symbol's print name may be
used as FILE; nothing here performs that conversion -- confirm."
  (when file
    (with-open-file (in file)
      ;; A fresh uninterned symbol marks end of file, so no object that
      ;; can appear in the data is mistaken for it.
      (loop with end-marker = (gensym)
            for object = (read in nil end-marker)
            until (eq object end-marker)
            collect object))))
239 ;;; New definition to avoid stack size limit in apply
(defun read-data-columns (&optional (file (open-file-dialog t))
                                    (cols (and file
                                               (count-file-columns file))))
  "Args: (&optional file cols)
Reads the data in FILE as COLS columns and returns a list of lists
representing the columns.  COLS defaults to the result of
COUNT-FILE-COLUMNS on FILE.  Returns NIL when FILE or COLS is NIL."
  (when (and file cols)
    ;; SPLIT-LIST and TRANSPOSE are defined elsewhere: the flat list of
    ;; items is split using COLS, then the result is transposed.
    (let ((rows (split-list (read-data-file file) cols)))
      (transpose rows))))
249 ;;; FIXME:AJR: ALL THE FOLLOWING NEED TO BE SOLVED BY PLATFORM-INDEP PATHNAME WORK!
250 ;;; FIXME:AJR: use either string or pathname.
(defun path-string-to-path (p s)
  "Appends the string S to the namestring of P and returns the combined
text converted to a pathname.  No separator is inserted between them."
  (let ((full-name (concatenate 'string (namestring p) s)))
    (pathname full-name)))
(defun load-data (file)
  "Args: (file) as string
Read in data file from the data examples library.
FILE is looked up under *LISPSTAT-DATA-DIR* first; when no such file
exists there it is loaded from *LISPSTAT-EXAMPLES-DIR* instead.  Returns
true when a file was loaded and signals an error when FILE is found in
neither directory."
  ;; With :IF-DOES-NOT-EXIST NIL, LOAD returns NIL for a missing file
  ;; instead of signalling, which lets OR fall through to the second
  ;; directory.  The second LOAD keeps the default so a miss is reported.
  (or (load (path-string-to-path *lispstat-data-dir* file)
            :if-does-not-exist nil)
      (load (path-string-to-path *lispstat-examples-dir* file))))
(defun load-example (file)
  "Args: (file) as string
Read in lisp example file from the examples library.
FILE is looked up under *LISPSTAT-EXAMPLES-DIR* first; when no such file
exists there it is loaded from *LISPSTAT-DATA-DIR* instead.  Returns true
when a file was loaded and signals an error when FILE is found in neither
directory."
  ;; With :IF-DOES-NOT-EXIST NIL, LOAD returns NIL for a missing file
  ;; instead of signalling, which lets OR fall through to the second
  ;; directory.  The second LOAD keeps the default so a miss is reported.
  (or (load (path-string-to-path *lispstat-examples-dir* file)
            :if-does-not-exist nil)
      (load (path-string-to-path *lispstat-data-dir* file))))
269 ;;;;
270 ;;;; Listing and Saving Variables and Functions
271 ;;;;
(defvar *variables* '()
  "List of the symbols that have been defined with DEF; DEF adds to it.")

(defvar *ask-on-redefine* nil
  "When non-NIL, DEF asks for confirmation before giving a new value to a
variable that is already bound.")
(defmacro def (symbol value)
  "Syntax: (def var form)
VAR is not evaluated and must be a symbol.  Assigns the value of FORM to
VAR and adds VAR to the list *VARIABLES* of def'ed variables.  Returns VAR.
If VAR is already bound and the global variable *ASK-ON-REDEFINE* is not
nil, you are asked whether to redefine the variable; answering no leaves
VAR untouched, skips evaluating FORM, and returns NIL."
  ;; Proceed unless all three hold: asking is enabled, VAR is bound, and
  ;; the user declines.  The prompt is reached only when the first two are
  ;; true.
  `(when (or (not *ask-on-redefine*)
             (not (boundp ',symbol))
             (y-or-n-p "Variable has a value. Redefine?"))
     (cond ((boundp ',symbol)
            (setf ,symbol ,value))
           (t
            (defvar ,symbol ,value)))
     (pushnew ',symbol *variables*)
     ',symbol))
(defun variables-list ()
  "Returns the entries of *VARIABLES* as symbols: their names are passed
through SORT-DATA (defined elsewhere) and each resulting name is interned
in the current package."
  (let* ((names (mapcar #'string *variables*))
         (ordered-names (sort-data names)))
    (mapcar #'intern ordered-names)))
(defun variables ()
  "Args:()
Returns a list of the names of all def'ed variables, as symbols interned
in the current package and ordered by SORT-DATA applied to their names.
Returns NIL when no variable has been def'ed."
  ;; VARIABLES-LIST performs exactly the string/sort/intern pipeline; the
  ;; guard keeps SORT-DATA from being called on an empty list.
  (and *variables*
       (variables-list)))
(defun savevar (vars file)
  "Args: (vars file-name-root)
VARS is a symbol or a list of symbols. FILE-NAME-ROOT is a string (or a symbol
whose print name is used) not ending in .lsp. The VARS and their current values
are written to the file FILE-NAME-ROOT.lsp in a form suitable for use with the
load command. An existing file of that name is replaced. Returns the list of
saved symbols."
  (let ((vars (if (listp vars) vars (list vars)))
        ;; NAMESTRING does not accept symbols, so take the print name
        ;; explicitly to honour the documented string-or-symbol contract.
        (root (if (symbolp file) (symbol-name file) (namestring file))))
    (with-open-file (f (concatenate 'string root ".lsp")
                       :direction :output
                       ;; Saving again to the same root overwrites the
                       ;; previous file instead of signalling an error.
                       :if-exists :supersede)
      (dolist (x vars)
        (let ((v (symbol-value x)))
          ;; Objects are asked for their own re-creation form via :SAVE;
          ;; any other value is written quoted.
          (if (objectp v)
              (format f "(def ~s ~s)~%" x (send v :save))
              (format f "(def ~s '~s)~%" x v)))))
    vars))
317 (defun undef (v)
318 "Args: (v)
319 If V is the symbol of a defined variable the variable it is unbound and
320 removed from the list of defined variables. If V is a list of variable
321 names each is unbound and removed. Returns V."
322 (dolist (s (if (listp v) v (list v)))
323 (when (member s *variables*)
324 (setq *variables* (delete s *variables*))
325 (makunbound s)))