redid timestamp madness. Ignore, please.
[CommonLispStat.git] / data.lisp
blobd39e8c34cd0607158e087804608d27282c2c47d4
1 ;;; -*- mode: lisp -*-
2 ;;; Copyright (c) 2005--2007, by A.J. Rossini <blindglobe@gmail.com>
3 ;;; See COPYRIGHT file for any additional restrictions (BSD license).
4 ;;; Since 1991, ANSI was finally finished. Edited for ANSI Common Lisp.
6 ;;; File: data.lisp
7 ;;; Author: AJ Rossini <blindglobe@gmail.com>
8 ;;; Copyright: (c)2007, AJ Rossini. BSD, LLGPL, or GPLv2, depending
9 ;;; on how it arrives.
10 ;;; Purpose: data package for lispstat
11 ;;; Time-stamp: <2006-05-19 12:33:41 rossini>
12 ;;; Creation: <2006-05-17 21:34:07 rossini>
14 ;;; What is this talk of 'release'? Klingons do not make software
15 ;;; 'releases'. Our software 'escapes', leaving a bloody trail of
16 ;;; designers and quality assurance people in its wake.
18 ;;; This organization and structure is new to the 21st Century
19 ;;; version.
21 ;;; Consider that data has 3 genotypic characteristics. The first
22 ;;; would be form -- scalar, vector, array. The second would be
23 ;;; datarep type, in particular integer, real, string, symbol. The last
24 ;;; would be statistical type, augmenting datarep type with use in a
25 ;;; statistical context, i.e. that would include nominal, ordinal,
26 ;;; integer, continuous, interval (orderable subtypes). Clearly, the
27 ;;; statistical type can be inherited, likewise the numerical type as
28 ;;; well. The form can be pushed up or simplified as necessary, but
29 ;;; this can be challenging.
31 (in-package :cl-user)
;;; Package holding the data I/O and variable-management tools.
;;;
;;; Symbol names in :SHADOWING-IMPORT-FROM and :EXPORT are written as
;;; uninterned symbols (#:name).  DEFPACKAGE only looks at the names, so
;;; the resulting package is the same, but reading this form no longer
;;; interns DEF, VARIABLES, UNDEF, ... into whatever package is current
;;; (CL-USER here), which would otherwise provoke a name conflict if that
;;; package later uses LISP-STAT-DATA.
(defpackage :lisp-stat-data
  (:documentation "Data I/O, management, other data technologies.")
  (:nicknames :ls-data)
  (:use :common-lisp
        ;;:cxml
        :lisp-stat-config
        :lisp-stat-object-system
        :lisp-stat-types
        :lisp-stat-compound-data
        :lisp-stat-matrix
        :lisp-stat-linalg)
  ;; The object system's versions of these names take precedence over the
  ;; COMMON-LISP ones inside this package.
  (:shadowing-import-from :lisp-stat-object-system
                          #:slot-value #:call-method #:call-next-method)
  (:export #:open-file-dialog #:read-data-file #:read-data-columns #:load-data
           #:load-example #:*variables* #:*ask-on-redefine*
           #:def #:variables #:savevar #:undef))
50 (in-package :lisp-stat-data)
52 ;;; The purpose of this package is to manage data which will be
53 ;;; processed by LispStat. In particular, it will be important to
54 ;;; register variables, datasets, relational structures, and other
55 ;;; objects which could be the target for statistical modeling and
56 ;;; inference.
(defvar *lisp-stat-data-table*
  (make-hash-table :test 'eql)
  "Hash table (EQL test) meant to hold the data items registered with
Lisp-Stat.  It starts out empty.")

(defvar *lisp-stat-data-count*
  0
  "Number of items currently recorded.  Starts at zero.")
65 ;;; Data (storage) Types, dt-{.*}
66 ;;;
67 ;;; Data types are the representation of data from a computer-science
68 ;;; perspective, i.e. what it is that they contain. These types
69 ;;; include particular forms of compound types (i.e. a dataframe is
70 ;;; array-like, but its types differ, and the difference is row-wise,
71 ;;; while an array is a compound of elements of the same type).
72 ;;;
74 ;;Examples:
75 ;; (defun equidimensional (a)
76 ;; (or (< (array-rank a) 2)
77 ;; (apply #'= (array-dimensions a)))) => EQUIDIMENSIONAL
78 ;; (deftype square-matrix (&optional type size)
79 ;; `(and (array ,type (,size ,size))
80 ;; (satisfies equidimensional))) => SQUARE-MATRIX
(defun array-of-equal-dt-scalar-type (x)
  "Placeholder predicate named by the DT-ARRAY type definition.
Intended to return the dt-scalar type that fits X; for now it returns the
symbol INTEGER for any non-NIL X and NIL when X is NIL."
  (and x 'integer))
(defun array-of-equal-dt-scalar-type-within-column (x)
  "Placeholder predicate named by the DT-DATAFRAME type definition.
Intended to return the dt-scalar type that fits X; for now it returns the
symbol INTEGER for any non-NIL X and NIL when X is NIL."
  (and x 'integer))
(deftype dt-scalar (&optional type)
  "Scalar storage type: an integer, a double-float, a complex number or a
symbol.  The optional TYPE argument is accepted but not used yet."
  (declare (ignore type))
  ;; DOUBLE is not a standard type specifier; DOUBLE-FLOAT is the standard
  ;; name.  NOTE(review): assumes no project package defines a DOUBLE type.
  '(or integer double-float complex symbol))

(deftype dt-array (&optional ndim dimlist)
  "Array storage type: any object accepted by the predicate
ARRAY-OF-EQUAL-DT-SCALAR-TYPE.  NDIM and DIMLIST are accepted but not
used yet."
  (declare (ignore ndim dimlist))
  '(satisfies array-of-equal-dt-scalar-type))

(deftype dt-dataframe ()
  "Dataframe storage type: any object accepted by the predicate
ARRAY-OF-EQUAL-DT-SCALAR-TYPE-WITHIN-COLUMN."
  '(satisfies array-of-equal-dt-scalar-type-within-column))
105 ;(deftype dt-relationaldata ()
106 ; `(satisfies (foreach unit in relationalUnit
107 ; (typep unit 'dt-dataframe))))
110 ;;; Statistical Variable Types, sv-{.*}
111 ;;;
112 ;;; Statistical variable types work to represent the statistical
113 ;;; category represented by the variable, i.e. nominal, ordinal,
114 ;;; integral, continuous, ratio. This metadata can be used to hint at
115 ;;; appropriate analysis methods -- or perhaps more critically, to
116 ;;; define how these methods will fail in the final interpretation.
;; NOTE(review): in this copy of the file SV-NOMINAL and SV-ORDINAL have no
;; body and are not closed, which swallows every following form.  They are
;; closed here as explicit stubs that expand to the empty type NIL (what a
;; DEFTYPE with an empty body yields), so no object belongs to them until
;; real definitions are written.  Confirm against the intended design.

(deftype sv-nominal (&optional n)
  "Statistical type for nominal variables.  Stub: expands to the empty
type; the optional N is accepted but not used yet."
  (declare (ignore n))
  nil)

(deftype sv-ordinal (ordering &optional n)
  "Statistical type for ordinal variables.  Stub: expands to the empty
type; ORDERING and the optional N are accepted but not used yet."
  (declare (ignore ordering n))
  nil)

(deftype sv-categorical ()
  "Statistical type for categorical variables: nominal or ordinal."
  ;; SATISFIES only accepts the name of a one-argument predicate, so the
  ;; union is written as a plain OR type.  SV-ORDINAL has a required
  ;; parameter; * is passed to mean an unspecified ordering.
  '(or sv-nominal (sv-ordinal *)))
126 ;;(deftype sv-integer )
127 ;;(deftype sv-real ) ;; precision could be a secondary component of real, rational, complex.
128 ;;(deftype sv-rational )
129 ;;(deftype sv-complex )
130 ;;(deftype sv-continuous (or 'sv-integer 'sv-real 'sv-rational 'sv-complex)) ;; perhaps, call it "mostly contin..."
133 ;;; Data I/O
135 ;; We can read 2 types of data -- those which are pure data, and those
136 ;; which are impure (lisp-enabled, data as program as data thingy's).
(defparameter *lisp-stat-data-formats*
  (list 'csv 'tsv)
  "List of symbols naming data formats: CSV and TSV.")
141 ;; (defgeneric data-read (srce frmt)
142 ;; "read data from stream srce, in format frmt.")
144 ;; (defgeneric data-write (srce frmt)
145 ;; "write data to stream srce, in format frmt.")
147 ;; (defmacro with-data (body)
148 ;; "Stream-handling, maintaining I/O through object typing.")
150 ;; design-wise should these be replaced with a "with-data" form?
152 ;; DSV processing
154 ;; XML processing
156 ;;; Data Management
158 ;; the goal is to have 2 operations which can be used to create new
159 ;; data formats out of old ones.
161 ;; (defgeneric data-subset (ds description)
162 ;; "Take a dataset and make it smaller.")
164 ;; (defgeneric data-relate (ds description)
165 ;; "Take 2 or more datasets, and grow them into a bigger one through
166 ;; relating them (i.e. merge is one example).")
168 ;;; Data tools from "statistics.lsp"
170 ;;;;
171 ;;;; Data File Reading
172 ;;;;
(defun count-file-columns (fname)
  "Args: (fname)
Returns the number of lisp items on the first nonblank line of file FNAME.
Lines that are empty or contain only whitespace are skipped.  Returns NIL
when the file has no nonblank line at all."
  (flet ((blank-line-p (line)
           ;; True for the empty string and for whitespace-only lines.
           (every #'(lambda (ch)
                      (member ch '(#\Space #\Tab #\Return #\Page)))
                  line)))
    (with-open-file (f fname)
      ;; EOF-ERROR-P is NIL so that running out of lines yields NIL
      ;; instead of signalling END-OF-FILE on empty or all-blank files.
      (let ((line (loop for candidate = (read-line f nil nil)
                        while (and candidate (blank-line-p candidate))
                        finally (return candidate))))
        (when line
          (with-input-from-string (s line)
            ;; NOTE(review): READ honours *READ-EVAL*; bind it to NIL in
            ;; the caller when FNAME comes from an untrusted source.
            (let ((eof (gensym)))
              (loop until (eq eof (read s nil eof))
                    count t))))))))
;; Under XLISP only: remember the readtable in effect when this file loads.
#+xlisp (defvar *xlisptable* *readtable*)

;; Provide OPEN-FILE-DIALOG only when nothing else has defined it already.
;; READ-DATA-FILE and READ-DATA-COLUMNS call it as (open-file-dialog t), so
;; the optional argument must be accepted; it is ignored by both fallbacks.
(unless (fboundp 'open-file-dialog)
  #+dialogs
  (defun open-file-dialog (&optional set)
    "Args: (&optional set)
Asks for a data file name with GET-STRING-DIALOG and returns its result.
SET is accepted for compatibility with callers and ignored."
    (declare (ignore set))
    (get-string-dialog "Enter a data file name:"))
  #-dialogs
  (defun open-file-dialog (&optional set)
    "Args: (&optional set)
No dialog support is available: always signals an error asking for an
explicit file name.  SET is accepted for compatibility and ignored."
    (declare (ignore set))
    (error "You must provide a file name explicitly")))
(defun read-data-file (&optional (file (open-file-dialog t)))
  "Args: (file)
Returns a list of all lisp objects read from FILE, in the order they
appear.  Returns NIL when FILE is NIL or holds no objects.  FILE is handed
to WITH-OPEN-FILE as given.
NOTE(review): older documentation said FILE may be a symbol whose print
name is used; no such conversion is visible in this function."
  (when file
    (with-open-file (in file)
      (when in
        ;; A fresh uninterned symbol marks end of file, so no object that
        ;; could appear in the data is mistaken for it.
        (loop with end-marker = (gensym)
              for object = (read in nil end-marker)
              until (eq object end-marker)
              collect object)))))
210 ;;; New definition to avoid stack size limit in apply
(defun read-data-columns (&optional (file (open-file-dialog t))
                                    (cols (if file
                                              (count-file-columns file))))
  "Args: (&optional file cols)
Reads the data in FILE as COLS columns and returns a list of lists
representing the columns.  COLS defaults to the item count reported by
COUNT-FILE-COLUMNS for FILE.  Returns NIL when FILE or COLS is NIL."
  (when (and file cols)
    (let* ((items (read-data-file file))
           ;; SPLIT-LIST and TRANSPOSE are defined outside this file.
           (rows (split-list items cols)))
      (transpose rows))))
220 ;;; FIXME:AJR: ALL THE FOLLOWING NEED TO BE SOLVED BY PLATFORM-INDEP PATHNAME WORK!
221 ;;; FIXME:AJR: use either string or pathname.
(defun path-string-to-path (p s)
  "Returns the pathname obtained by appending the string S to the
namestring of P.  This is plain text concatenation: no directory
separator is inserted between the two parts."
  (let* ((prefix (namestring p))
         (joined (concatenate 'string prefix s)))
    (pathname joined)))
(defun load-data (file)
  "Args: (file) as string
Read in data file from the data examples library.
FILE is looked up under *LISPSTAT-DATA-DIR* first; when no such file
exists there it is loaded from *LISPSTAT-EXAMPLES-DIR* instead.  Returns
the value of the LOAD that succeeded; an error is signalled when the file
is in neither directory."
  ;; LOAD signals an error for a missing file by default, which would
  ;; prevent the fallback from ever running.  With :IF-DOES-NOT-EXIST NIL
  ;; the first attempt returns NIL instead, letting OR try the second.
  (or (load (path-string-to-path *lispstat-data-dir* file)
            :if-does-not-exist nil)
      (load (path-string-to-path *lispstat-examples-dir* file))))
(defun load-example (file)
  "Args: (file) as string
Read in lisp example file from the examples library.
FILE is looked up under *LISPSTAT-EXAMPLES-DIR* first; when no such file
exists there it is loaded from *LISPSTAT-DATA-DIR* instead.  Returns the
value of the LOAD that succeeded; an error is signalled when the file is
in neither directory."
  ;; LOAD signals an error for a missing file by default, which would
  ;; prevent the fallback from ever running.  With :IF-DOES-NOT-EXIST NIL
  ;; the first attempt returns NIL instead, letting OR try the second.
  (or (load (path-string-to-path *lispstat-examples-dir* file)
            :if-does-not-exist nil)
      (load (path-string-to-path *lispstat-data-dir* file))))
240 ;;;;
241 ;;;; Listing and Saving Variables and Functions
242 ;;;;
(defvar *variables* '()
  "List of the symbols registered through DEF.  UNDEF removes entries.")

(defvar *ask-on-redefine* nil
  "When non-NIL, DEF asks for confirmation before giving a new value to a
variable that is already bound.")
(defmacro def (symbol value)
  "Syntax: (def var form)
VAR is not evaluated and must be a symbol.  The value of FORM is assigned
to VAR (with SETF when VAR is already bound, with DEFVAR otherwise), VAR
is added to the list *VARIABLES* of def'ed variables, and VAR is returned.
When *ASK-ON-REDEFINE* is not nil and VAR is already bound, you are asked
whether to redefine it; answering no leaves VAR untouched and returns NIL."
  ;; Proceed when no confirmation is required, when there is nothing to
  ;; overwrite, or when the user agrees; the tests run in that order, so
  ;; the question is only asked when the first two do not settle it.
  `(when (or (not *ask-on-redefine*)
             (not (boundp ',symbol))
             (y-or-n-p "Variable has a value. Redefine?"))
     (cond ((boundp ',symbol)
            (setf ,symbol ,value))
           (t
            (defvar ,symbol ,value)))
     (pushnew ',symbol *variables*)
     ',symbol))
(defun variables-list ()
  "Returns the symbols in *VARIABLES*, ordered by applying SORT-DATA to
their names.  The registered symbols themselves are returned; names are
never re-interned, so the result does not depend on the current package
and no new symbols are created."
  (let ((symbols-by-name (make-hash-table :test #'equal)))
    ;; Index the registered symbols by name.  A name can map to several
    ;; symbols when they live in different packages.
    (dolist (sym *variables*)
      (push sym (gethash (string sym) symbols-by-name)))
    ;; Walk the sorted names and hand back one registered symbol per name.
    (mapcar #'(lambda (name) (pop (gethash name symbols-by-name)))
            (sort-data (mapcar #'string *variables*)))))

(defun variables ()
  "Args:()
Returns a list of all def'ed variables, ordered as by VARIABLES-LIST, or
NIL when no variable has been def'ed."
  (if *variables*
      (variables-list)))
(defun savevar (vars file)
  "Args: (vars file-name-root)
VARS is a symbol or a list of symbols.  FILE-NAME-ROOT names the output
file without the .lsp extension; \".lsp\" is appended to its namestring.
For each variable one DEF form holding its current value is written, so
that the file is suitable for use with the load command.  Values for
which OBJECTP is true are written as the result of sending them :SAVE;
all other values are written quoted.  Returns the list of variables.
NOTE(review): older documentation said FILE-NAME-ROOT may be a symbol
whose print name is used; that depends on what NAMESTRING accepts."
  (with-open-file (out (concatenate 'string (namestring file) ".lsp")
                       :direction :output)
    ;; Normalize a single symbol to a one-element list.
    (let ((symbols (if (consp vars) vars (list vars))))
      (dolist (sym symbols symbols)
        (let ((val (symbol-value sym)))
          (if (objectp val)
              (format out "(def ~s ~s)~%" sym (send val :save))
              (format out "(def ~s '~s)~%" sym val)))))))
288 (defun undef (v)
289 "Args: (v)
290 If V is the symbol of a defined variable the variable it is unbound and
291 removed from the list of defined variables. If V is a list of variable
292 names each is unbound and removed. Returns V."
293 (dolist (s (if (listp v) v (list v)))
294 (when (member s *variables*)
295 (setq *variables* (delete s *variables*))
296 (makunbound s)))