Merge branch 'tonylocal' into mob
[CommonLispStat.git] / data.lisp
blob75821468c5f7e57fa5b267c5023ae11a6b4f4066
1 ;;; -*- mode: lisp -*-
2 ;;; Copyright (c) 2005--2007, by A.J. Rossini <blindglobe@gmail.com>
3 ;;; See COPYRIGHT file for any additional restrictions (BSD license).
4 ;;; Since 1991, ANSI was finally finished. Edited for ANSI Common Lisp.
6 ;;; File: data.lisp
7 ;;; Author: AJ Rossini <blindglobe@gmail.com>
8 ;;; Copyright: (c)2007, AJ Rossini. BSD, LLGPL, or GPLv2, depending on how it arrives.
9 ;;; Purpose: data package for lispstat
10 ;;; Time-stamp: <2006-05-19 12:33:41 rossini>
11 ;;; Creation: <2006-05-17 21:34:07 rossini>
13 ;;; What is this talk of 'release'? Klingons do not make software
14 ;;; 'releases'. Our software 'escapes', leaving a bloody trail of
15 ;;; designers and quality assurance people in its wake.
17 ;;; This organization and structure is new to the 21st Century
18 ;;; version.
20 ;;; conside that dataa has 3 genotypic chracteristrics. The first
21 ;;; would be form -- scalar, vector, array. second would be
22 ;;; datarep type. in particular integer, real, string, symbol. The last
23 ;;; would be statistical type. augmenting datarep type with use in a
24 ;;; statistical context, i.e. that would include nominal, ordinal,
25 ;;; integer, continous, interval (orderable subtypes)
27 (in-package :cl-user)
29 (defpackage :lisp-stat-data
30 (:documentation "Data I/O, management, other data technologies.")
31 (:nicknames :ls-data)
32 (:use :common-lisp
33 ;;:lisp-stat-config
34 ;;:cxml
35 :lisp-stat-object-system
36 :lisp-stat-types
37 :lisp-stat-compound-data
38 :lisp-stat-matrix
39 :lisp-stat-linalg
40 :lisp-stat-sequence)
41 (:shadowing-import-from :lisp-stat-object-system
42 slot-value call-method call-next-method)
43 (:export
44 ;;; from statistics.lsp
45 open-file-dialog read-data-file read-data-columns load-data
46 load-example *variables* *ask-on-redefine* def variables savevar
47 undef split-list ))
49 (in-package :lisp-stat-data)
51 ;; (deftype dataformtype (list scalar vector sequence list array relation))
52 ;; (deftype datareptype (list integer rational real complex string symbol=
53 ;; (deftype stattzpe (list
58 ;;; The purpose of this package is to manage data which will be
59 ;;; processed by LispStat. In particular, it willbe importnat to
60 ;;; register variables, datasets, relational structures, and other
61 ;;; objects which could be the target for statistical modeling and
62 ;;; inference.
64 (defvar *lisp-stat-data-table* (make-hash-table)
65 "Marks up the data the could be used by.")
67 (defvar *lisp-stat-data-count* 0
68 "number of items currently recorded.")
71 ;;; Data Types:
72 ;;;
73 ;;; Data types are the representation of data from a computer-science
74 ;;; perspective, i.e. what it is that they contain. These types
75 ;;; include particular forms of compound types (i.e. dataframe,
76 ;;; relationdata are compounds of arrays of different types where the
77 ;;; difference is row-wise, while array is a compound of elements of
78 ;;; the same type.
80 ;;Examples:
81 ;; (defun equidimensional (a)
82 ;; (or (< (array-rank a) 2)
83 ;; (apply #'= (array-dimensions a)))) => EQUIDIMENSIONAL
84 ;; (deftype square-matrix (&optional type size)
85 ;; `(and (array ,type (,size ,size))
86 ;; (satisfies equidimensional))) => SQUARE-MATRIX
88 (deftype dt-scalar (&optional type)
89 `(or integer double complex))
91 (deftype dt-array (&optional type)
92 `(satisfies array-of-equal-type))
94 (deftype dt-dataframe ()
95 `(satisfies array-of-equal-type-within-column))
97 (deftype dt-relationaldata ()
98 `(satisfies (foreach unit in relationalUnit
99 (typep unit 'dt-dataframe))))
104 ;;; Statistical Variable Classes
107 (deftype sv-nominal (&optional length)
110 (deftype sv-ordinal (ordering &optional length)
113 (deftype sv-categorical ()
114 `(satisfies (or sv-nominal sv-ordinal)))
115 ;;(deftype sv-integer )
116 ;;(deftype sv-real )
117 ;;(deftype sv-rational )
118 ;;(deftype sv-complex )
119 ;;(deftype sv-continuous (or 'sv-integer 'sv-real 'sv-rational 'sv-complex))
124 ;;; Data I/O
126 ;; We can read 2 types of data -- those which are pure data, and those
127 ;; which are imprue (lisp-enables).
129 (defparameter *lisp-stat-data-formats*
130 '(csv tsv))
132 ;; (defgeneric data-read (srce frmt)
133 ;; "read data from stream srce, in format frmt.")
135 ;; (defgeneric data-write (srce frmt)
136 ;; "read data from stream srce, in format frmt.")
138 ;; (defmacro with-data (body)
139 ;; "Stream-handling, maintaining I/O through object typing.")
141 ;; design-wise should these be replaced with a "with-data" form?
144 ;; DSV processing
146 ;; XML processing
148 ;;; Data Management
150 ;; the goal is to have 2 operations which can be used to create new
151 ;; data formats out of old ones.
153 ;; (defgeneric data-subset (ds description)
154 ;; "Take a dataset and make it smaller.")
156 ;; (defgeneric data-relate (ds description)
157 ;; "Take 2 or more datasets, and grow them into a bigger one through
158 ;; relating them (i.e. merge is one example).")
160 ;;; Data tools from "statistics.lsp"
162 ;;;;
163 ;;;; Data File Reading
164 ;;;;
166 (defun count-file-columns (fname)
167 "Args: (fname)
168 Returns the number of lisp items on the first nonblank line of file FNAME."
169 (with-open-file (f fname)
170 (if f
171 (let ((line (do ((line (read-line f) (read-line f)))
172 ((or (null line) (< 0 (length line))) line))))
173 (if line
174 (with-input-from-string (s line)
175 (do ((n 0 (+ n 1)) (eof (gensym)))
176 ((eq eof (read s nil eof)) n))))))))
178 #+xlisp (defvar *xlisptable* *readtable*)
180 (if (not (fboundp 'open-file-dialog))
181 #+dialogs
182 (defun open-file-dialog (&optional set)
183 (get-string-dialog "Enter a data file name:"))
184 #-dialogs
185 (defun open-file-dialog (&optional set)
186 (error "You must provide a file name explicitly")))
188 (defun read-data-file (&optional (file (open-file-dialog t)))
189 "Args: (file)
190 Returns a list of all lisp objects in FILE. FILE can be a string or a symbol,
191 in which case the symbol'f print name is used."
192 (if file
193 (let ((eof (gensym)))
194 (with-open-file (f file)
195 (if f
196 (do* ((r (read f nil eof) (read f nil eof))
197 (x (list nil))
198 (tail x (cdr tail)))
199 ((eq r eof) (cdr x))
200 (setf (cdr tail) (list r))))))))
202 ;;; New definition to avoid stack size limit in apply
203 (defun read-data-columns (&optional (file (open-file-dialog t))
204 (cols (if file
205 (count-file-columns file))))
206 "Args: (&optional file cols)
207 Reads the data in FILE as COLS columns and returns a list of lists representing the columns."
208 (if (and file cols)
209 (transpose (split-list (read-data-file file) cols))))
212 ;;; FIXME:AJR: ALL THE FOLLOWING NEED TO BE SOLVED BY PLATFORM-INDEP PATHNAME WORK!
213 ;;; FIXME:AJR: use either string or pathname.
215 (defun path-string-to-path (p s)
216 (pathname (concatenate 'string (namestring p) s)))
218 (defun load-data (file)
219 "Args: (file) as string
220 Read in data file from the data examples library."
221 (if (load (path-string-to-path *lispstat-data-dir* file))
223 (load (path-string-to-path *lispstat-examples-dir* file))))
225 (defun load-example (file)
226 "Args: (file) as string
227 Read in lisp example file from the examples library."
228 (if (load (path-string-to-path *lispstat-examples-dir* file))
230 (load (path-string-to-path *lispstat-data-dir* file))))
232 ;;;;
233 ;;;; Listing and Saving Variables and Functions
234 ;;;;
236 (defvar *variables* nil)
237 (defvar *ask-on-redefine* nil)
239 (defmacro def (symbol value)
240 "Syntax: (def var form)
241 VAR is not evaluated and must be a symbol. Assigns the value of FORM to
242 VAR and adds VAR to the list *VARIABLES* of def'ed variables. Returns VAR.
243 If VAR is already bound and the global variable *ASK-ON-REDEFINE*
244 is not nil then you are asked if you want to redefine the variable."
245 `(unless (and *ask-on-redefine*
246 (boundp ',symbol)
247 (not (y-or-n-p "Variable has a value. Redefine?")))
248 (pushnew ',symbol *variables*)
249 (setf ,symbol ,value)
250 ',symbol))
252 (defun variables-list ()
253 (mapcar #'intern (sort-data (mapcar #'string *variables*))))
255 (defun variables ()
256 "Args:()
257 Returns a list of the names of all def'ed variables to STREAM"
258 (if *variables*
259 (mapcar #'intern (sort-data (mapcar #'string *variables*)))))
261 (defun savevar (vars file)
262 "Args: (vars file-name-root)
263 VARS is a symbol or a list of symbols. FILE-NAME-ROOT is a string (or a symbol
264 whose print name is used) not endinf in .lsp. The VARS and their current values
265 are written to the file FILE-NAME-ROOT.lsp in a form suitable for use with the
266 load command."
267 (with-open-file (f (concatenate 'string (namestring file) ".lsp")
268 :direction :output)
269 (let ((vars (if (consp vars) vars (list vars))))
270 (flet ((save-one (x)
271 (let ((v (symbol-value x)))
272 (if (objectp v)
273 (format f "(def ~s ~s)~%" x (send v :save))
274 (format f "(def ~s '~s)~%" x v)))))
275 (mapcar #'save-one vars))
276 vars)))
278 (defun undef (v)
279 "Args: (v)
280 If V is the symbol of a defined variable the variable it is unbound and
281 removed from the list of defined variables. If V is a list of variable
282 names each is unbound and removed. Returns V."
283 (dolist (s (if (listp v) v (list v)))
284 (when (member s *variables*)
285 (setq *variables* (delete s *variables*))
286 (makunbound s)))
289 ;;;;
290 ;;;; Miscellaneous Routines
291 ;;;;
293 (defun split-list (x n)
294 "Args: (list cols)
295 Returns a list of COLS lists of equal length of the elements of LIST.
296 Example: (split-list '(1 2 3 4 5 6) 2) returns ((1 2 3) (4 5 6))"
297 (check-one-fixnum n)
298 (if (/= (rem (length x) n) 0) (error "length not divisible by ~a" n))
299 (flet ((next-split ()
300 (let ((result nil)
301 (end nil))
302 (dotimes (i n result)
303 (declare (fixnum i))
304 (let ((c-elem (list (first x))))
305 (cond ((null result)
306 (setf result c-elem)
307 (setf end result))
309 (setf (rest end) c-elem)
310 (setf end (rest end)))))
311 (setf x (rest x))))))
312 (let ((result nil)
313 (end nil)
314 (k (/ (length x) n)))
315 (declare (fixnum k))
316 (dotimes (i k result)
317 (declare (fixnum i))
318 (let ((c-sub (list (next-split))))
319 (cond ((null result)
320 (setf result c-sub)
321 (setf end result))
323 (setf (rest end) c-sub)
324 (setf end (rest end)))))))))