CSV reader task entered
[CommonLispStat.git] / src / data / data.lisp
blob12f9f83a7cce90be29f0805eee0e48ffee09ebaa
1 ;;; -*- mode: lisp -*-
3 ;;; Time-stamp: <2008-11-16 20:24:10 tony>
4 ;;; Creation: <2005-08-xx 21:34:07 rossini>
5 ;;; File: data.lisp
6 ;;; Author: AJ Rossini <blindglobe@gmail.com>
7 ;;; Copyright: (c)2005--2008, AJ Rossini. BSD, LLGPL, or GPLv2, depending
8 ;;; on how it arrives.
9 ;;; Purpose: data package for lispstat
11 ;;; What is this talk of 'release'? Klingons do not make software
12 ;;; 'releases'. Our software 'escapes', leaving a bloody trail of
13 ;;; designers and quality assurance people in its wake.
15 ;;; This organization and structure is new to the 21st Century
16 ;;; version.
18 (in-package :lisp-stat-data)
;;; Consider that data has 3 genotypic characteristics.  The first
;;; would be form -- scalar, vector, array.  The second would be datarep
;;; ("computer science simplistic data" type), in particular integer,
;;; real, string, symbol.  The last would be statistical type
;;; ("usually handled by computer science approaches via metadata"),
;;; augmenting datarep type with use in a statistical context,
;;; i.e. that would include nominal, ordinal, integer, continuous,
;;; interval (orderable subtypes).  Clearly, the statistical type can
;;; be inherited, likewise the numerical type as well.  The form can
;;; be pushed up or simplified as necessary, but this can be
;;; challenging.
32 ;;; The first approach we will take for CLS is to handle this as
33 ;;; lisp-only structures. At the time of realization
34 ;;; (?instantiation?) of an "abstract" model, the data should be
35 ;;; pushed into an appropriate form (either "en masse", or
36 ;;; "on-demand") into a linear algebra framework.
38 ;;; There is some excellent material on this by John Chambers in one
39 ;;; of his earlier books. Reference is being ignored to encourage
40 ;;; people to read them all. With all due respect to John, they've
41 ;;; lasted quite well, but need to be updated.
44 ;;; The purpose of this package is to manage data which will be
45 ;;; processed by LispStat. In particular, it will be important to
46 ;;; register variables, datasets, relational structures, and other
47 ;;; objects which could be the target for statistical modeling and
48 ;;; inference.
50 (defvar *lisp-stat-data-table* (make-hash-table)
51 "Marks up the data the could be used by.")
53 (defvar *lisp-stat-data-count* 0
54 "number of items currently recorded.")
56 ;;; Data (storage) Types, dt-{.*}
57 ;;;
;;; Data types are the representation of data from a computer-science
;;; perspective, i.e. what it is that they contain.  These types
;;; include particular forms of compound types (i.e. dataframe is
;;; array-like, but types differ, difference is row-wise, while array
;;; is a compound of elements of the same type).
63 ;;;
64 ;;; This is completely subject to change.
66 ;;Examples:
67 ;; (defun equidimensional (a)
68 ;; (or (< (array-rank a) 2)
69 ;; (apply #'= (array-dimensions a)))) => EQUIDIMENSIONAL
70 ;; (deftype square-matrix (&optional type size)
71 ;; `(and (array ,type (,size ,size))
72 ;; (satisfies equidimensional))) => SQUARE-MATRIX
74 (defun array-of-equal-dt-scalar-type (x)
75 ;; return dt-scalar-type which fits (more precise that works)
76 (if x
77 'integer
78 nil))
80 (defun array-of-equal-dt-scalar-type-within-column (x)
81 ;; return dt-scalar-type which fits (more precise that works)
82 (if x
83 'integer
84 nil))
88 (deftype dt-scalar (&optional type)
89 `(or integer double complex symbol))
91 (deftype dt-array (&optional ndim dimlist)
92 `(satisfies array-of-equal-dt-scalar-type))
94 (deftype dt-dataframe (&optional )
95 `(satisfies array-of-equal-dt-scalar-type-within-column))
97 ;(deftype dt-relationaldata ()
98 ; `(satisfies (foreach unit in relationalUnit
99 ; (typep unit 'dt-dataframe))))
102 ;;; Statistical Variable Types, sv-{.*}
103 ;;;
;;; Statistical variable types work to represent the statistical
;;; category represented by the variable, i.e. nominal, ordinal,
;;; integral, continuous, ratio.  This metadata can be used to hint at
;;; appropriate analysis methods -- or perhaps more critically, to
;;; define how these methods will fail in the final interpretation.
110 (deftype sv-nominal (&optional n)
113 (deftype sv-ordinal (ordering &optional n)
116 (deftype sv-categorical ()
117 `(satisfies (or sv-nominal sv-ordinal)))
118 ;;(deftype sv-integer )
119 ;;(deftype sv-real ) ;; precision could be a secondary component of real, rational, complex.
120 ;;(deftype sv-rational )
121 ;;(deftype sv-complex )
122 ;;(deftype sv-continuous (or 'sv-integer 'sv-real 'sv-rational 'sv-complex)) ;; perhaps, call it "mostly contin..."
125 ;;; Data I/O
127 ;; We can read 2 types of data -- those which are pure data, and those
128 ;; which are impure (lisp-enabled, data as program as data thingy's).
130 (defparameter *lisp-stat-data-formats*
131 '(csv tsv))
133 ;; (defgeneric data-read (srce frmt)
134 ;; "read data from stream srce, in format frmt.")
;; (defgeneric data-write (srce frmt)
;; "write data to stream srce, in format frmt.")
139 ;; (defmacro with-data (body)
140 ;; "Stream-handling, maintaining I/O through object typing.")
142 ;; design-wise should these be replaced with a "with-data" form?
144 ;;; These need to be elsewhere...!
145 ;; DSV processing
146 ;; XML processing
148 ;;; DM operations should be somewhere else as well.
149 ;;; Data Management
151 ;; the goal is to have 2 operations which can be used to create new
152 ;; data formats out of old ones.
154 ;; (defgeneric data-subset (ds description)
155 ;; "Take a dataset and make it smaller.")
157 ;; (defgeneric data-relate (ds description)
158 ;; "Take 2 or more datasets, and grow them into a bigger one through
159 ;; relating them (i.e. merge is one example).")
161 ;;; What should we be able to do?
163 ;;; Actions on a single dataset
164 ;;* subset-dataset original-set
165 ;; :list-of-columns :list-of-rows :list-of-rows-and-columns
166 ;; :list-of-indices
167 ;;* resample-dataset original-set ;
168 ;; :by-rows :by-columns :row-weights :column-weights
169 ;; :new-number-of-columns :new-number-of-rows
172 ;;; Actions based on 2 or more datasets
173 ;;* concat-dataset set1 set2 ; no matching
174 ;; :by-row :by-column :kronecker-product
175 ;;* merge-dataset set1 set2
176 ;; :match-on-column :match-on-row
179 ;;; Data tools from "statistics.lsp"
181 ;;;;
182 ;;;; Data File Reading
183 ;;;;
185 (defun count-file-columns (fname)
186 "Args: (fname)
187 Returns the number of lisp items on the first nonblank line of file FNAME."
188 (with-open-file (f fname)
189 (if f
190 (let ((line (do ((line (read-line f) (read-line f)))
191 ((or (null line) (< 0 (length line))) line))))
192 (if line
193 (with-input-from-string (s line)
194 (do ((n 0 (+ n 1)) (eof (gensym)))
195 ((eq eof (read s nil eof)) n))))))))
197 #+xlisp (defvar *xlisptable* *readtable*)
199 (if (not (fboundp 'open-file-dialog))
200 #+dialogs
201 (defun open-file-dialog () ;; why?(&optional set)
202 (get-string-dialog "Enter a data file name:"))
203 #-dialogs
204 (defun open-file-dialog () ;; why? (&optional set)
205 (error "You must provide a file name explicitly")))
207 (defun read-data-file (&optional (file (open-file-dialog t)))
208 "Args: (file)
209 Returns a list of all lisp objects in FILE. FILE can be a string or a symbol,
210 in which case the symbol'f print name is used."
211 (if file
212 (let ((eof (gensym)))
213 (with-open-file (f file)
214 (if f
215 (do* ((r (read f nil eof) (read f nil eof))
216 (x (list nil))
217 (tail x (cdr tail)))
218 ((eq r eof) (cdr x))
219 (setf (cdr tail) (list r))))))))
221 ;;; New definition to avoid stack size limit in apply
222 (defun read-data-columns (&optional (file (open-file-dialog t))
223 (cols (if file
224 (count-file-columns file))))
225 "Args: (&optional file cols)
226 Reads the data in FILE as COLS columns and returns a list of lists representing the columns."
227 (if (and file cols)
228 (transpose (split-list (read-data-file file) cols))))
231 ;;; FIXME:AJR: ALL THE FOLLOWING NEED TO BE SOLVED BY PLATFORM-INDEP PATHNAME WORK!
232 ;;; FIXME:AJR: use either string or pathname.
234 (defun path-string-to-path (p s)
235 (pathname (concatenate 'string (namestring p) s)))
237 (defun load-data (file)
238 "Args: (file) as string
239 Read in data file from the data examples library."
240 (if (load (path-string-to-path *lispstat-data-dir* file))
242 (load (path-string-to-path *lispstat-data-dir* file))))
244 (defun load-example (file)
245 "Args: (file) as string
246 Read in lisp example file from the examples library."
247 (if (load (path-string-to-path *lispstat-example-dir* file))
249 (load (path-string-to-path *lispstat-example-dir* file))))
251 ;;;;
252 ;;;; Listing and Saving Variables and Functions
253 ;;;;
255 (defvar *variables* nil)
256 (defvar *ask-on-redefine* nil)
258 (defmacro def (symbol value)
259 "Syntax: (def var form)
260 VAR is not evaluated and must be a symbol. Assigns the value of FORM to
261 VAR and adds VAR to the list *VARIABLES* of def'ed variables. Returns VAR.
262 If VAR is already bound and the global variable *ASK-ON-REDEFINE*
263 is not nil then you are asked if you want to redefine the variable."
264 `(unless (and *ask-on-redefine*
265 (boundp ',symbol)
266 (not (y-or-n-p "Variable has a value. Redefine?")))
267 (if (boundp ',symbol)
268 (setf ,symbol ,value)
269 (defvar ,symbol ,value))
270 (pushnew ',symbol *variables*)
271 ',symbol))
273 (defun variables-list ()
274 (mapcar #'intern (sort-data (mapcar #'string *variables*))))
276 (defun variables ()
277 "Args:()
278 Returns a list of the names of all def'ed variables to STREAM"
279 (if *variables*
280 (mapcar #'intern (sort-data (mapcar #'string *variables*)))))
282 (defun savevar (vars file)
283 "Args: (vars file-name-root)
284 VARS is a symbol or a list of symbols. FILE-NAME-ROOT is a string (or a symbol
285 whose print name is used) not endinf in .lsp. The VARS and their current values
286 are written to the file FILE-NAME-ROOT.lsp in a form suitable for use with the
287 load command."
288 (with-open-file (f (concatenate 'string (namestring file) ".lsp")
289 :direction :output)
290 (let ((vars (if (consp vars) vars (list vars))))
291 (flet ((save-one (x)
292 (let ((v (symbol-value x)))
293 (if (objectp v)
294 (format f "(def ~s ~s)~%" x (send v :save))
295 (format f "(def ~s '~s)~%" x v)))))
296 (mapcar #'save-one vars))
297 vars)))
299 (defun undef (v)
300 "Args: (v)
301 If V is the symbol of a defined variable the variable it is unbound and
302 removed from the list of defined variables. If V is a list of variable
303 names each is unbound and removed. Returns V."
304 (dolist (s (if (listp v) v (list v)))
305 (when (member s *variables*)
306 (setq *variables* (delete s *variables*))
307 (makunbound s)))