From 0e0ce061c54b3eea544c8f583c85f59b54b809a2 Mon Sep 17 00:00:00 2001 From: AJ Rossini Date: Wed, 7 Jan 2009 17:44:06 +0100 Subject: [PATCH] added DSV reading code from cybertiggyr --- lispstat.asd | 11 +- src/data/dsv-cybertiggyr.lisp | 357 +++++++++++++++++++++++++++++++++++++++++ src/data/test-cybertiggyr.lisp | 165 +++++++++++++++++++ 3 files changed, 532 insertions(+), 1 deletion(-) create mode 100644 src/data/dsv-cybertiggyr.lisp create mode 100644 src/data/test-cybertiggyr.lisp diff --git a/lispstat.asd b/lispstat.asd index 35cbbd1..e75ed50 100644 --- a/lispstat.asd +++ b/lispstat.asd @@ -1,5 +1,5 @@ ;; -*- mode: lisp -*- -;;; Time-stamp: <2008-12-19 08:47:22 tony> +;;; Time-stamp: <2009-01-07 17:42:26 tony> ;;; Created: <2005-05-30 17:09:47 blindglobe> ;;; File: lispstat.asd ;;; Author: AJ Rossini @@ -164,6 +164,14 @@ |# )) + ;; reading in DSV files for data access + (:module + "csv-data" + :pathname "src/data/" + :components + ((:file "test-cybertiggyr") + (:file "dsv-cybertiggyr" :depends-on ( "test-cybertiggyr")))) + ;; prototype and CLOS approaches. (:module "stat-data" @@ -188,6 +196,7 @@ ((:lispstat-lsp-source-file "lsbasics"))) + (:module "descriptives" :pathname "src/describe/" diff --git a/src/data/dsv-cybertiggyr.lisp b/src/data/dsv-cybertiggyr.lisp new file mode 100644 index 0000000..5e432ff --- /dev/null +++ b/src/data/dsv-cybertiggyr.lisp @@ -0,0 +1,357 @@ +;;; +;;; $Header: /home/gene/library/website/docsrc/dsv/RCS/dsv.lisp,v 395.1 2008/04/20 17:25:46 gene Exp $ +;;; +;;; Copyright (c) 2005 Gene Michael Stover. All rights reserved. +;;; +;;; This program is free software; you can redistribute it and/or modify +;;; it under the terms of the GNU Lesser General Public License as +;;; published by the Free Software Foundation; either version 2 of the +;;; License, or (at your option) any later version. +;;; +;;; This program is distributed in the hope that it will be useful, +;;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;;; GNU Lesser General Public License for more details. +;;; +;;; You should have received a copy of the GNU Lesser General Public +;;; License along with this program; if not, write to the Free Software +;;; Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 +;;; USA +;;; + +(defpackage "CYBERTIGGYR-DSV" + (:use "COMMON-LISP") + (:import-from "CYBERTIGGYR-TEST" "DEFTEST") + (:export "*END-OF-RECORD*" + "*ESCAPE*" + "*FIELD-SEPARATOR*" + "DO-ESCAPED" + "LOAD-ESCAPED" + "READ-ESCAPED")) + +(in-package "CYBERTIGGYR-DSV") + +;;; +;;; UNEXPORTED HELPER FUNCTIONS & STOOF +;;; + +(defun xpeek (strm) + "Return the next character without consuming it, or return STRM on +end-of-input or other error." + (peek-char nil strm nil strm)) + +(defun consume-leading-crap (strm crap) + "Read (consume) newlines until the next character is not a newline or +there is no next character (end-of-input, which isn't an error)." + (loop while (eql (xpeek strm) crap) do (read-char strm)) + 'consume-leading-crap) + +(defun read-escaped-field (strm terminators escape) + "Return the next field as a string. Return STRM if there is no next +field, which is when the stream is already at its end. Assumes caller +has already consumed white-space crap that might preceed the field. +Consumes the character which ends the field. TERMINATORS is a list +of characters & the stream which could terminate the field." + (if (eq (xpeek strm) strm) + strm ; already at end-of-input + ;; else, Consume & collect characters until we find a terminator (field + ;; terminator, record terminator, or end-of-input). Do not collect + ;; the terminator. + (coerce + (loop until (member (xpeek strm) terminators) + collect (if (eql (xpeek strm) escape) + ;; It's an escape, so discard it & use the next + ;; character, verbatim. + (progn (read-char strm) (read-char strm)) + ;; else, Use this character. + (read-char strm))) + 'string))) + +;;; +;;; +;;; + +(defvar *field-separator* #\: + "The default field separator character. It defaults to colon (:).") + +(defvar *end-of-record* #\Newline + "The end-of-record character. Defaults to Newline.") + +(defvar *escape* #\\ + "The default escape character for unix-style DSV files. It uses a single +escape character to allow the field separator character to occur +within fields. The escape character can be used to allow an end-of-line +character or an escape character to occur in fields, too. +Defaults to backslash (\\). You can change it with SETQ. If you do not +want to allow separator characters at all, bind it to NIL.") + +(defun read-escaped (strm &key (field-separator *field-separator*) + (end-of-record *end-of-record*) + (escape *escape*)) + "Read (consume) & return the next DSV record from STRM. The record +will be a list of fields. The fields will be strings. Field separator +& end-of-record characters may not occur within fields unless escaped. +If you don't want to allow any kind of escape, use NIL for the escape +character. Since NIL is not a character, it will never be equal to a +character read from STRM, so there will be no possible escape character. +In fact, you could use any non-character to disable the escape +character. Ignors empty lines. On end-of-input, returns STRM. It is +an error if an escape character is followed by end-of-input." + (consume-leading-crap strm end-of-record) + (if (eq (xpeek strm) strm) + strm ; normal end-of-input + ;; else, Let's collect fields until we have read an entire record. + (prog1 + (loop until (member (xpeek strm) (list strm end-of-record)) + collect (prog1 + (read-escaped-field strm + (list strm field-separator + end-of-record) + escape) + (when (eql (xpeek strm) field-separator) + ;; Consume the character which ended the field. + ;; Notice that we do not consume end-of-record + ;; characters. + (read-char strm)))) + (consume-leading-crap strm end-of-record)))) + +(defun load-escaped (pathname &key (field-separator *field-separator*) + (end-of-record *end-of-record*) + (escape *escape*) + (filter #'identity) + (trace nil)) + "Return the entire contents of an escaped DSV file as a list of +records. Each record is a list." + (with-open-file (strm pathname :direction :input) + (labels ((is-good (x) (funcall filter x)) + (xread () (read-escaped strm :field-separator field-separator + :end-of-record end-of-record + :escape escape))) + (do ((lst () (if (is-good x) (cons x lst) lst)) + (x (xread) (xread)) + (i 0 (1+ i))) + ((eq x strm) lst) + (when (and trace (zerop (mod i 1000))) + (format trace "~&~A: [~D] ~S" 'load-escaped i x)))))) + +;;; +;;; todo: new new new. Document me!!! +;;; +(defmacro do-escaped ((var pathname) &body body) + (let ((strm (gensym))) + `(with-open-file (,strm ,pathname :element-type 'character + :direction :input :if-does-not-exist :error) + (loop for ,var = (read-escaped ,strm) + while (not (eq ,var ,strm)) + do (progn ,@body)) + (truename ,pathname)))) + +;;; +;;; TESTS +;;; + +(deftest test0000 () + "Null test. Always succeeds." + 'test0000) + +(deftest test0010 () + "Test that XPEEK returns the correct character from a stream, does +not consume the character. The character is NOT the last in the stream." + (with-input-from-string (strm "abc") + (and (eql (xpeek strm) #\a) + (eql (read-char strm) #\a)))) + +(deftest test0011 () + "Like TEST0011 except that it tests XPEEK on the last character in the +stream. In other words, tests that XPEEK returns the correct value & +does not consume it, & that character is the last in the stream." + (with-input-from-string (strm "c") + (and (eql (xpeek strm) #\c) + (eql (read-char strm) #\c)))) + +(deftest test0012 () + "Test XPEEK on an empty stream." + (with-input-from-string (strm "") + (and (eq (xpeek strm) strm) + (eq (read-char strm nil strm) strm)))) + +(deftest test0015 () + "Test CONSUME-LEADING-CRAP on a stream that contains nothing but leading +crap." + (with-input-from-string (strm (format nil "~%~%~%")) + (and (eql (xpeek strm) #\Newline) ; not at end + (consume-leading-crap strm #\Newline) ; doesn't matter what it returns + (eq (read-char strm nil strm) strm)))) ; now we're at end + +(deftest test0016 () + "Test CONSUME-LEADING-CRAP on a streeam that starts with leading crap, +then has some non-crap." + (with-input-from-string (strm (format nil "~%~%~%a")) + (and (eql (xpeek strm) #\Newline) ; not at end + (consume-leading-crap strm #\Newline) ; doesn't matter what it returns + (eql (read-char strm) #\a)))) + +(deftest test0017 () + "Test CONSUME-LEADING-CRAP on a stream that starts with non-crap, then +has some crap. CONSUME-LEADING-CRAP should not consume the leading +non-crap." + (with-input-from-string (strm (format nil "a~%")) + (and (eql (xpeek strm) #\a) ; not at end + (consume-leading-crap strm #\Newline) ; doesn't matter what it returns + (eql (read-char strm) #\a)))) ; the "a" char should remain + +(deftest test0020 () + "Test READ-ESCAPED-FIELD on a stream that contains a single field +followed by end-of-input. Uses the default field separator, end-of-record +character, & escape character. Just test that the field is read, not that +the next READ-ESCAPED-FIELD indicates end-of-input." + (with-input-from-string (strm "abc") + (equal (read-escaped-field strm + (list strm *field-separator* *end-of-record*) + *escape*) + "abc"))) + +(deftest test0021 () + "Like TEST0020, but also checks that another call to READ-ESCAPED-FIELD +indicates end-of-input by returning STRM." + (with-input-from-string (strm "abc") + (let* ((a (read-escaped-field strm + (list strm *field-separator* *end-of-record*) + *escape*)) + (b (read-escaped-field strm + (list strm *field-separator* *end-of-record*) + *escape*))) + (unless (equal a "abc") + (format t "~&~A: First read should have returned" 'test0021) + (format t " ~S, but it returned ~S" "abc" a)) + (unless (eq b strm) + (format t "~&~A: Second read should have returned" 'test0021) + (format t " ~S, but it returned ~S" strm b)) + (and (equal a "abc") (eq b strm))))) + +(deftest test0025 () + "Test that READ-ESCAPED-FIELD works on two consecutive fields." + (let ((a "abc") (b "xyz")) + (with-input-from-string (strm (format nil "~A~A~A" a *field-separator* b)) + (let* ((terminators (list strm *field-separator* *end-of-record*)) + (xa (read-escaped-field strm terminators *escape*)) + (xseparator (read-char strm)) + (xb (read-escaped-field strm terminators *escape*)) + (xstrm (xpeek strm))) + (and (equal xa a) (eql xseparator *field-separator*) (equal xb b) + (eq xstrm strm)))))) + +(deftest test0026 () + "Test that READ-ESCAPED-FIELD works on two records of two fields each. +The second record does not end with an end-of-record character. It +ends with end-of-input on the stream." + (let* ((a "abc") (b "123") ; first record + (c "def") (d "456") ; second record + (string (format nil "~A~A~A~A~A~A~A" a *field-separator* b + *end-of-record* c *field-separator* d))) + (with-input-from-string (strm string) + (let* ((terminators (list strm *field-separator* *end-of-record*)) + (xa (read-escaped-field strm terminators *escape*)) + (xseparator0 (read-char strm)) + (xb (read-escaped-field strm terminators *escape*)) + (xend-of-record0 (read-char strm)) + (xc (read-escaped-field strm terminators *escape*)) + (xseparator1 (read-char strm)) + (xd (read-escaped-field strm terminators *escape*)) + (xstrm (xpeek strm))) + (and (equal xa a) + (eql xseparator0 *field-separator*) + (equal xb b) + (eql xend-of-record0 *end-of-record*) + (equal xc c) + (eql xseparator1 *field-separator*) + (equal xd d) + (eq xstrm strm)))))) + +(deftest test0027 () + "Like TEST0026 except that the second record ends with an end-of- +record character." + (let* ((a "abc") (b "123") ; first record + (c "def") (d "456") ; second record + (string (format nil "~A~A~A~A~A~A~A~A" a *field-separator* b + *end-of-record* c *field-separator* d + *end-of-record*))) + (with-input-from-string (strm string) + (let* ((terminators (list strm *field-separator* *end-of-record*)) + (xa (read-escaped-field strm terminators *escape*)) + (xseparator0 (read-char strm)) + (xb (read-escaped-field strm terminators *escape*)) + (xend-of-record0 (read-char strm)) + (xc (read-escaped-field strm terminators *escape*)) + (xseparator1 (read-char strm)) + (xd (read-escaped-field strm terminators *escape*)) + (xend-of-record1 (read-char strm)) + (xstrm (xpeek strm))) + (and (equal xa a) + (eql xseparator0 *field-separator*) + (equal xb b) + (eql xend-of-record0 *end-of-record*) + (equal xc c) + (eql xseparator1 *field-separator*) + (equal xd d) + (eql xend-of-record1 *end-of-record*) + (eq xstrm strm)))))) + +(deftest test0050 () + "Test READ-ESCAPED on an input stream containing a single record of a +single field." + (let* ((record (list "abc")) + (string (format nil "~A" (first record)))) + (with-input-from-string (strm string) + (let* ((xrecord (read-escaped strm)) + (xstrm (xpeek strm))) + (and (equal xrecord record) + (eq xstrm strm)))))) + +(deftest test0051 () + "Test READ-ESCAPED on an input stream containing a single record of two +fields." + (let* ((record (list "abc" "123")) + (string (format nil "~A~A~A" (first record) *field-separator* + (second record)))) + (with-input-from-string (strm string) + (let* ((xrecord (read-escaped strm)) + (xstrm (xpeek strm))) + (and (equal xrecord record) + (eq xstrm strm)))))) + +(deftest test0052 () + "Test READ-ESCAPED. After reading the single record of two fields, +the stream should be at its end. The record is followed by several +end-of-record characters, & the stream should be at its end after +reading the record because no records follow the record terminators." + (let* ((record (list "abc" "123")) + (string (format nil "~A~A~A~A~A~A" (first record) *field-separator* + (second record) *end-of-record* *end-of-record* + *end-of-record*))) + (with-input-from-string (strm string) + (let* ((xrecord (read-escaped strm)) + (xstrm (xpeek strm))) + (and (equal xrecord record) + (eq xstrm strm)))))) + +(deftest test0053 () + "Test READ-ESCAPED on an input of two, two-field records. The second +record is followed by one end-of-record character." + (let ((record0 '("aaa" "111")) + (record1 '("bbb" "222")) + (string (format nil "aaa~A111~Abbb~A222~A" + *field-separator* *end-of-record* + *field-separator* *end-of-record*))) + (with-input-from-string (strm string) + (let* ((xrecord0 (read-escaped strm)) + (xrecord1 (read-escaped strm))) + (unless (equal xrecord0 record0) + (format t "~&First record is ~S. Expected ~S." xrecord0 record0)) + (unless (equal xrecord1 record1) + (format t "~&Second record is ~S. Expected~S." xrecord1 record1)) + (and (equal xrecord0 record0) + (equal xrecord1 record1)))))) + +;;; --- end of file --- + diff --git a/src/data/test-cybertiggyr.lisp b/src/data/test-cybertiggyr.lisp new file mode 100644 index 0000000..345acbc --- /dev/null +++ b/src/data/test-cybertiggyr.lisp @@ -0,0 +1,165 @@ +;;; +;;; $Header: /home/gene/library/website/docsrc/lut/RCS/test.lisp,v 395.1 2008/04/20 17:25:47 gene Exp $ +;;; +;;; Copyright (c) 2005 Gene Michael Stover. All rights reserved. +;;; +;;; This program is free software; you can redistribute it and/or modify +;;; it under the terms of the GNU Lesser General Public License as +;;; published by the Free Software Foundation; either version 2 of the +;;; License, or (at your option) any later version. +;;; +;;; This program is distributed in the hope that it will be useful, +;;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;;; GNU Lesser General Public License for more details. +;;; +;;; You should have received a copy of the GNU Lesser General Public +;;; License along with this program; if not, write to the Free Software +;;; Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 +;;; USA +;;; + +(defpackage "CYBERTIGGYR-TEST" + (:use "COMMON-LISP") + (:export "*EXCLUDED-PACKAGES*" + "*PREFIX*" + "CHECK" + "DEFTEST" + "DISPOSITION" + "IS-A-UNIT-TEST" + "NOT-A-UNIT-TEST" + "RATE" + "RATETABLE" + "RUN" + "TEST-FUNCTION-P" + "TEST-FUNCTIONS")) + +(in-package "CYBERTIGGYR-TEST") + +;;; +;;; unexported helper functions & stoof +;;; + +(defun symbol-name-starts-with (symbol starts-with) + "Return true if & only if the name of the symbol begins with +the string bound to STARTS-WITH." + (let ((len (length starts-with))) + (and (>= (length (symbol-name symbol)) len) + (equal (subseq (symbol-name symbol) 0 len) starts-with)))) + +(defun symbol-bigname (symbol) + "Return, as a string, the package name of the symbol & the name +of the symbol." + (format nil "~A::~A" (package-name (symbol-package symbol)) symbol)) + +(defun make-failed-test-p (max strm) + "Return a predicate which runs a test & tells whether it failed. +The predicate also prints a status to the character output stream +STRM." + (let ((i 0)) + #'(lambda (test) + ;; Show which test we're about to run & what percentage + ;; of the test suit has been run. + (format strm "~&~3D% ~A =>" (round (* (/ (incf i) max) 100)) + (symbol-bigname test)) + (finish-output strm) + (let ((is-good (funcall test))) ; run the test + ;; Show the test's result. + (format strm " ~A" (if is-good "good" "FAILED")) + (not is-good))))) ; compliment the result + +;;; +;;; You could alter these values to fine-tune the behaviour of +;;; TEST-FUNCTION-P. Adding packages to *EXCLUDED-PACKAGES* is +;;; safe, but altering *PREFIX* could be trouble. +;;; + +(defvar *prefix* "TEST" "String prefix of test function names.") + +(defvar *excluded-packages* + (remove (find-package "COMMON-LISP-USER") (list-all-packages)) + "Packages whose functions are not eligible to be test functions. +Defaults to the packages that were loaded before this package, less +COMMON-LISP-USER.") + +(defun test-function-p (symbol) + "Return true if & only if SYMBOL is bound to a test function." + (and (fboundp symbol) + (not (eq (get symbol 'disposition) 'not-a-unit-test)) + (not (member (symbol-package symbol) *excluded-packages*)) + (or (eq (get symbol 'disposition) 'is-a-unit-test) + (symbol-name-starts-with symbol *prefix*)))) +(setf (get 'test-function-p 'disposition) 'not-a-unit-test) + +(defun test-functions () + "Return a list of symbols bound to test functions in any package." + (let ((lst ())) + (do-all-symbols (symbol) + (when (test-function-p symbol) (push symbol lst))) + (remove-duplicates (sort lst #'string-lessp :key #'symbol-bigname)))) + +(setf (get 'test-functions 'disposition) 'not-a-unit-test) + +(defun run (&optional (strm *standard-output*)) + "Run all unit tests. Print results to STRM. Return true if & only +if all tests pass." + (null + (find-if + ;; Search for a test function which fails... + (make-failed-test-p (length (test-functions)) strm) + ;; ...from the suite of test functions. + (test-functions)))) + +(defmacro deftest (name &rest body) + "Declare a unit test function. For now, maps to DEFUN, but could +be implemented differently in the future." + (if (symbol-name-starts-with name *prefix*) + `(defun ,name ,@body) + ;; else, We'll need to set DISPOSITION + `(progn (setf (get ',name 'cybertiggyr-test:disposition) + 'cybertiggyr-test:is-a-unit-test) + (defun ,name ,@body)))) + +(defun rate (fn) + "Run function FN at least 3 times & at least 3 seconds. +Return triple whose FIRST is calls/second, SECOND is number +of calls, & THIRD is number of seconds. All three numbers +will be positive. They may be integers, ratios, or floating- +point, depending on details of the lisp system. Time are +measured with GET-INTERNAL-REAL-TIME, but they are reported in +seconds." + (declare (type function fn)) + (do ((start-time (get-internal-real-time)) + (seconds 0 (/ (- (get-internal-real-time) start-time) + internal-time-units-per-second)) + (count 0 (1+ count))) + ((and (>= count 3) (>= seconds 3)) + (list (/ count seconds) count seconds)) + (funcall fn))) + +(defun ratetable (names-and-fns strm) + "Run RATE on a bunch of functios & return a LaTeX table in a +string which shows the results of all of them. Each element +in NAMES-AND-FNS is a list whose FIRST is the name of the function +in a string & whose SECOND is a function of no arguments whose +performance is to be tested." + (format strm "\\begin{tabular}{|r|r|r|r|} \\hline") + (format strm "~%{\\bf function} & {\\bf count} &") + (format strm " {\\bf seconds} & {\\bf rate}") + (format strm " \\\\ \\hline") + (dolist (lst names-and-fns) + (destructuring-bind (rate count seconds) (rate (second lst)) + (format strm "~%~A & ~D & ~,2E & ~,2E \\\\ \\hline" + (first lst) count seconds rate))) + (format strm "~%\\end{tabular}") + strm) + +(defmacro check (expression) + `(if ,expression + t + ;; else + (progn + (format t "~&Failure: ~S" ',expression) + nil))) + +;;; --- end of file --- -- 2.11.4.GIT