encoding-fehler resignalisieren
[closure-common.git] / xstream.lisp
blobea5049ea81c3b9234ebeec62646ce4908057056a
1 ;;; -*- Mode: Lisp; Syntax: Common-Lisp; readtable: runes; Encoding: utf-8; -*-
2 ;;; ---------------------------------------------------------------------------
3 ;;; Title: Fast streams
4 ;;; Created: 1999-07-17
5 ;;; Author: Gilbert Baumann <unk6@rz.uni-karlsruhe.de>
6 ;;; License: LGPL (See file COPYING for details).
7 ;;; ---------------------------------------------------------------------------
8 ;;; © copyright 1999 by Gilbert Baumann
10 ;;; This library is free software; you can redistribute it and/or
11 ;;; modify it under the terms of the GNU Library General Public
12 ;;; License as published by the Free Software Foundation; either
13 ;;; version 2 of the License, or (at your option) any later version.
14 ;;;
15 ;;; This library is distributed in the hope that it will be useful,
16 ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 ;;; Library General Public License for more details.
19 ;;;
20 ;;; You should have received a copy of the GNU Library General Public
21 ;;; License along with this library; if not, write to the
22 ;;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 ;;; Boston, MA 02111-1307 USA.
25 (in-package :runes)
27 ;;; API
28 ;;
29 ;; MAKE-XSTREAM cl-stream &key name! speed initial-speed initial-encoding
30 ;; [function]
31 ;; MAKE-ROD-XSTREAM rod &key name [function]
32 ;; CLOSE-XSTREAM xstream [function]
33 ;; XSTREAM-P object [function]
35 ;; READ-RUNE xstream [macro]
36 ;; PEEK-RUNE xstream [macro]
37 ;; FREAD-RUNE xstream [function]
38 ;; FPEEK-RUNE xstream [function]
39 ;; CONSUME-RUNE xstream [macro]
40 ;; UNREAD-RUNE rune xstream [function]
42 ;; XSTREAM-NAME xstream [accessor]
43 ;; XSTREAM-POSITION xstream [function]
44 ;; XSTREAM-LINE-NUMBER xstream [function]
45 ;; XSTREAM-COLUMN-NUMBER xstream [function]
46 ;; XSTREAM-PLIST xstream [accessor]
47 ;; XSTREAM-ENCODING xstream [accessor] <-- be careful here. [*]
48 ;; SET-TO-FULL-SPEED xstream [function]
50 ;; [*] switching the encoding on the fly is only possible when the
51 ;; stream's buffer is empty; therefore to be able to switch the
52 ;; encoding, while some runes are already read, set the stream's speed
53 ;; to 1 initially (via the initial-speed argument for MAKE-XSTREAM)
54 ;; and later set it to full speed. (The encoding of the runes
55 ;; sequence, you fetch off with READ-RUNE is always UTF-16 though).
56 ;; After switching the encoding, SET-TO-FULL-SPEED can be used to bump the
57 ;; speed up to a full buffer length.
59 ;; An encoding is simply something, which provides the DECODE-SEQUENCE
60 ;; method.
62 ;;; Controller protocol
64 ;; READ-OCTECTS sequence os-stream start end -> first-non-written
65 ;; XSTREAM/CLOSE os-stream
68 (eval-when (:compile-toplevel :load-toplevel :execute)
69 (defparameter *fast* '(optimize (speed 3) (safety 0))))
71 ;; Let us first define fast fixnum arithmetric get rid of type
72 ;; checks. (After all we know what we do here).
74 (defmacro fx-op (op &rest xs)
75 `(the fixnum (,op ,@(mapcar (lambda (x) `(the fixnum ,x)) xs))))
76 (defmacro fx-pred (op &rest xs)
77 `(,op ,@(mapcar (lambda (x) `(the fixnum ,x)) xs)))
79 (defmacro %+ (&rest xs) `(fx-op + ,@xs))
80 (defmacro %= (&rest xs) `(fx-pred = ,@xs))
82 (deftype buffer-index ()
83 `(unsigned-byte ,(integer-length array-total-size-limit)))
85 (deftype buffer-byte ()
86 `(unsigned-byte 16))
88 (deftype octet ()
89 `(unsigned-byte 8))
91 ;; The usage of a special marker for EOF is experimental and
92 ;; considered unhygenic.
94 (defconstant +end+ #xFFFF
95 "Special marker inserted into stream buffers to indicate end of buffered data.")
97 (defvar +null-buffer+ (make-array 0 :element-type 'buffer-byte))
98 (defvar +null-octet-buffer+ (make-array 0 :element-type 'octet))
100 (defstruct (xstream
101 (:constructor make-xstream/low)
102 (:copier nil)
103 (:print-function print-xstream))
105 ;;; Read buffer
107 ;; the buffer itself
108 (buffer +null-buffer+
109 :type (simple-array buffer-byte (*)))
110 ;; points to the next element of `buffer' containing the next rune
111 ;; about to be read.
112 (read-ptr 0 :type buffer-index)
113 ;; points to the first element of `buffer' not containing a rune to
114 ;; be read.
115 (fill-ptr 0 :type buffer-index)
117 ;;; OS buffer
119 ;; a scratch pad for READ-SEQUENCE
120 (os-buffer +null-octet-buffer+
121 :type (simple-array octet (*)))
123 ;; `os-left-start', `os-left-end' designate a region of os-buffer,
124 ;; which still contains some undecoded data. This is needed because
125 ;; of the DECODE-SEQUENCE protocol
126 (os-left-start 0 :type buffer-index)
127 (os-left-end 0 :type buffer-index)
129 ;; How much to read each time
130 (speed 0 :type buffer-index)
132 ;; Some stream object obeying to a certain protcol
133 os-stream
135 ;; The external format
136 ;; (some object offering the ENCODING protocol)
137 (encoding :utf-8)
139 ;;A STREAM-NAME object
140 (name nil)
142 ;; a plist a struct keeps the hack away
143 (plist nil)
145 ;; Stream Position
146 (line-number 1 :type integer) ;current line number
147 (line-start 0 :type integer) ;stream position the current line starts at
148 (buffer-start 0 :type integer) ;stream position the current buffer starts at
150 ;; There is no need to maintain a column counter for each character
151 ;; read, since we can easily compute it from `line-start' and
152 ;; `buffer-start'.
155 (defun print-xstream (self sink depth)
156 (declare (ignore depth))
157 (format sink "#<~S ~S>" (type-of self) (xstream-name self)))
159 (defmacro read-rune (input)
160 "Read a single rune off the xstream `input'. In case of end of file :EOF
161 is returned."
162 `((lambda (input)
163 (declare (type xstream input)
164 #.*fast*)
165 (let ((rp (xstream-read-ptr input)))
166 (declare (type buffer-index rp))
167 (let ((ch (aref (the (simple-array buffer-byte (*)) (xstream-buffer input))
168 rp)))
169 (declare (type buffer-byte ch))
170 (setf (xstream-read-ptr input) (%+ rp 1))
171 (cond ((%= ch +end+)
172 (the (or (member :eof) rune)
173 (xstream-underflow input)))
174 ((%= ch #x000A) ;line break
175 (account-for-line-break input)
176 (code-rune ch))
178 (code-rune ch))))))
179 ,input))
181 (defmacro peek-rune (input)
182 "Peek a single rune off the xstream `input'. In case of end of file :EOF
183 is returned."
184 `((lambda (input)
185 (declare (type xstream input)
186 #.*fast*)
187 (let ((rp (xstream-read-ptr input)))
188 (declare (type buffer-index rp))
189 (let ((ch (aref (the (simple-array buffer-byte (*)) (xstream-buffer input))
190 rp)))
191 (declare (type buffer-byte ch))
192 (cond ((%= ch +end+)
193 (prog1
194 (the (or (member :eof) rune) (xstream-underflow input))
195 (setf (xstream-read-ptr input) 0)))
197 (code-rune ch))))))
198 ,input))
200 (defmacro consume-rune (input)
201 "Like READ-RUNE, but does not actually return the read rune."
202 `((lambda (input)
203 (declare (type xstream input)
204 #.*fast*)
205 (let ((rp (xstream-read-ptr input)))
206 (declare (type buffer-index rp))
207 (let ((ch (aref (the (simple-array buffer-byte (*)) (xstream-buffer input))
208 rp)))
209 (declare (type buffer-byte ch))
210 (setf (xstream-read-ptr input) (%+ rp 1))
211 (when (%= ch +end+)
212 (xstream-underflow input))
213 (when (%= ch #x000A) ;line break
214 (account-for-line-break input) )))
215 nil)
216 ,input))
218 (definline unread-rune (rune input)
219 "Unread the last recently read rune; if there wasn't such a rune, you
220 deserve to lose."
221 (declare (ignore rune))
222 (decf (xstream-read-ptr input))
223 (when (rune= (peek-rune input) #/u+000A) ;was it a line break?
224 (unaccount-for-line-break input)))
226 (defun fread-rune (input)
227 (read-rune input))
229 (defun fpeek-rune (input)
230 (peek-rune input))
232 ;;; Line counting
234 (defun account-for-line-break (input)
235 (declare (type xstream input))
236 (incf (xstream-line-number input))
237 (setf (xstream-line-start input)
238 (+ (xstream-buffer-start input) (xstream-read-ptr input))))
240 (defun unaccount-for-line-break (input)
241 ;; incomplete!
242 ;; We better use a traditional lookahead technique or forbid unread-rune.
243 (decf (xstream-line-number input)))
245 ;; User API:
247 (defun xstream-position (input)
248 (+ (xstream-buffer-start input) (xstream-read-ptr input)))
250 ;; xstream-line-number is structure accessor
252 (defun xstream-column-number (input)
253 (+ (- (xstream-position input)
254 (xstream-line-start input))
257 ;;; Underflow
259 ;;(defun read-runes (sequence input))
261 (defun xstream-underflow (input)
262 (declare (type xstream input))
263 ;; we are about to fill new data into the buffer, so we need to
264 ;; adjust buffer-start.
265 (incf (xstream-buffer-start input)
266 (- (xstream-fill-ptr input) 0))
267 (let (n m)
268 ;; when there is something left in the os-buffer, we move it to
269 ;; the start of the buffer.
270 (setf m (- (xstream-os-left-end input) (xstream-os-left-start input)))
271 (unless (zerop m)
272 (replace (xstream-os-buffer input) (xstream-os-buffer input)
273 :start1 0 :end1 m
274 :start2 (xstream-os-left-start input)
275 :end2 (xstream-os-left-end input))
276 ;; then we take care that the buffer is large enough to carry at
277 ;; least 100 bytes (a random number)
279 ;; david: was heisst da random? ich nehme an, dass 100 einfach
280 ;; ausreichend sein soll, um die laengste utf-8 bytesequenz oder die
281 ;; beiden utf-16 surrogates zu halten? dann ist 100 ja wohl dicke
282 ;; ausreichend und koennte in make-xstream ordentlich geprueft werden.
283 ;; oder was geht hier vor?
284 (unless (>= (length (xstream-os-buffer input)) 100)
285 (error "You lost")))
286 (setf n
287 (read-octets (xstream-os-buffer input) (xstream-os-stream input)
288 m (min (1- (length (xstream-os-buffer input)))
289 (+ m (xstream-speed input)))))
290 (cond ((%= n 0)
291 (setf (xstream-read-ptr input) 0
292 (xstream-fill-ptr input) n)
293 (setf (aref (xstream-buffer input) (xstream-fill-ptr input)) +end+)
294 :eof)
296 (multiple-value-bind (fnw fnr)
297 (runes-encoding:decode-sequence
298 (xstream-encoding input)
299 (xstream-os-buffer input) 0 n
300 (xstream-buffer input) 0 (1- (length (xstream-buffer input)))
301 (= n m))
302 (setf (xstream-os-left-start input) fnr
303 (xstream-os-left-end input) n
304 (xstream-read-ptr input) 0
305 (xstream-fill-ptr input) fnw)
306 (setf (aref (xstream-buffer input) (xstream-fill-ptr input)) +end+)
307 (read-rune input))))))
309 ;;; constructor
311 (defun make-xstream (os-stream &key name
312 (speed 8192)
313 (initial-speed 1)
314 (initial-encoding :guess))
315 ;; XXX if initial-speed isn't 1, encoding will me munged up
316 (assert (eql initial-speed 1))
317 (multiple-value-bind (encoding preread)
318 (if (eq initial-encoding :guess)
319 (figure-encoding os-stream)
320 (values initial-encoding nil))
321 (let ((osbuf (make-array speed :element-type '(unsigned-byte 8))))
322 (replace osbuf preread)
323 (make-xstream/low
324 :buffer (let ((r (make-array speed :element-type 'buffer-byte)))
325 (setf (elt r 0) #xFFFF)
327 :read-ptr 0
328 :fill-ptr 0
329 :os-buffer osbuf
330 :speed initial-speed
331 :os-stream os-stream
332 :os-left-start 0
333 :os-left-end (length preread)
334 :encoding encoding
335 :name name))))
337 (defun make-rod-xstream (string &key name)
338 ;; XXX encoding is mis-handled by this kind of stream
339 (let ((n (length string)))
340 (let ((buffer (make-array (1+ n) :element-type 'buffer-byte)))
341 (declare (type (simple-array buffer-byte (*)) buffer))
342 ;; copy the rod
343 (do ((i (1- n) (- i 1)))
344 ((< i 0))
345 (declare (type fixnum i))
346 (setf (aref buffer i) (rune-code (%rune string i))))
347 (setf (aref buffer n) +end+)
349 (make-xstream/low :buffer buffer
350 :read-ptr 0
351 :fill-ptr n
352 ;; :os-buffer nil
353 :speed 1
354 :os-stream nil
355 :name name))))
357 (defmethod figure-encoding ((stream null))
358 (values :utf-8 nil))
360 (defmethod figure-encoding ((stream stream))
361 (let ((c0 (read-byte stream nil :eof)))
362 (cond ((eq c0 :eof)
363 (values :utf-8 nil))
365 (let ((c1 (read-byte stream nil :eof)))
366 (cond ((eq c1 :eof)
367 (values :utf-8 (list c0)))
369 (cond ((and (= c0 #xFE) (= c1 #xFF)) (values :utf-16-big-endian nil))
370 ((and (= c0 #xFF) (= c1 #xFE)) (values :utf-16-little-endian nil))
372 (values :utf-8 (list c0 c1)))))))))))
374 ;;; misc
376 (defun close-xstream (input)
377 (xstream/close (xstream-os-stream input)))
379 (defun set-to-full-speed (xstream)
380 (setf (xstream-speed xstream) (length (xstream-os-buffer xstream))))
382 ;;; controller implementations
384 (defmethod read-octets (sequence (stream stream) start end)
385 (#+CLISP ext:read-byte-sequence
386 #-CLISP read-sequence
387 sequence stream :start start :end end))
389 #+cmu
390 (defmethod read-octets :around (sequence (stream stream) start end)
391 ;; CMUCL <= 19a on non-SunOS accidentally triggers EFAULT in read(2)
392 ;; if SEQUENCE has been write protected by GC. Workaround: Touch all pages
393 ;; in SEQUENCE and make sure no GC happens between that and the read(2).
394 (ext::without-gcing
395 (loop for i from start below end
396 do (setf (elt sequence i) (elt sequence i)))
397 (call-next-method)))
399 (defmethod read-octets (sequence (stream null) start end)
400 (declare (ignore sequence start end))
403 (defmethod xstream/close ((stream stream))
404 (close stream))
406 (defmethod xstream/close ((stream null))
407 nil)