Allow non-simple strings in MAKE-ROD-XSTREAM, for the benefit of Drakma.
[closure-common.git] / encodings.lisp
blob5788adbf868cce4a63424e57b7b32c2463347a2f
1 (in-package :runes-encoding)
3 (define-condition encoding-error (simple-error) ())
5 (defun xerror (fmt &rest args)
6 (error 'encoding-error :format-control fmt :format-arguments args))
8 ;;;; ---------------------------------------------------------------------------
9 ;;;; Encoding names
10 ;;;;
12 (defvar *names* (make-hash-table :test #'eq))
14 (defun canon-name (string)
15 (with-output-to-string (bag)
16 (map nil (lambda (ch)
17 (cond ((char= ch #\_) (write-char #\- bag))
18 (t (write-char (char-upcase ch) bag))))
19 string)))
21 (defun canon-name-2 (string)
22 (with-output-to-string (bag)
23 (map nil (lambda (ch)
24 (cond ((char= ch #\_))
25 ((char= ch #\-))
26 (t (write-char (char-upcase ch) bag))))
27 string)))
29 (defmethod encoding-names ((encoding symbol))
30 (gethash encoding *names*))
32 (defmethod (setf encoding-names) (new-value (encoding symbol))
33 (setf (gethash encoding *names*) new-value))
35 (defun add-name (encoding name)
36 (pushnew (canon-name name) (encoding-names encoding) :test #'string=))
38 (defun resolve-name (string)
39 (cond ((symbolp string)
40 string)
42 (setq string (canon-name string))
43 (or
44 (block nil
45 (maphash (lambda (x y)
46 (when (member string y :test #'string=)
47 (return x)))
48 *names*)
49 nil)
50 (block nil
51 (maphash (lambda (x y)
52 (when (member string y
53 :test #'(lambda (x y)
54 (string= (canon-name-2 x)
55 (canon-name-2 y))))
56 (return x)))
57 *names*)
58 nil)))))
60 ;;;; ---------------------------------------------------------------------------
61 ;;;; Encodings
62 ;;;;
64 (defvar *encodings* (make-hash-table :test #'eq))
66 (defmacro define-encoding (name init-form)
67 `(progn
68 (setf (gethash ',name *encodings*)
69 (list nil (lambda () ,init-form)))
70 ',name))
72 (defun find-encoding (name)
73 (let ((x (gethash (resolve-name name) *encodings*)))
74 (and x
75 (or (first x)
76 (setf (first x) (funcall (second x)))))))
78 (defclass encoding () ())
80 (defclass simple-8-bit-encoding (encoding)
81 ((table :initarg :table)))
83 (defun make-simple-8-bit-encoding (&key charset)
84 (make-instance 'simple-8-bit-encoding
85 :table (coerce (to-unicode-table charset) '(simple-array (unsigned-byte 16) (256)))))
87 ;;;;;;;
89 (defmacro fx-op (op &rest xs)
90 `(the fixnum (,op ,@(mapcar (lambda (x) `(the fixnum ,x)) xs))))
91 (defmacro fx-pred (op &rest xs)
92 `(,op ,@(mapcar (lambda (x) `(the fixnum ,x)) xs)))
94 (defmacro %+ (&rest xs) `(fx-op + ,@xs))
95 (defmacro %- (&rest xs) `(fx-op - ,@xs))
96 (defmacro %* (&rest xs) `(fx-op * ,@xs))
97 (defmacro %/ (&rest xs) `(fx-op floor ,@xs))
98 (defmacro %and (&rest xs) `(fx-op logand ,@xs))
99 (defmacro %ior (&rest xs) `(fx-op logior ,@xs))
100 (defmacro %xor (&rest xs) `(fx-op logxor ,@xs))
101 (defmacro %ash (&rest xs) `(fx-op ash ,@xs))
102 (defmacro %mod (&rest xs) `(fx-op mod ,@xs))
104 (defmacro %= (&rest xs) `(fx-pred = ,@xs))
105 (defmacro %<= (&rest xs) `(fx-pred <= ,@xs))
106 (defmacro %>= (&rest xs) `(fx-pred >= ,@xs))
107 (defmacro %< (&rest xs) `(fx-pred < ,@xs))
108 (defmacro %> (&rest xs) `(fx-pred > ,@xs))
110 ;;; Decoders
112 ;; The decoders share a common signature:
114 ;; DECODE input input-start input-end
115 ;; output output-start output-end
116 ;; eof-p
117 ;; -> first-not-written ; first-not-read
119 ;; These decode functions should decode as much characters off `input'
120 ;; into the `output' as possible and return the indexes to the first
121 ;; not read and first not written element of `input' and `output'
122 ;; respectively. If there are not enough bytes in `input' to decode a
123 ;; full character, decoding shold be abandomed; the caller has to
124 ;; ensure that the remaining bytes of `input' are passed to the
125 ;; decoder again with more bytes appended.
127 ;; `eof-p' now in turn indicates, if the given input sequence, is all
128 ;; the producer does have and might be used to produce error messages
129 ;; in case of incomplete codes or decided what to do.
131 ;; Decoders are expected to handle the various CR/NL conventions and
132 ;; canonicalize each end of line into a single NL rune (#xA) in good
133 ;; old Lisp tradition.
136 ;; TODO: change this to an encoding class, which then might carry
137 ;; additional state. Stateless encodings could been represented by
138 ;; keywords. e.g.
140 ;; defmethod DECODE-SEQUENCE ((encoding (eql :utf-8)) ...)
143 (defmethod decode-sequence ((encoding (eql :utf-16-big-endian))
144 in in-start in-end out out-start out-end eof?)
145 ;; -> new wptr, new rptr
146 (let ((wptr out-start)
147 (rptr in-start))
148 (loop
149 (when (%= wptr out-end)
150 (return))
151 (when (>= (%+ rptr 1) in-end)
152 (return))
153 (let ((hi (aref in rptr))
154 (lo (aref in (%+ 1 rptr))))
155 (setf rptr (%+ 2 rptr))
156 ;; FIXME: Wenn wir hier ein Surrogate sehen, muessen wir das naechste
157 ;; Zeichen abwarten und nachgucken, dass nicht etwa die andere
158 ;; Haelfte fehlt!
159 (let ((x (logior (ash hi 8) lo)))
160 (when (or (eql x #xFFFE) (eql x #xFFFF))
161 (xerror "not a valid code point: #x~X" x))
162 (setf (aref out wptr) x))
163 (setf wptr (%+ 1 wptr))))
164 (values wptr rptr)))
166 (defmethod decode-sequence ((encoding (eql :utf-16-little-endian))
167 in in-start in-end out out-start out-end eof?)
168 ;; -> new wptr, new rptr
169 (let ((wptr out-start)
170 (rptr in-start))
171 (loop
172 (when (%= wptr out-end)
173 (return))
174 (when (>= (%+ rptr 1) in-end)
175 (return))
176 (let ((lo (aref in (%+ 0 rptr)))
177 (hi (aref in (%+ 1 rptr))))
178 (setf rptr (%+ 2 rptr))
179 ;; FIXME: Wenn wir hier ein Surrogate sehen, muessen wir das naechste
180 ;; Zeichen abwarten und nachgucken, dass nicht etwa die andere
181 ;; Haelfte fehlt!
182 (let ((x (logior (ash hi 8) lo)))
183 (when (or (eql x #xFFFE) (eql x #xFFFF))
184 (xerror "not a valid code point: #x~X" x))
185 (setf (aref out wptr) x))
186 (setf wptr (%+ 1 wptr))))
187 (values wptr rptr)))
189 (defmethod decode-sequence ((encoding (eql :utf-8))
190 in in-start in-end out out-start out-end eof?)
191 (declare (optimize (speed 3) (safety 0))
192 (type (simple-array (unsigned-byte 8) (*)) in)
193 (type (simple-array (unsigned-byte 16) (*)) out)
194 (type fixnum in-start in-end out-start out-end))
195 (let ((wptr out-start)
196 (rptr in-start)
197 byte0)
198 (macrolet ((put (x)
199 `((lambda (x)
200 (when (or (<= #xD800 x #xDBFF)
201 (<= #xDC00 x #xDFFF))
202 (xerror "surrogate encoded in UTF-8: #x~X." x))
203 (cond ((or (%> x #x10FFFF)
204 (eql x #xFFFE)
205 (eql x #xFFFF))
206 (xerror "not a valid code point: #x~X" x))
207 ((%> x #xFFFF)
208 (setf (aref out (%+ 0 wptr)) (%+ #xD7C0 (ash x -10))
209 (aref out (%+ 1 wptr)) (%ior #xDC00 (%and x #x3FF)))
210 (setf wptr (%+ wptr 2)))
212 (setf (aref out wptr) x)
213 (setf wptr (%+ wptr 1)))))
214 ,x))
215 (put1 (x)
216 `(progn
217 (setf (aref out wptr) ,x)
218 (setf wptr (%+ wptr 1)))))
219 (loop
220 (when (%= (+ wptr 1) out-end) (return))
221 (when (%>= rptr in-end) (return))
222 (setq byte0 (aref in rptr))
223 (cond ((= byte0 #x0D)
224 ;; CR handling
225 ;; we need to know the following character
226 (cond ((>= (%+ rptr 1) in-end)
227 ;; no characters in buffer
228 (cond (eof?
229 ;; at EOF, pass it as NL
230 (put #x0A)
231 (setf rptr (%+ rptr 1)))
233 ;; demand more characters
234 (return))))
235 ((= (aref in (%+ rptr 1)) #x0A)
236 ;; we see CR NL, so forget this CR and the next NL will be
237 ;; inserted literally
238 (setf rptr (%+ rptr 1)))
240 ;; singleton CR, pass it as NL
241 (put #x0A)
242 (setf rptr (%+ rptr 1)))))
244 ((%<= #|#b00000000|# byte0 #b01111111)
245 (put1 byte0)
246 (setf rptr (%+ rptr 1)))
248 ((%<= #|#b10000000|# byte0 #b10111111)
249 (xerror "Corrupted UTF-8 input (initial byte was #b~8,'0B)" byte0)
250 (setf rptr (%+ rptr 1)))
252 ((%<= #|#b11000000|# byte0 #b11011111)
253 (cond ((<= (%+ rptr 2) in-end)
254 (put
255 (dpb (ldb (byte 5 0) byte0) (byte 5 6)
256 (dpb (ldb (byte 6 0) (aref in (%+ rptr 1))) (byte 6 0)
257 0)))
258 (setf rptr (%+ rptr 2)))
260 (return))))
262 ((%<= #|#b11100000|# byte0 #b11101111)
263 (cond ((<= (%+ rptr 3) in-end)
264 (put
265 (dpb (ldb (byte 4 0) byte0) (byte 4 12)
266 (dpb (ldb (byte 6 0) (aref in (%+ 1 rptr))) (byte 6 6)
267 (dpb (ldb (byte 6 0) (aref in (%+ 2 rptr))) (byte 6 0)
268 0))))
269 (setf rptr (%+ rptr 3)))
271 (return))))
273 ((%<= #|#b11110000|# byte0 #b11110111)
274 (cond ((<= (%+ rptr 4) in-end)
275 (put
276 (dpb (ldb (byte 3 0) byte0) (byte 3 18)
277 (dpb (ldb (byte 6 0) (aref in (%+ 1 rptr))) (byte 6 12)
278 (dpb (ldb (byte 6 0) (aref in (%+ 2 rptr))) (byte 6 6)
279 (dpb (ldb (byte 6 0) (aref in (%+ 3 rptr))) (byte 6 0)
280 0)))))
281 (setf rptr (%+ rptr 4)))
283 (return))))
285 ((%<= #|#b11111000|# byte0 #b11111011)
286 (cond ((<= (%+ rptr 5) in-end)
287 (put
288 (dpb (ldb (byte 2 0) byte0) (byte 2 24)
289 (dpb (ldb (byte 6 0) (aref in (%+ 1 rptr))) (byte 6 18)
290 (dpb (ldb (byte 6 0) (aref in (%+ 2 rptr))) (byte 6 12)
291 (dpb (ldb (byte 6 0) (aref in (%+ 3 rptr))) (byte 6 6)
292 (dpb (ldb (byte 6 0) (aref in (%+ 4 rptr))) (byte 6 0)
293 0))))))
294 (setf rptr (%+ rptr 5)))
296 (return))))
298 ((%<= #|#b11111100|# byte0 #b11111101)
299 (cond ((<= (%+ rptr 6) in-end)
300 (put
301 (dpb (ldb (byte 1 0) byte0) (byte 1 30)
302 (dpb (ldb (byte 6 0) (aref in (%+ 1 rptr))) (byte 6 24)
303 (dpb (ldb (byte 6 0) (aref in (%+ 2 rptr))) (byte 6 18)
304 (dpb (ldb (byte 6 0) (aref in (%+ 3 rptr))) (byte 6 12)
305 (dpb (ldb (byte 6 0) (aref in (%+ 4 rptr))) (byte 6 6)
306 (dpb (ldb (byte 6 0) (aref in (%+ 5 rptr))) (byte 6 0)
307 0)))))))
308 (setf rptr (%+ rptr 6)))
310 (return))))
313 (xerror "Corrupted UTF-8 input (initial byte was #b~8,'0B)" byte0)) ) ))
314 (values wptr rptr)) )
316 (defmethod encoding-p ((object (eql :utf-16-little-endian))) t)
317 (defmethod encoding-p ((object (eql :utf-16-big-endian))) t)
318 (defmethod encoding-p ((object (eql :utf-8))) t)
320 (defmethod encoding-p ((object encoding)) t)
322 (defmethod decode-sequence ((encoding simple-8-bit-encoding)
323 in in-start in-end
324 out out-start out-end
325 eof?)
326 (declare (optimize (speed 3) (safety 0))
327 (type (simple-array (unsigned-byte 8) (*)) in)
328 (type (simple-array (unsigned-byte 16) (*)) out)
329 (type fixnum in-start in-end out-start out-end))
330 (let ((wptr out-start)
331 (rptr in-start)
332 (byte 0)
333 (table (slot-value encoding 'table)))
334 (declare (type fixnum wptr rptr)
335 (type (unsigned-byte 8) byte)
336 (type (simple-array (unsigned-byte 16) (*)) table))
337 (loop
338 (when (%= wptr out-end) (return))
339 (when (%>= rptr in-end) (return))
340 (setq byte (aref in rptr))
341 (cond ((= byte #x0D)
342 ;; CR handling
343 ;; we need to know the following character
344 (cond ((>= (%+ rptr 1) in-end)
345 ;; no characters in buffer
346 (cond (eof?
347 ;; at EOF, pass it as NL
348 (setf (aref out wptr) #x0A)
349 (setf wptr (%+ wptr 1))
350 (setf rptr (%+ rptr 1)))
352 ;; demand more characters
353 (return))))
354 ((= (aref in (%+ rptr 1)) #x0A)
355 ;; we see CR NL, so forget this CR and the next NL will be
356 ;; inserted literally
357 (setf rptr (%+ rptr 1)))
359 ;; singleton CR, pass it as NL
360 (setf (aref out wptr) #x0A)
361 (setf wptr (%+ wptr 1))
362 (setf rptr (%+ rptr 1)))))
365 (setf (aref out wptr) (aref table byte))
366 (setf wptr (%+ wptr 1))
367 (setf rptr (%+ rptr 1))) ))
368 (values wptr rptr)))
370 ;;;; ---------------------------------------------------------------------------
371 ;;;; Character sets
372 ;;;;
374 (defvar *charsets* (make-hash-table :test #'eq))
376 (defclass 8-bit-charset ()
377 ((name :initarg :name)
378 (to-unicode-table
379 :initarg :to-unicode-table
380 :reader to-unicode-table)))
382 (defmacro define-8-bit-charset (name &rest codes)
383 (assert (= 256 (length codes)))
384 `(progn
385 (setf (gethash ',name *charsets*)
386 (make-instance '8-bit-charset
387 :name ',name
388 :to-unicode-table
389 ',(make-array 256
390 :element-type '(unsigned-byte 16)
391 :initial-contents codes)))
392 ',name))
394 (defun find-charset (name)
395 (or (gethash name *charsets*)
396 (xerror "There is no character set named ~S." name)))