Fix asd for cmucl with unicode
[closure-common.git] / encodings.lisp
blobf81fed6bc9cffd900386cf8a14b3ec1e1217f075
1 (in-package :runes-encoding)
3 (eval-when (:compile-toplevel :load-toplevel :execute)
4 (defparameter +buffer-byte+
5 #+rune-is-utf-16 '(unsigned-byte 16)
6 #-rune-is-utf-16 '(unsigned-byte 32)))
8 (define-condition encoding-error (simple-error) ())
10 (defun xerror (fmt &rest args)
11 (error 'encoding-error :format-control fmt :format-arguments args))
13 ;;;; ---------------------------------------------------------------------------
14 ;;;; Encoding names
15 ;;;;
17 (defvar *names* (make-hash-table :test #'eq))
19 (defun canon-name (string)
20 (with-output-to-string (bag)
21 (map nil (lambda (ch)
22 (cond ((char= ch #\_) (write-char #\- bag))
23 (t (write-char (char-upcase ch) bag))))
24 string)))
26 (defun canon-name-2 (string)
27 (with-output-to-string (bag)
28 (map nil (lambda (ch)
29 (cond ((char= ch #\_))
30 ((char= ch #\-))
31 (t (write-char (char-upcase ch) bag))))
32 string)))
34 (defmethod encoding-names ((encoding symbol))
35 (gethash encoding *names*))
37 (defmethod (setf encoding-names) (new-value (encoding symbol))
38 (setf (gethash encoding *names*) new-value))
40 (defun add-name (encoding name)
41 (pushnew (canon-name name) (encoding-names encoding) :test #'string=))
43 (defun resolve-name (string)
44 (cond ((symbolp string)
45 string)
47 (setq string (canon-name string))
48 (or
49 (block nil
50 (maphash (lambda (x y)
51 (when (member string y :test #'string=)
52 (return x)))
53 *names*)
54 nil)
55 (block nil
56 (maphash (lambda (x y)
57 (when (member string y
58 :test #'(lambda (x y)
59 (string= (canon-name-2 x)
60 (canon-name-2 y))))
61 (return x)))
62 *names*)
63 nil)))))
65 ;;;; ---------------------------------------------------------------------------
66 ;;;; Encodings
67 ;;;;
69 (defvar *encodings* (make-hash-table :test #'eq))
71 (defmacro define-encoding (name init-form)
72 `(progn
73 (setf (gethash ',name *encodings*)
74 (list nil (lambda () ,init-form)))
75 ',name))
77 (defun find-encoding (name)
78 (let ((x (gethash (resolve-name name) *encodings*)))
79 (and x
80 (or (first x)
81 (setf (first x) (funcall (second x)))))))
83 (defclass encoding () ())
85 (defclass simple-8-bit-encoding (encoding)
86 ((table :initarg :table)))
88 (defun make-simple-8-bit-encoding (&key charset)
89 (make-instance 'simple-8-bit-encoding
90 :table (coerce (to-unicode-table charset) '(simple-array #.+buffer-byte+ (256)))))
92 ;;;;;;;
94 (defmacro fx-op (op &rest xs)
95 `(the fixnum (,op ,@(mapcar (lambda (x) `(the fixnum ,x)) xs))))
96 (defmacro fx-pred (op &rest xs)
97 `(,op ,@(mapcar (lambda (x) `(the fixnum ,x)) xs)))
99 (defmacro %+ (&rest xs) `(fx-op + ,@xs))
100 (defmacro %- (&rest xs) `(fx-op - ,@xs))
101 (defmacro %* (&rest xs) `(fx-op * ,@xs))
102 (defmacro %/ (&rest xs) `(fx-op floor ,@xs))
103 (defmacro %and (&rest xs) `(fx-op logand ,@xs))
104 (defmacro %ior (&rest xs) `(fx-op logior ,@xs))
105 (defmacro %xor (&rest xs) `(fx-op logxor ,@xs))
106 (defmacro %ash (&rest xs) `(fx-op ash ,@xs))
107 (defmacro %mod (&rest xs) `(fx-op mod ,@xs))
109 (defmacro %= (&rest xs) `(fx-pred = ,@xs))
110 (defmacro %<= (&rest xs) `(fx-pred <= ,@xs))
111 (defmacro %>= (&rest xs) `(fx-pred >= ,@xs))
112 (defmacro %< (&rest xs) `(fx-pred < ,@xs))
113 (defmacro %> (&rest xs) `(fx-pred > ,@xs))
115 ;;; Decoders
117 ;; The decoders share a common signature:
119 ;; DECODE input input-start input-end
120 ;; output output-start output-end
121 ;; eof-p
122 ;; -> first-not-written ; first-not-read
124 ;; These decode functions should decode as much characters off `input'
125 ;; into the `output' as possible and return the indexes to the first
126 ;; not read and first not written element of `input' and `output'
127 ;; respectively. If there are not enough bytes in `input' to decode a
128 ;; full character, decoding shold be abandomed; the caller has to
129 ;; ensure that the remaining bytes of `input' are passed to the
130 ;; decoder again with more bytes appended.
132 ;; `eof-p' now in turn indicates, if the given input sequence, is all
133 ;; the producer does have and might be used to produce error messages
134 ;; in case of incomplete codes or decided what to do.
136 ;; Decoders are expected to handle the various CR/NL conventions and
137 ;; canonicalize each end of line into a single NL rune (#xA) in good
138 ;; old Lisp tradition.
141 ;; TODO: change this to an encoding class, which then might carry
142 ;; additional state. Stateless encodings could been represented by
143 ;; keywords. e.g.
145 ;; defmethod DECODE-SEQUENCE ((encoding (eql :utf-8)) ...)
148 (defmethod decode-sequence ((encoding (eql :utf-16-big-endian))
149 in in-start in-end out out-start out-end eof?)
150 ;; -> new wptr, new rptr
151 (let ((wptr out-start)
152 (rptr in-start))
153 (loop
154 (when (%= wptr out-end)
155 (return))
156 (when (>= (%+ rptr 1) in-end)
157 (return))
158 (let* ((hi (aref in rptr))
159 (lo (aref in (%+ 1 rptr)))
160 (x (logior (ash hi 8) lo)))
161 (when (or (eql x #xFFFE) (eql x #xFFFF))
162 (xerror "not a valid code point: #x~X" x))
163 (when (<= #xDC00 x #xDFFF)
164 (xerror "unexpected high surrogate: #x~X" x))
165 (when (<= #xD800 x #xDBFF)
166 ;; seen low surrogate, look for high surrogate now
167 (when (>= (%+ rptr 3) in-end)
168 (return))
169 (let* ((hi2 (aref in (%+ 2 rptr)))
170 (lo2 (aref in (%+ 3 rptr)))
171 (y (logior (ash hi2 8) lo2)))
172 (unless (<= #xDC00 x #xDFFF)
173 (xerror "expected a high surrogate but found: #x~X" x))
174 #-rune-is-utf-16
175 (progn
176 (setf x (logior (ash (%- x #xd7c0) 10) (%and y #x3FF)))
177 (setf rptr (%+ 2 rptr))))
178 ;; end of surrogate handling
180 (setf (aref out wptr) x)
181 (setf rptr (%+ 2 rptr))
182 (setf wptr (%+ 1 wptr))))
183 (values wptr rptr)))
185 (defmethod decode-sequence ((encoding (eql :utf-16-little-endian))
186 in in-start in-end out out-start out-end eof?)
187 ;; -> new wptr, new rptr
188 (let ((wptr out-start)
189 (rptr in-start))
190 (loop
191 (when (%= wptr out-end)
192 (return))
193 (when (>= (%+ rptr 1) in-end)
194 (return))
195 (let* ((lo (aref in rptr))
196 (hi (aref in (%+ 1 rptr)))
197 (x (logior (ash hi 8) lo)))
198 (when (or (eql x #xFFFE) (eql x #xFFFF))
199 (xerror "not a valid code point: #x~X" x))
200 (when (<= #xDC00 x #xDFFF)
201 (xerror "unexpected high surrogate: #x~X" x))
202 (when (<= #xD800 x #xDBFF)
203 ;; seen low surrogate, look for high surrogate now
204 (when (>= (%+ rptr 3) in-end)
205 (return))
206 (let* ((lo2 (aref in (%+ 2 rptr)))
207 (hi2 (aref in (%+ 3 rptr)))
208 (y (logior (ash hi2 8) lo2)))
209 (unless (<= #xDC00 x #xDFFF)
210 (xerror "expected a high surrogate but found: #x~X" x))
211 #-rune-is-utf-16
212 (progn
213 (setf x (logior (ash (%- x #xd7c0) 10) (%and y #x3FF)))
214 (setf rptr (%+ 2 rptr))))
215 ;; end of surrogate handling
217 (setf (aref out wptr) x)
218 (setf rptr (%+ 2 rptr))
219 (setf wptr (%+ 1 wptr))))
220 (values wptr rptr)))
222 (defmethod decode-sequence ((encoding (eql :utf-8))
223 in in-start in-end out out-start out-end eof?)
224 (declare (optimize (speed 3) (safety 0))
225 (type (simple-array (unsigned-byte 8) (*)) in)
226 (type (simple-array #.+buffer-byte+ (*))
227 out)
228 (type fixnum in-start in-end out-start out-end))
229 (let ((wptr out-start)
230 (rptr in-start)
231 byte0)
232 (macrolet ((put (x)
233 `((lambda (x)
234 (when (or (<= #xD800 x #xDBFF)
235 (<= #xDC00 x #xDFFF))
236 (xerror "surrogate encoded in UTF-8: #x~X." x))
237 (cond ((or (%> x #x10FFFF)
238 (eql x #xFFFE)
239 (eql x #xFFFF))
240 (xerror "not a valid code point: #x~X" x))
241 #+rune-is-utf-16
242 ((%> x #xFFFF)
243 (setf (aref out (%+ 0 wptr)) (%+ #xD7C0 (ash x -10))
244 (aref out (%+ 1 wptr)) (%ior #xDC00 (%and x #x3FF)))
245 (setf wptr (%+ wptr 2)))
247 (setf (aref out wptr) x)
248 (setf wptr (%+ wptr 1)))))
249 ,x))
250 (put1 (x)
251 `(progn
252 (setf (aref out wptr) ,x)
253 (setf wptr (%+ wptr 1)))))
254 (loop
255 (when (%= (+ wptr 1) out-end) (return))
256 (when (%>= rptr in-end) (return))
257 (setq byte0 (aref in rptr))
258 (cond ((= byte0 #x0D)
259 ;; CR handling
260 ;; we need to know the following character
261 (cond ((>= (%+ rptr 1) in-end)
262 ;; no characters in buffer
263 (cond (eof?
264 ;; at EOF, pass it as NL
265 (put #x0A)
266 (setf rptr (%+ rptr 1)))
268 ;; demand more characters
269 (return))))
270 ((= (aref in (%+ rptr 1)) #x0A)
271 ;; we see CR NL, so forget this CR and the next NL will be
272 ;; inserted literally
273 (setf rptr (%+ rptr 1)))
275 ;; singleton CR, pass it as NL
276 (put #x0A)
277 (setf rptr (%+ rptr 1)))))
279 ((%<= #|#b00000000|# byte0 #b01111111)
280 (put1 byte0)
281 (setf rptr (%+ rptr 1)))
283 ((%<= #|#b10000000|# byte0 #b10111111)
284 (xerror "Corrupted UTF-8 input (initial byte was #b~8,'0B)" byte0)
285 (setf rptr (%+ rptr 1)))
287 ((%<= #|#b11000000|# byte0 #b11011111)
288 (cond ((<= (%+ rptr 2) in-end)
289 (put
290 (dpb (ldb (byte 5 0) byte0) (byte 5 6)
291 (dpb (ldb (byte 6 0) (aref in (%+ rptr 1))) (byte 6 0)
292 0)))
293 (setf rptr (%+ rptr 2)))
295 (return))))
297 ((%<= #|#b11100000|# byte0 #b11101111)
298 (cond ((<= (%+ rptr 3) in-end)
299 (put
300 (dpb (ldb (byte 4 0) byte0) (byte 4 12)
301 (dpb (ldb (byte 6 0) (aref in (%+ 1 rptr))) (byte 6 6)
302 (dpb (ldb (byte 6 0) (aref in (%+ 2 rptr))) (byte 6 0)
303 0))))
304 (setf rptr (%+ rptr 3)))
306 (return))))
308 ((%<= #|#b11110000|# byte0 #b11110111)
309 (cond ((<= (%+ rptr 4) in-end)
310 (put
311 (dpb (ldb (byte 3 0) byte0) (byte 3 18)
312 (dpb (ldb (byte 6 0) (aref in (%+ 1 rptr))) (byte 6 12)
313 (dpb (ldb (byte 6 0) (aref in (%+ 2 rptr))) (byte 6 6)
314 (dpb (ldb (byte 6 0) (aref in (%+ 3 rptr))) (byte 6 0)
315 0)))))
316 (setf rptr (%+ rptr 4)))
318 (return))))
320 ((%<= #|#b11111000|# byte0 #b11111011)
321 (cond ((<= (%+ rptr 5) in-end)
322 (put
323 (dpb (ldb (byte 2 0) byte0) (byte 2 24)
324 (dpb (ldb (byte 6 0) (aref in (%+ 1 rptr))) (byte 6 18)
325 (dpb (ldb (byte 6 0) (aref in (%+ 2 rptr))) (byte 6 12)
326 (dpb (ldb (byte 6 0) (aref in (%+ 3 rptr))) (byte 6 6)
327 (dpb (ldb (byte 6 0) (aref in (%+ 4 rptr))) (byte 6 0)
328 0))))))
329 (setf rptr (%+ rptr 5)))
331 (return))))
333 ((%<= #|#b11111100|# byte0 #b11111101)
334 (cond ((<= (%+ rptr 6) in-end)
335 (put
336 (dpb (ldb (byte 1 0) byte0) (byte 1 30)
337 (dpb (ldb (byte 6 0) (aref in (%+ 1 rptr))) (byte 6 24)
338 (dpb (ldb (byte 6 0) (aref in (%+ 2 rptr))) (byte 6 18)
339 (dpb (ldb (byte 6 0) (aref in (%+ 3 rptr))) (byte 6 12)
340 (dpb (ldb (byte 6 0) (aref in (%+ 4 rptr))) (byte 6 6)
341 (dpb (ldb (byte 6 0) (aref in (%+ 5 rptr))) (byte 6 0)
342 0)))))))
343 (setf rptr (%+ rptr 6)))
345 (return))))
348 (xerror "Corrupted UTF-8 input (initial byte was #b~8,'0B)" byte0)) ) ))
349 (values wptr rptr)) )
351 (defmethod encoding-p ((object (eql :utf-16-little-endian))) t)
352 (defmethod encoding-p ((object (eql :utf-16-big-endian))) t)
353 (defmethod encoding-p ((object (eql :utf-8))) t)
355 (defmethod encoding-p ((object encoding)) t)
357 (defmethod decode-sequence ((encoding simple-8-bit-encoding)
358 in in-start in-end
359 out out-start out-end
360 eof?)
361 (declare (optimize (speed 3) (safety 0))
362 (type (simple-array (unsigned-byte 8) (*)) in)
363 (type (simple-array #.+buffer-byte+ (*)) out)
364 (type fixnum in-start in-end out-start out-end))
365 (let ((wptr out-start)
366 (rptr in-start)
367 (byte 0)
368 (table (slot-value encoding 'table)))
369 (declare (type fixnum wptr rptr)
370 (type (unsigned-byte 8) byte)
371 (type (simple-array #.+buffer-byte+ (*)) table))
372 (loop
373 (when (%= wptr out-end) (return))
374 (when (%>= rptr in-end) (return))
375 (setq byte (aref in rptr))
376 (cond ((= byte #x0D)
377 ;; CR handling
378 ;; we need to know the following character
379 (cond ((>= (%+ rptr 1) in-end)
380 ;; no characters in buffer
381 (cond (eof?
382 ;; at EOF, pass it as NL
383 (setf (aref out wptr) #x0A)
384 (setf wptr (%+ wptr 1))
385 (setf rptr (%+ rptr 1)))
387 ;; demand more characters
388 (return))))
389 ((= (aref in (%+ rptr 1)) #x0A)
390 ;; we see CR NL, so forget this CR and the next NL will be
391 ;; inserted literally
392 (setf rptr (%+ rptr 1)))
394 ;; singleton CR, pass it as NL
395 (setf (aref out wptr) #x0A)
396 (setf wptr (%+ wptr 1))
397 (setf rptr (%+ rptr 1)))))
400 (setf (aref out wptr) (aref table byte))
401 (setf wptr (%+ wptr 1))
402 (setf rptr (%+ rptr 1))) ))
403 (values wptr rptr)))
405 ;;;; ---------------------------------------------------------------------------
406 ;;;; Character sets
407 ;;;;
409 (defvar *charsets* (make-hash-table :test #'eq))
411 (defclass 8-bit-charset ()
412 ((name :initarg :name)
413 (to-unicode-table
414 :initarg :to-unicode-table
415 :reader to-unicode-table)))
417 (defmacro define-8-bit-charset (name &rest codes)
418 (assert (= 256 (length codes)))
419 `(progn
420 (setf (gethash ',name *charsets*)
421 (make-instance '8-bit-charset
422 :name ',name
423 :to-unicode-table
424 ',(make-array 256
425 :element-type '#.+buffer-byte+
426 :initial-contents codes)))
427 ',name))
429 (defun find-charset (name)
430 (or (gethash name *charsets*)
431 (xerror "There is no character set named ~S." name)))