1 (in-package :runes-encoding
)
3 (eval-when (:compile-toplevel
:load-toplevel
:execute
)
4 (defparameter +buffer-byte
+
5 #+rune-is-utf-16
'(unsigned-byte 16)
6 #-rune-is-utf-16
'(unsigned-byte 32)))
8 (define-condition encoding-error
(simple-error) ())
10 (defun xerror (fmt &rest args
)
11 (error 'encoding-error
:format-control fmt
:format-arguments args
))
13 ;;;; ---------------------------------------------------------------------------
17 (defvar *names
* (make-hash-table :test
#'eq
))
19 (defun canon-name (string)
20 (with-output-to-string (bag)
22 (cond ((char= ch
#\_
) (write-char #\- bag
))
23 (t (write-char (char-upcase ch
) bag
))))
26 (defun canon-name-2 (string)
27 (with-output-to-string (bag)
29 (cond ((char= ch
#\_
))
31 (t (write-char (char-upcase ch
) bag
))))
34 (defmethod encoding-names ((encoding symbol
))
35 (gethash encoding
*names
*))
37 (defmethod (setf encoding-names
) (new-value (encoding symbol
))
38 (setf (gethash encoding
*names
*) new-value
))
40 (defun add-name (encoding name
)
41 (pushnew (canon-name name
) (encoding-names encoding
) :test
#'string
=))
43 (defun resolve-name (string)
44 (cond ((symbolp string
)
47 (setq string
(canon-name string
))
50 (maphash (lambda (x y
)
51 (when (member string y
:test
#'string
=)
56 (maphash (lambda (x y
)
57 (when (member string y
59 (string= (canon-name-2 x
)
65 ;;;; ---------------------------------------------------------------------------
69 (defvar *encodings
* (make-hash-table :test
#'eq
))
71 (defmacro define-encoding
(name init-form
)
73 (setf (gethash ',name
*encodings
*)
74 (list nil
(lambda () ,init-form
)))
77 (defun find-encoding (name)
78 (let ((x (gethash (resolve-name name
) *encodings
*)))
81 (setf (first x
) (funcall (second x
)))))))
83 (defclass encoding
() ())
85 (defclass simple-8-bit-encoding
(encoding)
86 ((table :initarg
:table
)))
88 (defun make-simple-8-bit-encoding (&key charset
)
89 (make-instance 'simple-8-bit-encoding
90 :table
(coerce (to-unicode-table charset
) '(simple-array #.
+buffer-byte
+ (256)))))
94 (defmacro fx-op
(op &rest xs
)
95 `(the fixnum
(,op
,@(mapcar (lambda (x) `(the fixnum
,x
)) xs
))))
96 (defmacro fx-pred
(op &rest xs
)
97 `(,op
,@(mapcar (lambda (x) `(the fixnum
,x
)) xs
)))
99 (defmacro %
+ (&rest xs
) `(fx-op + ,@xs
))
100 (defmacro %-
(&rest xs
) `(fx-op -
,@xs
))
101 (defmacro %
* (&rest xs
) `(fx-op * ,@xs
))
102 (defmacro %
/ (&rest xs
) `(fx-op floor
,@xs
))
103 (defmacro %and
(&rest xs
) `(fx-op logand
,@xs
))
104 (defmacro %ior
(&rest xs
) `(fx-op logior
,@xs
))
105 (defmacro %xor
(&rest xs
) `(fx-op logxor
,@xs
))
106 (defmacro %ash
(&rest xs
) `(fx-op ash
,@xs
))
107 (defmacro %mod
(&rest xs
) `(fx-op mod
,@xs
))
109 (defmacro %
= (&rest xs
) `(fx-pred = ,@xs
))
110 (defmacro %
<= (&rest xs
) `(fx-pred <= ,@xs
))
111 (defmacro %
>= (&rest xs
) `(fx-pred >= ,@xs
))
112 (defmacro %
< (&rest xs
) `(fx-pred < ,@xs
))
113 (defmacro %
> (&rest xs
) `(fx-pred > ,@xs
))
117 ;; The decoders share a common signature:
119 ;; DECODE input input-start input-end
120 ;; output output-start output-end
122 ;; -> first-not-written ; first-not-read
124 ;; These decode functions should decode as much characters off `input'
125 ;; into the `output' as possible and return the indexes to the first
126 ;; not read and first not written element of `input' and `output'
127 ;; respectively. If there are not enough bytes in `input' to decode a
128 ;; full character, decoding shold be abandomed; the caller has to
129 ;; ensure that the remaining bytes of `input' are passed to the
130 ;; decoder again with more bytes appended.
132 ;; `eof-p' now in turn indicates, if the given input sequence, is all
133 ;; the producer does have and might be used to produce error messages
134 ;; in case of incomplete codes or decided what to do.
136 ;; Decoders are expected to handle the various CR/NL conventions and
137 ;; canonicalize each end of line into a single NL rune (#xA) in good
138 ;; old Lisp tradition.
141 ;; TODO: change this to an encoding class, which then might carry
142 ;; additional state. Stateless encodings could been represented by
145 ;; defmethod DECODE-SEQUENCE ((encoding (eql :utf-8)) ...)
148 (defmethod decode-sequence ((encoding (eql :utf-16-big-endian
))
149 in in-start in-end out out-start out-end eof?
)
150 ;; -> new wptr, new rptr
151 (let ((wptr out-start
)
154 (when (%
= wptr out-end
)
156 (when (>= (%
+ rptr
1) in-end
)
158 (let* ((hi (aref in rptr
))
159 (lo (aref in
(%
+ 1 rptr
)))
160 (x (logior (ash hi
8) lo
)))
161 (when (or (eql x
#xFFFE
) (eql x
#xFFFF
))
162 (xerror "not a valid code point: #x~X" x
))
163 (when (<= #xDC00 x
#xDFFF
)
164 (xerror "unexpected high surrogate: #x~X" x
))
165 (when (<= #xD800 x
#xDBFF
)
166 ;; seen low surrogate, look for high surrogate now
167 (when (>= (%
+ rptr
3) in-end
)
169 (let* ((hi2 (aref in
(%
+ 2 rptr
)))
170 (lo2 (aref in
(%
+ 3 rptr
)))
171 (y (logior (ash hi2
8) lo2
)))
172 (unless (<= #xDC00 x
#xDFFF
)
173 (xerror "expected a high surrogate but found: #x~X" x
))
176 (setf x
(logior (ash (%- x
#xd7c0
) 10) (%and y
#x3FF
)))
177 (setf rptr
(%
+ 2 rptr
))))
178 ;; end of surrogate handling
180 (setf (aref out wptr
) x
)
181 (setf rptr
(%
+ 2 rptr
))
182 (setf wptr
(%
+ 1 wptr
))))
185 (defmethod decode-sequence ((encoding (eql :utf-16-little-endian
))
186 in in-start in-end out out-start out-end eof?
)
187 ;; -> new wptr, new rptr
188 (let ((wptr out-start
)
191 (when (%
= wptr out-end
)
193 (when (>= (%
+ rptr
1) in-end
)
195 (let* ((lo (aref in rptr
))
196 (hi (aref in
(%
+ 1 rptr
)))
197 (x (logior (ash hi
8) lo
)))
198 (when (or (eql x
#xFFFE
) (eql x
#xFFFF
))
199 (xerror "not a valid code point: #x~X" x
))
200 (when (<= #xDC00 x
#xDFFF
)
201 (xerror "unexpected high surrogate: #x~X" x
))
202 (when (<= #xD800 x
#xDBFF
)
203 ;; seen low surrogate, look for high surrogate now
204 (when (>= (%
+ rptr
3) in-end
)
206 (let* ((lo2 (aref in
(%
+ 2 rptr
)))
207 (hi2 (aref in
(%
+ 3 rptr
)))
208 (y (logior (ash hi2
8) lo2
)))
209 (unless (<= #xDC00 x
#xDFFF
)
210 (xerror "expected a high surrogate but found: #x~X" x
))
213 (setf x
(logior (ash (%- x
#xd7c0
) 10) (%and y
#x3FF
)))
214 (setf rptr
(%
+ 2 rptr
))))
215 ;; end of surrogate handling
217 (setf (aref out wptr
) x
)
218 (setf rptr
(%
+ 2 rptr
))
219 (setf wptr
(%
+ 1 wptr
))))
222 (defmethod decode-sequence ((encoding (eql :utf-8
))
223 in in-start in-end out out-start out-end eof?
)
224 (declare (optimize (speed 3) (safety 0))
225 (type (simple-array (unsigned-byte 8) (*)) in
)
226 (type (simple-array #.
+buffer-byte
+ (*))
228 (type fixnum in-start in-end out-start out-end
))
229 (let ((wptr out-start
)
234 (when (or (<= #xD800 x
#xDBFF
)
235 (<= #xDC00 x
#xDFFF
))
236 (xerror "surrogate encoded in UTF-8: #x~X." x
))
237 (cond ((or (%
> x
#x10FFFF
)
240 (xerror "not a valid code point: #x~X" x
))
243 (setf (aref out
(%
+ 0 wptr
)) (%
+ #xD7C0
(ash x -
10))
244 (aref out
(%
+ 1 wptr
)) (%ior
#xDC00
(%and x
#x3FF
)))
245 (setf wptr
(%
+ wptr
2)))
247 (setf (aref out wptr
) x
)
248 (setf wptr
(%
+ wptr
1)))))
252 (setf (aref out wptr
) ,x
)
253 (setf wptr
(%
+ wptr
1)))))
255 (when (%
= (+ wptr
1) out-end
) (return))
256 (when (%
>= rptr in-end
) (return))
257 (setq byte0
(aref in rptr
))
258 (cond ((= byte0
#x0D
)
260 ;; we need to know the following character
261 (cond ((>= (%
+ rptr
1) in-end
)
262 ;; no characters in buffer
264 ;; at EOF, pass it as NL
266 (setf rptr
(%
+ rptr
1)))
268 ;; demand more characters
270 ((= (aref in
(%
+ rptr
1)) #x0A
)
271 ;; we see CR NL, so forget this CR and the next NL will be
272 ;; inserted literally
273 (setf rptr
(%
+ rptr
1)))
275 ;; singleton CR, pass it as NL
277 (setf rptr
(%
+ rptr
1)))))
279 ((%
<= #|
#b00000000|
# byte0
#b01111111
)
281 (setf rptr
(%
+ rptr
1)))
283 ((%
<= #|
#b10000000|
# byte0
#b10111111
)
284 (xerror "Corrupted UTF-8 input (initial byte was #b~8,'0B)" byte0
)
285 (setf rptr
(%
+ rptr
1)))
287 ((%
<= #|
#b11000000|
# byte0
#b11011111
)
288 (cond ((<= (%
+ rptr
2) in-end
)
290 (dpb (ldb (byte 5 0) byte0
) (byte 5 6)
291 (dpb (ldb (byte 6 0) (aref in
(%
+ rptr
1))) (byte 6 0)
293 (setf rptr
(%
+ rptr
2)))
297 ((%
<= #|
#b11100000|
# byte0
#b11101111
)
298 (cond ((<= (%
+ rptr
3) in-end
)
300 (dpb (ldb (byte 4 0) byte0
) (byte 4 12)
301 (dpb (ldb (byte 6 0) (aref in
(%
+ 1 rptr
))) (byte 6 6)
302 (dpb (ldb (byte 6 0) (aref in
(%
+ 2 rptr
))) (byte 6 0)
304 (setf rptr
(%
+ rptr
3)))
308 ((%
<= #|
#b11110000|
# byte0
#b11110111
)
309 (cond ((<= (%
+ rptr
4) in-end
)
311 (dpb (ldb (byte 3 0) byte0
) (byte 3 18)
312 (dpb (ldb (byte 6 0) (aref in
(%
+ 1 rptr
))) (byte 6 12)
313 (dpb (ldb (byte 6 0) (aref in
(%
+ 2 rptr
))) (byte 6 6)
314 (dpb (ldb (byte 6 0) (aref in
(%
+ 3 rptr
))) (byte 6 0)
316 (setf rptr
(%
+ rptr
4)))
320 ((%
<= #|
#b11111000|
# byte0
#b11111011
)
321 (cond ((<= (%
+ rptr
5) in-end
)
323 (dpb (ldb (byte 2 0) byte0
) (byte 2 24)
324 (dpb (ldb (byte 6 0) (aref in
(%
+ 1 rptr
))) (byte 6 18)
325 (dpb (ldb (byte 6 0) (aref in
(%
+ 2 rptr
))) (byte 6 12)
326 (dpb (ldb (byte 6 0) (aref in
(%
+ 3 rptr
))) (byte 6 6)
327 (dpb (ldb (byte 6 0) (aref in
(%
+ 4 rptr
))) (byte 6 0)
329 (setf rptr
(%
+ rptr
5)))
333 ((%
<= #|
#b11111100|
# byte0
#b11111101
)
334 (cond ((<= (%
+ rptr
6) in-end
)
336 (dpb (ldb (byte 1 0) byte0
) (byte 1 30)
337 (dpb (ldb (byte 6 0) (aref in
(%
+ 1 rptr
))) (byte 6 24)
338 (dpb (ldb (byte 6 0) (aref in
(%
+ 2 rptr
))) (byte 6 18)
339 (dpb (ldb (byte 6 0) (aref in
(%
+ 3 rptr
))) (byte 6 12)
340 (dpb (ldb (byte 6 0) (aref in
(%
+ 4 rptr
))) (byte 6 6)
341 (dpb (ldb (byte 6 0) (aref in
(%
+ 5 rptr
))) (byte 6 0)
343 (setf rptr
(%
+ rptr
6)))
348 (xerror "Corrupted UTF-8 input (initial byte was #b~8,'0B)" byte0
)) ) ))
349 (values wptr rptr
)) )
351 (defmethod encoding-p ((object (eql :utf-16-little-endian
))) t
)
352 (defmethod encoding-p ((object (eql :utf-16-big-endian
))) t
)
353 (defmethod encoding-p ((object (eql :utf-8
))) t
)
355 (defmethod encoding-p ((object encoding
)) t
)
357 (defmethod decode-sequence ((encoding simple-8-bit-encoding
)
359 out out-start out-end
361 (declare (optimize (speed 3) (safety 0))
362 (type (simple-array (unsigned-byte 8) (*)) in
)
363 (type (simple-array #.
+buffer-byte
+ (*)) out
)
364 (type fixnum in-start in-end out-start out-end
))
365 (let ((wptr out-start
)
368 (table (slot-value encoding
'table
)))
369 (declare (type fixnum wptr rptr
)
370 (type (unsigned-byte 8) byte
)
371 (type (simple-array #.
+buffer-byte
+ (*)) table
))
373 (when (%
= wptr out-end
) (return))
374 (when (%
>= rptr in-end
) (return))
375 (setq byte
(aref in rptr
))
378 ;; we need to know the following character
379 (cond ((>= (%
+ rptr
1) in-end
)
380 ;; no characters in buffer
382 ;; at EOF, pass it as NL
383 (setf (aref out wptr
) #x0A
)
384 (setf wptr
(%
+ wptr
1))
385 (setf rptr
(%
+ rptr
1)))
387 ;; demand more characters
389 ((= (aref in
(%
+ rptr
1)) #x0A
)
390 ;; we see CR NL, so forget this CR and the next NL will be
391 ;; inserted literally
392 (setf rptr
(%
+ rptr
1)))
394 ;; singleton CR, pass it as NL
395 (setf (aref out wptr
) #x0A
)
396 (setf wptr
(%
+ wptr
1))
397 (setf rptr
(%
+ rptr
1)))))
400 (setf (aref out wptr
) (aref table byte
))
401 (setf wptr
(%
+ wptr
1))
402 (setf rptr
(%
+ rptr
1))) ))
405 ;;;; ---------------------------------------------------------------------------
409 (defvar *charsets
* (make-hash-table :test
#'eq
))
411 (defclass 8-bit-charset
()
412 ((name :initarg
:name
)
414 :initarg
:to-unicode-table
415 :reader to-unicode-table
)))
417 (defmacro define-8-bit-charset
(name &rest codes
)
418 (assert (= 256 (length codes
)))
420 (setf (gethash ',name
*charsets
*)
421 (make-instance '8-bit-charset
425 :element-type
'#.
+buffer-byte
+
426 :initial-contents codes
)))
429 (defun find-charset (name)
430 (or (gethash name
*charsets
*)
431 (xerror "There is no character set named ~S." name
)))