1 (in-package :runes-encoding
)
3 (define-condition encoding-error
(simple-error) ())
5 (defun xerror (fmt &rest args
)
6 (error 'encoding-error
:format-control fmt
:format-arguments args
))
8 ;;;; ---------------------------------------------------------------------------
12 (defvar *names
* (make-hash-table :test
#'eq
))
14 (defun canon-name (string)
15 (with-output-to-string (bag)
17 (cond ((char= ch
#\_
) (write-char #\- bag
))
18 (t (write-char (char-upcase ch
) bag
))))
21 (defun canon-name-2 (string)
22 (with-output-to-string (bag)
24 (cond ((char= ch
#\_
))
26 (t (write-char (char-upcase ch
) bag
))))
29 (defmethod encoding-names ((encoding symbol
))
30 (gethash encoding
*names
*))
32 (defmethod (setf encoding-names
) (new-value (encoding symbol
))
33 (setf (gethash encoding
*names
*) new-value
))
35 (defun add-name (encoding name
)
36 (pushnew (canon-name name
) (encoding-names encoding
) :test
#'string
=))
38 (defun resolve-name (string)
39 (cond ((symbolp string
)
42 (setq string
(canon-name string
))
45 (maphash (lambda (x y
)
46 (when (member string y
:test
#'string
=)
51 (maphash (lambda (x y
)
52 (when (member string y
54 (string= (canon-name-2 x
)
60 ;;;; ---------------------------------------------------------------------------
64 (defvar *encodings
* (make-hash-table :test
#'eq
))
66 (defmacro define-encoding
(name init-form
)
68 (setf (gethash ',name
*encodings
*)
69 (list nil
(lambda () ,init-form
)))
72 (defun find-encoding (name)
73 (let ((x (gethash (resolve-name name
) *encodings
*)))
76 (setf (first x
) (funcall (second x
)))))))
78 (defclass encoding
() ())
80 (defclass simple-8-bit-encoding
(encoding)
81 ((table :initarg
:table
)))
83 (defun make-simple-8-bit-encoding (&key charset
)
84 (make-instance 'simple-8-bit-encoding
85 :table
(coerce (to-unicode-table charset
) '(simple-array (unsigned-byte 16) (256)))))
89 (defmacro fx-op
(op &rest xs
)
90 `(the fixnum
(,op
,@(mapcar (lambda (x) `(the fixnum
,x
)) xs
))))
91 (defmacro fx-pred
(op &rest xs
)
92 `(,op
,@(mapcar (lambda (x) `(the fixnum
,x
)) xs
)))
94 (defmacro %
+ (&rest xs
) `(fx-op + ,@xs
))
95 (defmacro %-
(&rest xs
) `(fx-op -
,@xs
))
96 (defmacro %
* (&rest xs
) `(fx-op * ,@xs
))
97 (defmacro %
/ (&rest xs
) `(fx-op floor
,@xs
))
98 (defmacro %and
(&rest xs
) `(fx-op logand
,@xs
))
99 (defmacro %ior
(&rest xs
) `(fx-op logior
,@xs
))
100 (defmacro %xor
(&rest xs
) `(fx-op logxor
,@xs
))
101 (defmacro %ash
(&rest xs
) `(fx-op ash
,@xs
))
102 (defmacro %mod
(&rest xs
) `(fx-op mod
,@xs
))
104 (defmacro %
= (&rest xs
) `(fx-pred = ,@xs
))
105 (defmacro %
<= (&rest xs
) `(fx-pred <= ,@xs
))
106 (defmacro %
>= (&rest xs
) `(fx-pred >= ,@xs
))
107 (defmacro %
< (&rest xs
) `(fx-pred < ,@xs
))
108 (defmacro %
> (&rest xs
) `(fx-pred > ,@xs
))
110 (defmethod decode-sequence ((encoding (eql :utf-16-big-endian
))
111 in in-start in-end out out-start out-end eof?
)
112 ;; -> new wptr, new rptr
113 (let ((wptr out-start
)
116 (when (%
= wptr out-end
)
118 (when (>= (%
+ rptr
1) in-end
)
120 (let ((hi (aref in rptr
))
121 (lo (aref in
(%
+ 1 rptr
))))
122 (setf rptr
(%
+ 2 rptr
))
123 ;; FIXME: Wenn wir hier ein Surrogate sehen, muessen wir das naechste
124 ;; Zeichen abwarten und nachgucken, dass nicht etwa die andere
126 (setf (aref out wptr
) (logior (ash hi
8) lo
))
127 (setf wptr
(%
+ 1 wptr
))))
130 (defmethod decode-sequence ((encoding (eql :utf-16-little-endian
))
131 in in-start in-end out out-start out-end eof?
)
132 ;; -> new wptr, new rptr
133 (let ((wptr out-start
)
136 (when (%
= wptr out-end
)
138 (when (>= (%
+ rptr
1) in-end
)
140 (let ((lo (aref in
(%
+ 0 rptr
)))
141 (hi (aref in
(%
+ 1 rptr
))))
142 (setf rptr
(%
+ 2 rptr
))
143 ;; FIXME: Wenn wir hier ein Surrogate sehen, muessen wir das naechste
144 ;; Zeichen abwarten und nachgucken, dass nicht etwa die andere
146 (setf (aref out wptr
) (logior (ash hi
8) lo
))
147 (setf wptr
(%
+ 1 wptr
))))
150 (defmethod decode-sequence ((encoding (eql :utf-8
))
151 in in-start in-end out out-start out-end eof?
)
152 (declare (optimize (speed 3) (safety 0))
153 (type (simple-array (unsigned-byte 8) (*)) in
)
154 (type (simple-array (unsigned-byte 16) (*)) out
)
155 (type fixnum in-start in-end out-start out-end
))
156 (let ((wptr out-start
)
161 (when (or (<= #xD800 x
#xDBFF
)
162 (<= #xDC00 x
#xDFFF
))
163 (xerror "surrogate encoded in UTF-8: #x~x." x
))
165 (setf (aref out
(%
+ 0 wptr
)) (%
+ #xD7C0
(ash x -
10))
166 (aref out
(%
+ 1 wptr
)) (%ior
#xDC00
(%and x
#x3FF
)))
167 (setf wptr
(%
+ wptr
2)))
169 (setf (aref out wptr
) x
)
170 (setf wptr
(%
+ wptr
1)))))
174 (setf (aref out wptr
) ,x
)
175 (setf wptr
(%
+ wptr
1)))))
177 (when (%
= (+ wptr
1) out-end
) (return))
178 (when (%
>= rptr in-end
) (return))
179 (setq byte0
(aref in rptr
))
180 (cond ((= byte0
#x0D
)
182 ;; we need to know the following character
183 (cond ((>= (%
+ rptr
1) in-end
)
184 ;; no characters in buffer
186 ;; at EOF, pass it as NL
188 (setf rptr
(%
+ rptr
1)))
190 ;; demand more characters
192 ((= (aref in
(%
+ rptr
1)) #x0A
)
193 ;; we see CR NL, so forget this CR and the next NL will be
194 ;; inserted literally
195 (setf rptr
(%
+ rptr
1)))
197 ;; singleton CR, pass it as NL
199 (setf rptr
(%
+ rptr
1)))))
201 ((%
<= #|
#b00000000|
# byte0
#b01111111
)
203 (setf rptr
(%
+ rptr
1)))
205 ((%
<= #|
#b10000000|
# byte0
#b10111111
)
206 (xerror "Corrupted UTF-8 input (initial byte was #b~8,'0B)" byte0
)
207 (setf rptr
(%
+ rptr
1)))
209 ((%
<= #|
#b11000000|
# byte0
#b11011111
)
210 (cond ((< (%
+ rptr
2) in-end
)
212 (dpb (ldb (byte 5 0) byte0
) (byte 5 6)
213 (dpb (ldb (byte 6 0) (aref in
(%
+ rptr
1))) (byte 6 0)
215 (setf rptr
(%
+ rptr
2)))
219 ((%
<= #|
#b11100000|
# byte0
#b11101111
)
220 (cond ((< (%
+ rptr
3) in-end
)
222 (dpb (ldb (byte 4 0) byte0
) (byte 4 12)
223 (dpb (ldb (byte 6 0) (aref in
(%
+ 1 rptr
))) (byte 6 6)
224 (dpb (ldb (byte 6 0) (aref in
(%
+ 2 rptr
))) (byte 6 0)
226 (setf rptr
(%
+ rptr
3)))
230 ((%
<= #|
#b11110000|
# byte0
#b11110111
)
231 (cond ((< (%
+ rptr
4) in-end
)
233 (dpb (ldb (byte 3 0) byte0
) (byte 3 18)
234 (dpb (ldb (byte 6 0) (aref in
(%
+ 1 rptr
))) (byte 6 12)
235 (dpb (ldb (byte 6 0) (aref in
(%
+ 2 rptr
))) (byte 6 6)
236 (dpb (ldb (byte 6 0) (aref in
(%
+ 3 rptr
))) (byte 6 0)
238 (setf rptr
(%
+ rptr
4)))
242 ((%
<= #|
#b11111000|
# byte0
#b11111011
)
243 (cond ((< (%
+ rptr
5) in-end
)
245 (dpb (ldb (byte 2 0) byte0
) (byte 2 24)
246 (dpb (ldb (byte 6 0) (aref in
(%
+ 1 rptr
))) (byte 6 18)
247 (dpb (ldb (byte 6 0) (aref in
(%
+ 2 rptr
))) (byte 6 12)
248 (dpb (ldb (byte 6 0) (aref in
(%
+ 3 rptr
))) (byte 6 6)
249 (dpb (ldb (byte 6 0) (aref in
(%
+ 4 rptr
))) (byte 6 0)
251 (setf rptr
(%
+ rptr
5)))
255 ((%
<= #|
#b11111100|
# byte0
#b11111101
)
256 (cond ((< (%
+ rptr
6) in-end
)
258 (dpb (ldb (byte 1 0) byte0
) (byte 1 30)
259 (dpb (ldb (byte 6 0) (aref in
(%
+ 1 rptr
))) (byte 6 24)
260 (dpb (ldb (byte 6 0) (aref in
(%
+ 2 rptr
))) (byte 6 18)
261 (dpb (ldb (byte 6 0) (aref in
(%
+ 3 rptr
))) (byte 6 12)
262 (dpb (ldb (byte 6 0) (aref in
(%
+ 4 rptr
))) (byte 6 6)
263 (dpb (ldb (byte 6 0) (aref in
(%
+ 5 rptr
))) (byte 6 0)
265 (setf rptr
(%
+ rptr
6)))
270 (xerror "Corrupted UTF-8 input (initial byte was #b~8,'0B)" byte0
)) ) ))
271 (values wptr rptr
)) )
273 (defmethod encoding-p ((object (eql :utf-16-little-endian
))) t
)
274 (defmethod encoding-p ((object (eql :utf-16-big-endian
))) t
)
275 (defmethod encoding-p ((object (eql :utf-8
))) t
)
277 (defmethod encoding-p ((object encoding
)) t
)
279 (defmethod decode-sequence ((encoding simple-8-bit-encoding
)
281 out out-start out-end
283 (declare (optimize (speed 3) (safety 0))
284 (type (simple-array (unsigned-byte 8) (*)) in
)
285 (type (simple-array (unsigned-byte 16) (*)) out
)
286 (type fixnum in-start in-end out-start out-end
))
287 (let ((wptr out-start
)
290 (table (slot-value encoding
'table
)))
291 (declare (type fixnum wptr rptr
)
292 (type (unsigned-byte 8) byte
)
293 (type (simple-array (unsigned-byte 16) (*)) table
))
295 (when (%
= wptr out-end
) (return))
296 (when (%
>= rptr in-end
) (return))
297 (setq byte
(aref in rptr
))
300 ;; we need to know the following character
301 (cond ((>= (%
+ rptr
1) in-end
)
302 ;; no characters in buffer
304 ;; at EOF, pass it as NL
305 (setf (aref out wptr
) #x0A
)
306 (setf wptr
(%
+ wptr
1))
307 (setf rptr
(%
+ rptr
1)))
309 ;; demand more characters
311 ((= (aref in
(%
+ rptr
1)) #x0A
)
312 ;; we see CR NL, so forget this CR and the next NL will be
313 ;; inserted literally
314 (setf rptr
(%
+ rptr
1)))
316 ;; singleton CR, pass it as NL
317 (setf (aref out wptr
) #x0A
)
318 (setf wptr
(%
+ wptr
1))
319 (setf rptr
(%
+ rptr
1)))))
322 (setf (aref out wptr
) (aref table byte
))
323 (setf wptr
(%
+ wptr
1))
324 (setf rptr
(%
+ rptr
1))) ))
327 ;;;; ---------------------------------------------------------------------------
331 (defvar *charsets
* (make-hash-table :test
#'eq
))
333 (defclass 8-bit-charset
()
334 ((name :initarg
:name
)
336 :initarg
:to-unicode-table
337 :reader to-unicode-table
)))
339 (defmacro define-8-bit-charset
(name &rest codes
)
340 (assert (= 256 (length codes
)))
342 (setf (gethash ',name
*charsets
*)
343 (make-instance '8-bit-charset
347 :element-type
'(unsigned-byte 16)
348 :initial-contents codes
)))
351 (defun find-charset (name)
352 (or (gethash name
*charsets
*)
353 (xerror "There is no character set named ~S." name
)))