encoding-fehler resignalisieren
[closure-common.git] / encodings.lisp
blob04ddd9333b66c447be20529bc8ac6ff33d555582
1 (in-package :runes-encoding)
3 (define-condition encoding-error (simple-error) ())
5 (defun xerror (fmt &rest args)
6 (error 'encoding-error :format-control fmt :format-arguments args))
8 ;;;; ---------------------------------------------------------------------------
9 ;;;; Encoding names
10 ;;;;
12 (defvar *names* (make-hash-table :test #'eq))
14 (defun canon-name (string)
15 (with-output-to-string (bag)
16 (map nil (lambda (ch)
17 (cond ((char= ch #\_) (write-char #\- bag))
18 (t (write-char (char-upcase ch) bag))))
19 string)))
21 (defun canon-name-2 (string)
22 (with-output-to-string (bag)
23 (map nil (lambda (ch)
24 (cond ((char= ch #\_))
25 ((char= ch #\-))
26 (t (write-char (char-upcase ch) bag))))
27 string)))
29 (defmethod encoding-names ((encoding symbol))
30 (gethash encoding *names*))
32 (defmethod (setf encoding-names) (new-value (encoding symbol))
33 (setf (gethash encoding *names*) new-value))
35 (defun add-name (encoding name)
36 (pushnew (canon-name name) (encoding-names encoding) :test #'string=))
38 (defun resolve-name (string)
39 (cond ((symbolp string)
40 string)
42 (setq string (canon-name string))
43 (or
44 (block nil
45 (maphash (lambda (x y)
46 (when (member string y :test #'string=)
47 (return x)))
48 *names*)
49 nil)
50 (block nil
51 (maphash (lambda (x y)
52 (when (member string y
53 :test #'(lambda (x y)
54 (string= (canon-name-2 x)
55 (canon-name-2 y))))
56 (return x)))
57 *names*)
58 nil)))))
60 ;;;; ---------------------------------------------------------------------------
61 ;;;; Encodings
62 ;;;;
64 (defvar *encodings* (make-hash-table :test #'eq))
66 (defmacro define-encoding (name init-form)
67 `(progn
68 (setf (gethash ',name *encodings*)
69 (list nil (lambda () ,init-form)))
70 ',name))
72 (defun find-encoding (name)
73 (let ((x (gethash (resolve-name name) *encodings*)))
74 (and x
75 (or (first x)
76 (setf (first x) (funcall (second x)))))))
78 (defclass encoding () ())
80 (defclass simple-8-bit-encoding (encoding)
81 ((table :initarg :table)))
83 (defun make-simple-8-bit-encoding (&key charset)
84 (make-instance 'simple-8-bit-encoding
85 :table (coerce (to-unicode-table charset) '(simple-array (unsigned-byte 16) (256)))))
87 ;;;;;;;
89 (defmacro fx-op (op &rest xs)
90 `(the fixnum (,op ,@(mapcar (lambda (x) `(the fixnum ,x)) xs))))
91 (defmacro fx-pred (op &rest xs)
92 `(,op ,@(mapcar (lambda (x) `(the fixnum ,x)) xs)))
94 (defmacro %+ (&rest xs) `(fx-op + ,@xs))
95 (defmacro %- (&rest xs) `(fx-op - ,@xs))
96 (defmacro %* (&rest xs) `(fx-op * ,@xs))
97 (defmacro %/ (&rest xs) `(fx-op floor ,@xs))
98 (defmacro %and (&rest xs) `(fx-op logand ,@xs))
99 (defmacro %ior (&rest xs) `(fx-op logior ,@xs))
100 (defmacro %xor (&rest xs) `(fx-op logxor ,@xs))
101 (defmacro %ash (&rest xs) `(fx-op ash ,@xs))
102 (defmacro %mod (&rest xs) `(fx-op mod ,@xs))
104 (defmacro %= (&rest xs) `(fx-pred = ,@xs))
105 (defmacro %<= (&rest xs) `(fx-pred <= ,@xs))
106 (defmacro %>= (&rest xs) `(fx-pred >= ,@xs))
107 (defmacro %< (&rest xs) `(fx-pred < ,@xs))
108 (defmacro %> (&rest xs) `(fx-pred > ,@xs))
110 (defmethod decode-sequence ((encoding (eql :utf-16-big-endian))
111 in in-start in-end out out-start out-end eof?)
112 ;; -> new wptr, new rptr
113 (let ((wptr out-start)
114 (rptr in-start))
115 (loop
116 (when (%= wptr out-end)
117 (return))
118 (when (>= (%+ rptr 1) in-end)
119 (return))
120 (let ((hi (aref in rptr))
121 (lo (aref in (%+ 1 rptr))))
122 (setf rptr (%+ 2 rptr))
123 ;; FIXME: Wenn wir hier ein Surrogate sehen, muessen wir das naechste
124 ;; Zeichen abwarten und nachgucken, dass nicht etwa die andere
125 ;; Haelfte fehlt!
126 (setf (aref out wptr) (logior (ash hi 8) lo))
127 (setf wptr (%+ 1 wptr))))
128 (values wptr rptr)))
130 (defmethod decode-sequence ((encoding (eql :utf-16-little-endian))
131 in in-start in-end out out-start out-end eof?)
132 ;; -> new wptr, new rptr
133 (let ((wptr out-start)
134 (rptr in-start))
135 (loop
136 (when (%= wptr out-end)
137 (return))
138 (when (>= (%+ rptr 1) in-end)
139 (return))
140 (let ((lo (aref in (%+ 0 rptr)))
141 (hi (aref in (%+ 1 rptr))))
142 (setf rptr (%+ 2 rptr))
143 ;; FIXME: Wenn wir hier ein Surrogate sehen, muessen wir das naechste
144 ;; Zeichen abwarten und nachgucken, dass nicht etwa die andere
145 ;; Haelfte fehlt!
146 (setf (aref out wptr) (logior (ash hi 8) lo))
147 (setf wptr (%+ 1 wptr))))
148 (values wptr rptr)))
150 (defmethod decode-sequence ((encoding (eql :utf-8))
151 in in-start in-end out out-start out-end eof?)
152 (declare (optimize (speed 3) (safety 0))
153 (type (simple-array (unsigned-byte 8) (*)) in)
154 (type (simple-array (unsigned-byte 16) (*)) out)
155 (type fixnum in-start in-end out-start out-end))
156 (let ((wptr out-start)
157 (rptr in-start)
158 byte0)
159 (macrolet ((put (x)
160 `((lambda (x)
161 (when (or (<= #xD800 x #xDBFF)
162 (<= #xDC00 x #xDFFF))
163 (xerror "surrogate encoded in UTF-8: #x~x." x))
164 (cond ((%> x #xFFFF)
165 (setf (aref out (%+ 0 wptr)) (%+ #xD7C0 (ash x -10))
166 (aref out (%+ 1 wptr)) (%ior #xDC00 (%and x #x3FF)))
167 (setf wptr (%+ wptr 2)))
169 (setf (aref out wptr) x)
170 (setf wptr (%+ wptr 1)))))
171 ,x))
172 (put1 (x)
173 `(progn
174 (setf (aref out wptr) ,x)
175 (setf wptr (%+ wptr 1)))))
176 (loop
177 (when (%= (+ wptr 1) out-end) (return))
178 (when (%>= rptr in-end) (return))
179 (setq byte0 (aref in rptr))
180 (cond ((= byte0 #x0D)
181 ;; CR handling
182 ;; we need to know the following character
183 (cond ((>= (%+ rptr 1) in-end)
184 ;; no characters in buffer
185 (cond (eof?
186 ;; at EOF, pass it as NL
187 (put #x0A)
188 (setf rptr (%+ rptr 1)))
190 ;; demand more characters
191 (return))))
192 ((= (aref in (%+ rptr 1)) #x0A)
193 ;; we see CR NL, so forget this CR and the next NL will be
194 ;; inserted literally
195 (setf rptr (%+ rptr 1)))
197 ;; singleton CR, pass it as NL
198 (put #x0A)
199 (setf rptr (%+ rptr 1)))))
201 ((%<= #|#b00000000|# byte0 #b01111111)
202 (put1 byte0)
203 (setf rptr (%+ rptr 1)))
205 ((%<= #|#b10000000|# byte0 #b10111111)
206 (xerror "Corrupted UTF-8 input (initial byte was #b~8,'0B)" byte0)
207 (setf rptr (%+ rptr 1)))
209 ((%<= #|#b11000000|# byte0 #b11011111)
210 (cond ((< (%+ rptr 2) in-end)
211 (put
212 (dpb (ldb (byte 5 0) byte0) (byte 5 6)
213 (dpb (ldb (byte 6 0) (aref in (%+ rptr 1))) (byte 6 0)
214 0)))
215 (setf rptr (%+ rptr 2)))
217 (return))))
219 ((%<= #|#b11100000|# byte0 #b11101111)
220 (cond ((< (%+ rptr 3) in-end)
221 (put
222 (dpb (ldb (byte 4 0) byte0) (byte 4 12)
223 (dpb (ldb (byte 6 0) (aref in (%+ 1 rptr))) (byte 6 6)
224 (dpb (ldb (byte 6 0) (aref in (%+ 2 rptr))) (byte 6 0)
225 0))))
226 (setf rptr (%+ rptr 3)))
228 (return))))
230 ((%<= #|#b11110000|# byte0 #b11110111)
231 (cond ((< (%+ rptr 4) in-end)
232 (put
233 (dpb (ldb (byte 3 0) byte0) (byte 3 18)
234 (dpb (ldb (byte 6 0) (aref in (%+ 1 rptr))) (byte 6 12)
235 (dpb (ldb (byte 6 0) (aref in (%+ 2 rptr))) (byte 6 6)
236 (dpb (ldb (byte 6 0) (aref in (%+ 3 rptr))) (byte 6 0)
237 0)))))
238 (setf rptr (%+ rptr 4)))
240 (return))))
242 ((%<= #|#b11111000|# byte0 #b11111011)
243 (cond ((< (%+ rptr 5) in-end)
244 (put
245 (dpb (ldb (byte 2 0) byte0) (byte 2 24)
246 (dpb (ldb (byte 6 0) (aref in (%+ 1 rptr))) (byte 6 18)
247 (dpb (ldb (byte 6 0) (aref in (%+ 2 rptr))) (byte 6 12)
248 (dpb (ldb (byte 6 0) (aref in (%+ 3 rptr))) (byte 6 6)
249 (dpb (ldb (byte 6 0) (aref in (%+ 4 rptr))) (byte 6 0)
250 0))))))
251 (setf rptr (%+ rptr 5)))
253 (return))))
255 ((%<= #|#b11111100|# byte0 #b11111101)
256 (cond ((< (%+ rptr 6) in-end)
257 (put
258 (dpb (ldb (byte 1 0) byte0) (byte 1 30)
259 (dpb (ldb (byte 6 0) (aref in (%+ 1 rptr))) (byte 6 24)
260 (dpb (ldb (byte 6 0) (aref in (%+ 2 rptr))) (byte 6 18)
261 (dpb (ldb (byte 6 0) (aref in (%+ 3 rptr))) (byte 6 12)
262 (dpb (ldb (byte 6 0) (aref in (%+ 4 rptr))) (byte 6 6)
263 (dpb (ldb (byte 6 0) (aref in (%+ 5 rptr))) (byte 6 0)
264 0)))))))
265 (setf rptr (%+ rptr 6)))
267 (return))))
270 (xerror "Corrupted UTF-8 input (initial byte was #b~8,'0B)" byte0)) ) ))
271 (values wptr rptr)) )
273 (defmethod encoding-p ((object (eql :utf-16-little-endian))) t)
274 (defmethod encoding-p ((object (eql :utf-16-big-endian))) t)
275 (defmethod encoding-p ((object (eql :utf-8))) t)
277 (defmethod encoding-p ((object encoding)) t)
279 (defmethod decode-sequence ((encoding simple-8-bit-encoding)
280 in in-start in-end
281 out out-start out-end
282 eof?)
283 (declare (optimize (speed 3) (safety 0))
284 (type (simple-array (unsigned-byte 8) (*)) in)
285 (type (simple-array (unsigned-byte 16) (*)) out)
286 (type fixnum in-start in-end out-start out-end))
287 (let ((wptr out-start)
288 (rptr in-start)
289 (byte 0)
290 (table (slot-value encoding 'table)))
291 (declare (type fixnum wptr rptr)
292 (type (unsigned-byte 8) byte)
293 (type (simple-array (unsigned-byte 16) (*)) table))
294 (loop
295 (when (%= wptr out-end) (return))
296 (when (%>= rptr in-end) (return))
297 (setq byte (aref in rptr))
298 (cond ((= byte #x0D)
299 ;; CR handling
300 ;; we need to know the following character
301 (cond ((>= (%+ rptr 1) in-end)
302 ;; no characters in buffer
303 (cond (eof?
304 ;; at EOF, pass it as NL
305 (setf (aref out wptr) #x0A)
306 (setf wptr (%+ wptr 1))
307 (setf rptr (%+ rptr 1)))
309 ;; demand more characters
310 (return))))
311 ((= (aref in (%+ rptr 1)) #x0A)
312 ;; we see CR NL, so forget this CR and the next NL will be
313 ;; inserted literally
314 (setf rptr (%+ rptr 1)))
316 ;; singleton CR, pass it as NL
317 (setf (aref out wptr) #x0A)
318 (setf wptr (%+ wptr 1))
319 (setf rptr (%+ rptr 1)))))
322 (setf (aref out wptr) (aref table byte))
323 (setf wptr (%+ wptr 1))
324 (setf rptr (%+ rptr 1))) ))
325 (values wptr rptr)))
327 ;;;; ---------------------------------------------------------------------------
328 ;;;; Character sets
329 ;;;;
331 (defvar *charsets* (make-hash-table :test #'eq))
333 (defclass 8-bit-charset ()
334 ((name :initarg :name)
335 (to-unicode-table
336 :initarg :to-unicode-table
337 :reader to-unicode-table)))
339 (defmacro define-8-bit-charset (name &rest codes)
340 (assert (= 256 (length codes)))
341 `(progn
342 (setf (gethash ',name *charsets*)
343 (make-instance '8-bit-charset
344 :name ',name
345 :to-unicode-table
346 ',(make-array 256
347 :element-type '(unsigned-byte 16)
348 :initial-contents codes)))
349 ',name))
351 (defun find-charset (name)
352 (or (gethash name *charsets*)
353 (xerror "There is no character set named ~S." name)))