3 (defmacro define-multibyte-mapper
(name list
)
4 (let ((list (sort (copy-list list
) #'< :key
#'car
))
5 (hi (loop for x in list maximize
(max (car x
) (cadr x
)))))
6 `(defconstant-eqx ,name
7 (make-array '(,(length list
) 2)
8 :element-type
'(integer 0 ,hi
)
9 :initial-contents
',list
)
12 (defun get-multibyte-mapper (table code
)
13 (declare (optimize speed
(safety 0))
14 (type (array * (* 2)) table
)
16 (labels ((recur (start end
)
17 (declare (type fixnum start end
))
18 (let* ((m (ash (+ start end
) -
1))
20 (declare (type fixnum m x
))
23 ((and (< x code
) (< m end
))
25 ((and (> x code
) (> m start
))
26 (recur start
(1- m
)))))))
27 (recur 0 (1- (array-dimension table
0)))))
29 (eval-when (:compile-toplevel
:load-toplevel
:execute
)
30 ;; FIXME: better to change make-od-name() to accept multiple
31 ;; arguments in octets.lisp?
32 (defun make-od-name-list (&rest syms
)
33 (reduce #'make-od-name syms
))
35 (defun define-bytes-per-mb-character-1 (accessor type format
36 mb-len mb-continuation-byte-p
)
37 (let ((name (make-od-name-list 'bytes-per format
'character accessor
))
38 (invalid-mb-starter-byte
39 (make-od-name-list 'invalid format
'starter-byte
))
40 (invalid-mb-continuation-byte
41 (make-od-name-list 'invalid format
'continuation-byte
)))
43 ;;(declaim (inline ,name))
44 (defun ,name
(array pos end
)
45 (declare (optimize speed
(safety 0))
47 (type array-range pos end
))
48 ;; returns the number of bytes consumed and nil if it's a
49 ;; valid character or the number of bytes consumed and a
50 ;; replacement string if it's not.
51 (let ((initial-byte (,accessor array pos
))
54 (remaining-bytes (- end pos
)))
55 (declare (type array-range reject-position remaining-bytes
))
56 (labels ((valid-starter-byte-p (b)
57 (declare (type (unsigned-byte 8) b
))
58 (let ((ok (,mb-len b
)))
60 (setf reject-reason
',invalid-mb-starter-byte
))
62 (enough-bytes-left-p (x)
63 (let ((ok (> end
(+ pos
(1- x
)))))
65 (setf reject-reason
'end-of-input-in-character
))
67 (valid-secondary-p (x)
68 (let* ((idx (the array-range
(+ pos x
)))
69 (b (,accessor array idx
))
70 (ok (,mb-continuation-byte-p b
)))
72 (setf reject-reason
',invalid-mb-continuation-byte
)
73 (setf reject-position idx
))
75 (preliminary-ok-for-length (maybe-len len
)
76 (and (eql maybe-len len
)
77 ;; Has to be done in this order so that
78 ;; certain broken sequences (e.g., the
79 ;; two-byte sequence `"initial (length 3)"
80 ;; "non-continuation"' -- `#xef #x32')
81 ;; signal only part of that sequence as
83 (loop for i from
1 below
(min len remaining-bytes
)
84 always
(valid-secondary-p i
))
85 (enough-bytes-left-p len
))))
86 (declare (inline valid-starter-byte-p
89 preliminary-ok-for-length
))
90 (let ((maybe-len (valid-starter-byte-p initial-byte
)))
91 (cond ((eql maybe-len
1)
93 ((preliminary-ok-for-length maybe-len
2)
95 ((preliminary-ok-for-length maybe-len
3)
98 (let* ((bad-end (ecase reject-reason
99 (,invalid-mb-starter-byte
101 (end-of-input-in-character
103 (,invalid-mb-continuation-byte
105 (bad-len (- bad-end pos
)))
106 (declare (type array-range bad-end bad-len
))
107 (let ((replacement (decoding-error array pos bad-end
,format reject-reason reject-position
)))
108 (values bad-len replacement
))))))))))))
110 (defun define-simple-get-mb-char-1 (accessor type format mb-to-ucs
)
111 (let ((name (make-od-name-list 'simple-get format
'char accessor
))
112 (malformed (make-od-name 'malformed format
)))
114 (declaim (inline ,name
))
115 (defun ,name
(array pos bytes
)
116 (declare (optimize speed
(safety 0))
118 (type array-range pos
)
119 (type (integer 1 3) bytes
))
121 (,accessor array
(the array-range
(+ pos x
)))))
122 (declare (inline cref
))
123 (let ((code (,mb-to-ucs
(ecase bytes
125 (2 (logior (ash (cref 0) 8) (cref 1)))
126 (3 (logior (ash (cref 0) 16)
131 (decoding-error array pos
(+ pos bytes
) ,format
132 ',malformed pos
))))))))
134 (defun define-mb->string-1
(accessor type format
)
136 (make-od-name-list format
'>string accessor
))
137 (bytes-per-mb-character
138 (make-od-name-list 'bytes-per format
'character accessor
))
140 (make-od-name-list 'simple-get format
'char accessor
)))
142 (defun ,name
(array astart aend
)
143 (declare (optimize speed
(safety 0))
145 (type array-range astart aend
))
146 (let ((string (make-array 0 :adjustable t
:fill-pointer
0 :element-type
'character
)))
147 (loop with pos
= astart
149 do
(multiple-value-bind (bytes invalid
)
150 (,bytes-per-mb-character array pos aend
)
151 (declare (type (or null string
) invalid
))
154 (let ((thing (,simple-get-mb-char array pos bytes
)))
156 (character (vector-push-extend thing string
))
158 (dotimes (i (length thing
))
159 (vector-push-extend (char thing i
) string
))))))
161 (dotimes (i (length invalid
))
162 (vector-push-extend (char invalid i
) string
))))
164 (coerce string
'simple-string
))))))
166 (declaim (inline mb-char-len
))
167 (defun mb-char-len (code)
168 (declare (optimize speed
(safety 0))
170 (cond ((< code
0) (bug "can't happen"))
173 ((< code
#x1000000
) 3)
174 (t (bug "can't happen"))))
177 (defmacro define-multibyte-encoding
(format aliases
179 mb-len mb-continuation-byte-p
)
180 (let ((char->mb
(make-od-name 'char-
> format
))
181 (string->mb
(make-od-name 'string-
> format
))
182 (define-bytes-per-mb-character
183 (make-od-name-list 'define-bytes-per format
'character
))
184 (define-simple-get-mb-char
185 (make-od-name-list 'define-simple-get format
'char
))
187 (make-od-name-list 'define format
'>string
)))
190 (define-condition ,(make-od-name 'malformed format
)
191 (octet-decoding-error) ())
192 (define-condition ,(make-od-name-list 'invalid format
'starter-byte
)
193 (octet-decoding-error) ())
194 (define-condition ,(make-od-name-list 'invalid format
'continuation-byte
)
195 (octet-decoding-error) ())
197 (declaim (inline ,char-
>mb
))
198 (defun ,char-
>mb
(char dest string pos
)
199 (declare (optimize speed
(safety 0))
200 (type (array (unsigned-byte 8) (*)) dest
))
201 (let ((code (,ucs-to-mb
(char-code char
))))
204 (declare (type (unsigned-byte 8) b
))
205 (vector-push-extend b dest
)))
206 (declare (inline add-byte
))
207 (setf code
(the fixnum code
))
208 (ecase (mb-char-len code
)
212 (add-byte (ldb (byte 8 8) code
))
213 (add-byte (ldb (byte 8 0) code
)))
215 (add-byte (ldb (byte 8 16) code
))
216 (add-byte (ldb (byte 8 8) code
))
217 (add-byte (ldb (byte 8 0) code
)))))
218 (encoding-error ,format string pos
))))
220 (defun ,string-
>mb
(string sstart send additional-space
)
221 (declare (optimize speed
(safety 0))
222 (type simple-string string
)
223 (type array-range sstart send additional-space
))
224 (let ((array (make-array (+ additional-space
(- send sstart
))
225 :element-type
'(unsigned-byte 8)
228 (loop for i from sstart below send
229 do
(,char-
>mb
(char string i
) array string i
))
230 (dotimes (i additional-space
)
231 (vector-push-extend 0 array
))
232 (coerce array
'(simple-array (unsigned-byte 8) (*)))))
234 (defmacro ,define-bytes-per-mb-character
(accessor type
)
235 (define-bytes-per-mb-character-1 accessor type
',format
236 ',mb-len
',mb-continuation-byte-p
))
238 (instantiate-octets-definition ,define-bytes-per-mb-character
)
240 (defmacro ,define-simple-get-mb-char
(accessor type
)
241 (define-simple-get-mb-char-1 accessor type
',format
',mb-to-ucs
))
243 (instantiate-octets-definition ,define-simple-get-mb-char
)
245 (defmacro ,define-mb-
>string
(accessor type
)
246 (define-mb->string-1 accessor type
',format
))
248 (instantiate-octets-definition ,define-mb-
>string
)
250 ;; for fd-stream.lisp
251 (define-external-format/variable-width
,aliases t
252 ;; KLUDGE: it so happens that at present (2009-10-22) none of
253 ;; the external formats defined with
254 ;; define-multibyte-encoding can encode the unicode
255 ;; replacement character, so we hardcode the preferred
259 (mb-char-len (or (,ucs-to-mb
(char-code byte
))
260 (return-from size
0))))
261 (let ((mb (,ucs-to-mb bits
)))
263 (external-format-encoding-error stream byte
)
265 (1 (setf (sap-ref-8 sap tail
) mb
))
266 (2 (setf (sap-ref-8 sap tail
) (ldb (byte 8 8) mb
)
267 (sap-ref-8 sap
(1+ tail
)) (ldb (byte 8 0) mb
)))
268 (3 (setf (sap-ref-8 sap tail
) (ldb (byte 8 16) mb
)
269 (sap-ref-8 sap
(1+ tail
)) (ldb (byte 8 8) mb
)
270 (sap-ref-8 sap
(+ 2 tail
)) (ldb (byte 8 0) mb
))))))
272 (let* ((mb (ecase size
274 (2 (let ((byte2 (sap-ref-8 sap
(1+ head
))))
275 (unless (,mb-continuation-byte-p byte2
)
276 (return-from decode-break-reason
2))
277 (dpb byte
(byte 8 8) byte2
)))
278 (3 (let ((byte2 (sap-ref-8 sap
(1+ head
)))
279 (byte3 (sap-ref-8 sap
(+ 2 head
))))
280 (unless (,mb-continuation-byte-p byte2
)
281 (return-from decode-break-reason
2))
282 (unless (,mb-continuation-byte-p byte3
)
283 (return-from decode-break-reason
3))
284 (dpb byte
(byte 8 16) (dpb byte2
(byte 8 8) byte3
))))))
285 (ucs (,mb-to-ucs mb
)))
287 (return-from decode-break-reason
1)
289 ,(make-od-name format
'>string-aref
)