Defer compililng external-formats until warm load
[sbcl.git] / src / code / external-formats / mb-util.lisp
blob0ffca0112ee64a3499dbbd2167cd01fe8a77cf28
1 (in-package "SB-IMPL")
3 (defmacro define-multibyte-mapper (name list)
4 (let ((list (sort (copy-list list) #'< :key #'car))
5 (hi (loop for x in list maximize (max (car x) (cadr x)))))
6 `(defconstant-eqx ,name
7 (make-array '(,(length list) 2)
8 :element-type '(integer 0 ,hi)
9 :initial-contents ',list)
10 #'equalp)))
12 (defun get-multibyte-mapper (table code)
13 (declare (optimize speed (safety 0))
14 (type (array * (* 2)) table)
15 (type fixnum code))
16 (labels ((recur (start end)
17 (declare (type fixnum start end))
18 (let* ((m (ash (+ start end) -1))
19 (x (aref table m 0)))
20 (declare (type fixnum m x))
21 (cond ((= x code)
22 (aref table m 1))
23 ((and (< x code) (< m end))
24 (recur (1+ m) end))
25 ((and (> x code) (> m start))
26 (recur start (1- m)))))))
27 (recur 0 (1- (array-dimension table 0)))))
29 (eval-when (:compile-toplevel :load-toplevel :execute)
30 ;; FIXME: better to change make-od-name() to accept multiple
31 ;; arguments in octets.lisp?
32 (defun make-od-name-list (&rest syms)
33 (reduce #'make-od-name syms))
35 (defun define-bytes-per-mb-character-1 (accessor type format
36 mb-len mb-continuation-byte-p)
37 (let ((name (make-od-name-list 'bytes-per format 'character accessor))
38 (invalid-mb-starter-byte
39 (make-od-name-list 'invalid format 'starter-byte))
40 (invalid-mb-continuation-byte
41 (make-od-name-list 'invalid format 'continuation-byte)))
42 `(progn
43 ;;(declaim (inline ,name))
44 (defun ,name (array pos end)
45 (declare (optimize speed (safety 0))
46 (type ,type array)
47 (type array-range pos end))
48 ;; returns the number of bytes consumed and nil if it's a
49 ;; valid character or the number of bytes consumed and a
50 ;; replacement string if it's not.
51 (let ((initial-byte (,accessor array pos))
52 (reject-reason nil)
53 (reject-position pos)
54 (remaining-bytes (- end pos)))
55 (declare (type array-range reject-position remaining-bytes))
56 (labels ((valid-starter-byte-p (b)
57 (declare (type (unsigned-byte 8) b))
58 (let ((ok (,mb-len b)))
59 (unless ok
60 (setf reject-reason ',invalid-mb-starter-byte))
61 ok))
62 (enough-bytes-left-p (x)
63 (let ((ok (> end (+ pos (1- x)))))
64 (unless ok
65 (setf reject-reason 'end-of-input-in-character))
66 ok))
67 (valid-secondary-p (x)
68 (let* ((idx (the array-range (+ pos x)))
69 (b (,accessor array idx))
70 (ok (,mb-continuation-byte-p b)))
71 (unless ok
72 (setf reject-reason ',invalid-mb-continuation-byte)
73 (setf reject-position idx))
74 ok))
75 (preliminary-ok-for-length (maybe-len len)
76 (and (eql maybe-len len)
77 ;; Has to be done in this order so that
78 ;; certain broken sequences (e.g., the
79 ;; two-byte sequence `"initial (length 3)"
80 ;; "non-continuation"' -- `#xef #x32')
81 ;; signal only part of that sequence as
82 ;; erroneous.
83 (loop for i from 1 below (min len remaining-bytes)
84 always (valid-secondary-p i))
85 (enough-bytes-left-p len))))
86 (declare (inline valid-starter-byte-p
87 enough-bytes-left-p
88 valid-secondary-p
89 preliminary-ok-for-length))
90 (let ((maybe-len (valid-starter-byte-p initial-byte)))
91 (cond ((eql maybe-len 1)
92 (values 1 nil))
93 ((preliminary-ok-for-length maybe-len 2)
94 (values 2 nil))
95 ((preliminary-ok-for-length maybe-len 3)
96 (values 3 nil))
98 (let* ((bad-end (ecase reject-reason
99 (,invalid-mb-starter-byte
100 (1+ pos))
101 (end-of-input-in-character
102 end)
103 (,invalid-mb-continuation-byte
104 reject-position)))
105 (bad-len (- bad-end pos)))
106 (declare (type array-range bad-end bad-len))
107 (let ((replacement (decoding-error array pos bad-end ,format reject-reason reject-position)))
108 (values bad-len replacement))))))))))))
110 (defun define-simple-get-mb-char-1 (accessor type format mb-to-ucs)
111 (let ((name (make-od-name-list 'simple-get format 'char accessor))
112 (malformed (make-od-name 'malformed format)))
113 `(progn
114 (declaim (inline ,name))
115 (defun ,name (array pos bytes)
116 (declare (optimize speed (safety 0))
117 (type ,type array)
118 (type array-range pos)
119 (type (integer 1 3) bytes))
120 (flet ((cref (x)
121 (,accessor array (the array-range (+ pos x)))))
122 (declare (inline cref))
123 (let ((code (,mb-to-ucs (ecase bytes
124 (1 (cref 0))
125 (2 (logior (ash (cref 0) 8) (cref 1)))
126 (3 (logior (ash (cref 0) 16)
127 (ash (cref 1) 8)
128 (cref 2)))))))
129 (if code
130 (code-char code)
131 (decoding-error array pos (+ pos bytes) ,format
132 ',malformed pos))))))))
134 (defun define-mb->string-1 (accessor type format)
135 (let ((name
136 (make-od-name-list format '>string accessor))
137 (bytes-per-mb-character
138 (make-od-name-list 'bytes-per format 'character accessor))
139 (simple-get-mb-char
140 (make-od-name-list 'simple-get format 'char accessor)))
141 `(progn
142 (defun ,name (array astart aend)
143 (declare (optimize speed (safety 0))
144 (type ,type array)
145 (type array-range astart aend))
146 (let ((string (make-array 0 :adjustable t :fill-pointer 0 :element-type 'character)))
147 (loop with pos = astart
148 while (< pos aend)
149 do (multiple-value-bind (bytes invalid)
150 (,bytes-per-mb-character array pos aend)
151 (declare (type (or null string) invalid))
152 (cond
153 ((null invalid)
154 (let ((thing (,simple-get-mb-char array pos bytes)))
155 (typecase thing
156 (character (vector-push-extend thing string))
157 (string
158 (dotimes (i (length thing))
159 (vector-push-extend (char thing i) string))))))
161 (dotimes (i (length invalid))
162 (vector-push-extend (char invalid i) string))))
163 (incf pos bytes)))
164 (coerce string 'simple-string))))))
166 (declaim (inline mb-char-len))
167 (defun mb-char-len (code)
168 (declare (optimize speed (safety 0))
169 (type fixnum code))
170 (cond ((< code 0) (bug "can't happen"))
171 ((< code #x100) 1)
172 ((< code #x10000) 2)
173 ((< code #x1000000) 3)
174 (t (bug "can't happen"))))
177 (defmacro define-multibyte-encoding (format aliases
178 ucs-to-mb mb-to-ucs
179 mb-len mb-continuation-byte-p)
180 (let ((char->mb (make-od-name 'char-> format))
181 (string->mb (make-od-name 'string-> format))
182 (define-bytes-per-mb-character
183 (make-od-name-list 'define-bytes-per format 'character))
184 (define-simple-get-mb-char
185 (make-od-name-list 'define-simple-get format 'char))
186 (define-mb->string
187 (make-od-name-list 'define format '>string)))
188 `(progn
189 ;; for octets.lisp
190 (define-condition ,(make-od-name 'malformed format)
191 (octet-decoding-error) ())
192 (define-condition ,(make-od-name-list 'invalid format 'starter-byte)
193 (octet-decoding-error) ())
194 (define-condition ,(make-od-name-list 'invalid format 'continuation-byte)
195 (octet-decoding-error) ())
197 (declaim (inline ,char->mb))
198 (defun ,char->mb (char dest string pos)
199 (declare (optimize speed (safety 0))
200 (type (array (unsigned-byte 8) (*)) dest))
201 (let ((code (,ucs-to-mb (char-code char))))
202 (if code
203 (flet ((add-byte (b)
204 (declare (type (unsigned-byte 8) b))
205 (vector-push-extend b dest)))
206 (declare (inline add-byte))
207 (setf code (the fixnum code))
208 (ecase (mb-char-len code)
210 (add-byte code))
212 (add-byte (ldb (byte 8 8) code))
213 (add-byte (ldb (byte 8 0) code)))
215 (add-byte (ldb (byte 8 16) code))
216 (add-byte (ldb (byte 8 8) code))
217 (add-byte (ldb (byte 8 0) code)))))
218 (encoding-error ,format string pos))))
220 (defun ,string->mb (string sstart send additional-space)
221 (declare (optimize speed (safety 0))
222 (type simple-string string)
223 (type array-range sstart send additional-space))
224 (let ((array (make-array (+ additional-space (- send sstart))
225 :element-type '(unsigned-byte 8)
226 :adjustable t
227 :fill-pointer 0)))
228 (loop for i from sstart below send
229 do (,char->mb (char string i) array string i))
230 (dotimes (i additional-space)
231 (vector-push-extend 0 array))
232 (coerce array '(simple-array (unsigned-byte 8) (*)))))
234 (defmacro ,define-bytes-per-mb-character (accessor type)
235 (define-bytes-per-mb-character-1 accessor type ',format
236 ',mb-len ',mb-continuation-byte-p))
238 (instantiate-octets-definition ,define-bytes-per-mb-character)
240 (defmacro ,define-simple-get-mb-char (accessor type)
241 (define-simple-get-mb-char-1 accessor type ',format ',mb-to-ucs))
243 (instantiate-octets-definition ,define-simple-get-mb-char)
245 (defmacro ,define-mb->string (accessor type)
246 (define-mb->string-1 accessor type ',format))
248 (instantiate-octets-definition ,define-mb->string)
250 ;; for fd-stream.lisp
251 (define-external-format/variable-width ,aliases t
252 ;; KLUDGE: it so happens that at present (2009-10-22) none of
253 ;; the external formats defined with
254 ;; define-multibyte-encoding can encode the unicode
255 ;; replacement character, so we hardcode the preferred
256 ;; replacement here.
258 (block size
259 (mb-char-len (or (,ucs-to-mb (char-code byte))
260 (return-from size 0))))
261 (let ((mb (,ucs-to-mb bits)))
262 (if (null mb)
263 (external-format-encoding-error stream byte)
264 (ecase size
265 (1 (setf (sap-ref-8 sap tail) mb))
266 (2 (setf (sap-ref-8 sap tail) (ldb (byte 8 8) mb)
267 (sap-ref-8 sap (1+ tail)) (ldb (byte 8 0) mb)))
268 (3 (setf (sap-ref-8 sap tail) (ldb (byte 8 16) mb)
269 (sap-ref-8 sap (1+ tail)) (ldb (byte 8 8) mb)
270 (sap-ref-8 sap (+ 2 tail)) (ldb (byte 8 0) mb))))))
271 (1 (,mb-len byte))
272 (let* ((mb (ecase size
273 (1 byte)
274 (2 (let ((byte2 (sap-ref-8 sap (1+ head))))
275 (unless (,mb-continuation-byte-p byte2)
276 (return-from decode-break-reason 2))
277 (dpb byte (byte 8 8) byte2)))
278 (3 (let ((byte2 (sap-ref-8 sap (1+ head)))
279 (byte3 (sap-ref-8 sap (+ 2 head))))
280 (unless (,mb-continuation-byte-p byte2)
281 (return-from decode-break-reason 2))
282 (unless (,mb-continuation-byte-p byte3)
283 (return-from decode-break-reason 3))
284 (dpb byte (byte 8 16) (dpb byte2 (byte 8 8) byte3))))))
285 (ucs (,mb-to-ucs mb)))
286 (if (null ucs)
287 (return-from decode-break-reason 1)
288 (code-char ucs)))
289 ,(make-od-name format '>string-aref)
290 ,string->mb))))