1.0.9.53: trivial typo fixes
[sbcl/lichteblau.git] / src / code / external-formats / mb-util.lisp
blobe60dfed55adbf745109576e300c5cbe134b73d7d
1 (in-package "SB!IMPL")
3 (defun make-multibyte-mapper (list)
4 (let ((list (sort (copy-list list) #'< :key #'car))
5 (hi (loop for x in list maximize (max (car x) (cadr x)))))
6 (make-array (list (length list) 2)
7 :element-type (list 'integer 0 hi)
8 :initial-contents list)))
10 (defmacro define-multibyte-mapper (name list)
11 `(defparameter ,name
12 (make-multibyte-mapper ,list)))
14 (defun get-multibyte-mapper (table code)
15 (declare (optimize speed (safety 0))
16 (type (array * (* 2)) table)
17 (type fixnum code))
18 (labels ((recur (start end)
19 (declare (type fixnum start end))
20 (let* ((m (ash (+ start end) -1))
21 (x (aref table m 0)))
22 (declare (type fixnum m x))
23 (cond ((= x code)
24 (aref table m 1))
25 ((and (< x code) (< m end))
26 (recur (1+ m) end))
27 ((and (> x code) (> m start))
28 (recur start (1- m)))))))
29 (recur 0 (1- (array-dimension table 0)))))
31 (eval-when (:compile-toplevel :load-toplevel :execute)
32 ;; FIXME: better to change make-od-name() to accept multiple
33 ;; arguments in octets.lisp?
34 (defun make-od-name-list (&rest syms)
35 (reduce #'make-od-name syms))
37 (defun define-bytes-per-mb-character-1 (accessor type format
38 mb-len mb-continuation-byte-p)
39 (let ((name (make-od-name-list 'bytes-per format 'character accessor))
40 (invalid-mb-starter-byte
41 (make-od-name-list 'invalid format 'starter-byte))
42 (invalid-mb-continuation-byte
43 (make-od-name-list 'invalid format 'continuation-byte)))
44 `(progn
45 ;;(declaim (inline ,name))
46 (defun ,name (array pos end)
47 (declare (optimize speed (safety 0))
48 (type ,type array)
49 (type array-range pos end))
50 ;; returns the number of bytes consumed and nil if it's a
51 ;; valid character or the number of bytes consumed and a
52 ;; replacement string if it's not.
53 (let ((initial-byte (,accessor array pos))
54 (reject-reason nil)
55 (reject-position pos)
56 (remaining-bytes (- end pos)))
57 (declare (type array-range reject-position remaining-bytes))
58 (labels ((valid-starter-byte-p (b)
59 (declare (type (unsigned-byte 8) b))
60 (let ((ok (,mb-len b)))
61 (unless ok
62 (setf reject-reason ',invalid-mb-starter-byte))
63 ok))
64 (enough-bytes-left-p (x)
65 (let ((ok (> end (+ pos (1- x)))))
66 (unless ok
67 (setf reject-reason 'end-of-input-in-character))
68 ok))
69 (valid-secondary-p (x)
70 (let* ((idx (the array-range (+ pos x)))
71 (b (,accessor array idx))
72 (ok (,mb-continuation-byte-p b)))
73 (unless ok
74 (setf reject-reason ',invalid-mb-continuation-byte)
75 (setf reject-position idx))
76 ok))
77 (preliminary-ok-for-length (maybe-len len)
78 (and (eql maybe-len len)
79 ;; Has to be done in this order so that
80 ;; certain broken sequences (e.g., the
81 ;; two-byte sequence `"initial (length 3)"
82 ;; "non-continuation"' -- `#xef #x32')
83 ;; signal only part of that sequence as
84 ;; erroneous.
85 (loop for i from 1 below (min len remaining-bytes)
86 always (valid-secondary-p i))
87 (enough-bytes-left-p len))))
88 (declare (inline valid-starter-byte-p
89 enough-bytes-left-p
90 valid-secondary-p
91 preliminary-ok-for-length))
92 (let ((maybe-len (valid-starter-byte-p initial-byte)))
93 (cond ((eql maybe-len 1)
94 (values 1 nil))
95 ((preliminary-ok-for-length maybe-len 2)
96 (values 2 nil))
97 ((preliminary-ok-for-length maybe-len 3)
98 (values 3 nil))
100 (let* ((bad-end (ecase reject-reason
101 (,invalid-mb-starter-byte
102 (1+ pos))
103 (end-of-input-in-character
104 end)
105 (,invalid-mb-continuation-byte
106 reject-position)))
107 (bad-len (- bad-end pos)))
108 (declare (type array-range bad-end bad-len))
109 (let ((replacement (decoding-error array pos bad-end ,format reject-reason reject-position)))
110 (values bad-len replacement))))))))))))
112 (defun define-simple-get-mb-char-1 (accessor type format mb-to-ucs)
113 (let ((name (make-od-name-list 'simple-get format 'char accessor))
114 (malformed (make-od-name 'malformed format)))
115 `(progn
116 (declaim (inline ,name))
117 (defun ,name (array pos bytes)
118 (declare (optimize speed (safety 0))
119 (type ,type array)
120 (type array-range pos)
121 (type (integer 1 3) bytes))
122 (flet ((cref (x)
123 (,accessor array (the array-range (+ pos x)))))
124 (declare (inline cref))
125 (let ((code (,mb-to-ucs (ecase bytes
126 (1 (cref 0))
127 (2 (logior (ash (cref 0) 8) (cref 1)))
128 (3 (logior (ash (cref 0) 16)
129 (ash (cref 1) 8)
130 (cref 2)))))))
131 (if code
132 (code-char code)
133 (decoding-error array pos (+ pos bytes) ,format
134 ',malformed pos))))))))
136 (defun define-mb->string-1 (accessor type format)
137 (let ((name
138 (make-od-name-list format '>string accessor))
139 (bytes-per-mb-character
140 (make-od-name-list 'bytes-per format 'character accessor))
141 (simple-get-mb-char
142 (make-od-name-list 'simple-get format 'char accessor)))
143 `(progn
144 (defun ,name (array astart aend)
145 (declare (optimize speed (safety 0))
146 (type ,type array)
147 (type array-range astart aend))
148 (let ((string (make-array 0 :adjustable t :fill-pointer 0 :element-type 'character)))
149 (loop with pos = astart
150 while (< pos aend)
151 do (multiple-value-bind (bytes invalid)
152 (,bytes-per-mb-character array pos aend)
153 (declare (type (or null string) invalid))
154 (cond
155 ((null invalid)
156 (vector-push-extend (,simple-get-mb-char array pos bytes) string))
158 (dotimes (i (length invalid))
159 (vector-push-extend (char invalid i) string))))
160 (incf pos bytes)))
161 (coerce string 'simple-string))))))
163 (declaim (inline mb-char-len))
164 (defun mb-char-len (code)
165 (declare (optimize speed (safety 0))
166 (type fixnum code))
167 (cond ((< code 0) (bug "can't happen"))
168 ((< code #x100) 1)
169 ((< code #x10000) 2)
170 ((< code #x1000000) 3)
171 (t (bug "can't happen"))))
174 (defmacro define-multibyte-encoding (format aliases
175 ucs-to-mb mb-to-ucs
176 mb-len mb-continuation-byte-p)
177 (let ((char->mb (make-od-name 'char-> format))
178 (string->mb (make-od-name 'string-> format))
179 (define-bytes-per-mb-character
180 (make-od-name-list 'define-bytes-per format 'character))
181 (define-simple-get-mb-char
182 (make-od-name-list 'define-simple-get format 'char))
183 (define-mb->string
184 (make-od-name-list 'define format '>string)))
185 `(progn
186 ;; for fd-stream.lisp
187 (define-external-format/variable-width ,aliases t
188 (mb-char-len (or (,ucs-to-mb (char-code byte)) -1))
189 (let ((mb (,ucs-to-mb bits)))
190 (if (null mb)
191 (external-format-encoding-error stream byte)
192 (ecase size
193 (1 (setf (sap-ref-8 sap tail) mb))
194 (2 (setf (sap-ref-8 sap tail) (ldb (byte 8 8) mb)
195 (sap-ref-8 sap (1+ tail)) (ldb (byte 8 0) mb)))
196 (3 (setf (sap-ref-8 sap tail) (ldb (byte 8 16) mb)
197 (sap-ref-8 sap (1+ tail)) (ldb (byte 8 8) mb)
198 (sap-ref-8 sap (+ 2 tail)) (ldb (byte 8 0) mb))))))
199 (,mb-len byte)
200 (let* ((mb (ecase size
201 (1 byte)
202 (2 (let ((byte2 (sap-ref-8 sap (1+ head))))
203 (unless (,mb-continuation-byte-p byte2)
204 (return-from decode-break-reason 2))
205 (dpb byte (byte 8 8) byte2)))
206 (3 (let ((byte2 (sap-ref-8 sap (1+ head)))
207 (byte3 (sap-ref-8 sap (+ 2 head))))
208 (unless (,mb-continuation-byte-p byte2)
209 (return-from decode-break-reason 2))
210 (unless (,mb-continuation-byte-p byte3)
211 (return-from decode-break-reason 3))
212 (dpb byte (byte 8 16) (dpb byte2 (byte 8 8) byte3))))))
213 (ucs (,mb-to-ucs mb)))
214 (if (null ucs)
215 (return-from decode-break-reason 1)
216 (code-char ucs))))
218 ;; for octets.lisp
219 (define-condition ,(make-od-name 'malformed format)
220 (octet-decoding-error) ())
221 (define-condition ,(make-od-name-list 'invalid format 'starter-byte)
222 (octet-decoding-error) ())
223 (define-condition ,(make-od-name-list 'invalid format 'continuation-byte)
224 (octet-decoding-error) ())
226 (declaim (inline ,char->mb))
227 (defun ,char->mb (char dest string pos)
228 (declare (optimize speed (safety 0))
229 (type (array (unsigned-byte 8) (*)) dest))
230 (let ((code (,ucs-to-mb (char-code char))))
231 (if code
232 (flet ((add-byte (b)
233 (declare (type (unsigned-byte 8) b))
234 (vector-push-extend b dest)))
235 (declare (inline add-byte))
236 (setf code (the fixnum code))
237 (ecase (mb-char-len code)
239 (add-byte code))
241 (add-byte (ldb (byte 8 8) code))
242 (add-byte (ldb (byte 8 0) code)))
244 (add-byte (ldb (byte 8 16) code))
245 (add-byte (ldb (byte 8 8) code))
246 (add-byte (ldb (byte 8 0) code)))))
247 (encoding-error ,format string pos))))
249 (defun ,string->mb (string sstart send additional-space)
250 (declare (optimize speed (safety 0))
251 (type simple-string string)
252 (type array-range sstart send additional-space))
253 (let ((array (make-array (+ additional-space (- send sstart))
254 :element-type '(unsigned-byte 8)
255 :adjustable t
256 :fill-pointer 0)))
257 (loop for i from sstart below send
258 do (,char->mb (char string i) array string i))
259 (dotimes (i additional-space)
260 (vector-push-extend 0 array))
261 (coerce array '(simple-array (unsigned-byte 8) (*)))))
263 (defmacro ,define-bytes-per-mb-character (accessor type)
264 (define-bytes-per-mb-character-1 accessor type ',format
265 ',mb-len ',mb-continuation-byte-p))
267 (instantiate-octets-definition ,define-bytes-per-mb-character)
269 (defmacro ,define-simple-get-mb-char (accessor type)
270 (define-simple-get-mb-char-1 accessor type ',format ',mb-to-ucs))
272 (instantiate-octets-definition ,define-simple-get-mb-char)
274 (defmacro ,define-mb->string (accessor type)
275 (define-mb->string-1 accessor type ',format))
277 (instantiate-octets-definition ,define-mb->string)
279 (add-external-format-funs ',aliases
280 '(,(make-od-name format '>string-aref)
281 ,string->mb))