1 ;;;; Unicode Transformation Format (UTF) encodings
3 ;;;; In our interpretation, these are distinct from UCS encodings: the
4 ;;;; UCS encodings are a direct encoding of the code point, in 16- and
5 ;;;; 32-bit variants; by contrast, the UTF encodings handle Unicode
6 ;;;; surrogate code points specially.
8 ;;;; This software is part of the SBCL system. See the README file for
11 ;;;; This software is derived from the CMU CL system, which was
12 ;;;; written at Carnegie Mellon University and released into the
13 ;;;; public domain. The software is in the public domain and is
14 ;;;; provided with absolutely no warranty. See the COPYING and CREDITS
15 ;;;; files for more information.
17 (in-package "SB-IMPL")
20 (declaim (inline utf-noncharacter-code-p
))
21 (defun utf-noncharacter-code-p (code)
22 (or (<= #xd800 code
#xdfff
)
23 (<= #xfdd0 code
#xfdef
)
24 (= (logand code
#xfffe
) #xfffe
)))
26 ;;; Conversion to UTF-16{LE,BE}
27 (declaim (inline char-
>utf-16le
))
28 (defun char->utf-16le
(char dest string pos
)
29 (declare (optimize speed
(safety 0))
30 (type (array (unsigned-byte 8) (*)) dest
))
31 (let ((code (char-code char
)))
32 (if (utf-noncharacter-code-p code
)
33 (let ((replacement (encoding-error :utf-16le string pos
)))
34 (declare (type (simple-array (unsigned-byte 8) (*)) replacement
))
35 (dotimes (i (length replacement
))
36 (vector-push-extend (aref replacement i
) dest
)))
38 (declare (type (unsigned-byte 8) b
))
39 (vector-push-extend b dest
)))
40 (declare (inline add-byte
))
43 (add-byte (ldb (byte 8 0) code
))
44 (add-byte (ldb (byte 8 8) code
)))
46 (let* ((codeoid (- code
#x10000
))
47 (high (dpb (ldb (byte 10 10) codeoid
) (byte 10 0) #xd800
))
48 (low (dpb (ldb (byte 10 0) codeoid
) (byte 10 0) #xdc00
)))
49 (add-byte (ldb (byte 8 0) high
))
50 (add-byte (ldb (byte 8 8) high
))
51 (add-byte (ldb (byte 8 0) low
))
52 (add-byte (ldb (byte 8 8) low
)))))))))
54 (declaim (inline char-
>utf-16be
))
55 (defun char->utf-16be
(char dest string pos
)
56 (declare (optimize speed
(safety 0))
57 (type (array (unsigned-byte 8) (*)) dest
))
58 (let ((code (char-code char
)))
59 (if (utf-noncharacter-code-p code
)
60 (let ((replacement (encoding-error :utf-16be string pos
)))
61 (declare (type (simple-array (unsigned-byte 8) (*)) replacement
))
62 (dotimes (i (length replacement
))
63 (vector-push-extend (aref replacement i
) dest
)))
65 (declare (type (unsigned-byte 8) b
))
66 (vector-push-extend b dest
)))
67 (declare (inline add-byte
))
70 (add-byte (ldb (byte 8 8) code
))
71 (add-byte (ldb (byte 8 0) code
)))
73 (let* ((codeoid (- code
#x10000
))
74 (high (dpb (ldb (byte 10 10) codeoid
) (byte 10 0) #xd800
))
75 (low (dpb (ldb (byte 10 0) codeoid
) (byte 10 0) #xdc00
)))
76 (add-byte (ldb (byte 8 8) high
))
77 (add-byte (ldb (byte 8 0) high
))
78 (add-byte (ldb (byte 8 8) low
))
79 (add-byte (ldb (byte 8 0) low
)))))))))
81 (defun string->utf-16le
(string sstart send additional-space
)
82 (declare (optimize speed
(safety 0))
83 (type simple-string string
)
84 (type array-range sstart send additional-space
))
85 (let ((array (make-array (* 2 (+ additional-space
(- send sstart
)))
86 :element-type
'(unsigned-byte 8)
87 :fill-pointer
0 :adjustable t
)))
88 (loop for i from sstart below send
89 do
(char->utf-16le
(char string i
) array string i
))
90 (dotimes (i (* 2 additional-space
))
91 (vector-push-extend 0 array
))
92 (coerce array
'(simple-array (unsigned-byte 8) (*)))))
94 (defun string->utf-16be
(string sstart send additional-space
)
95 (declare (optimize speed
(safety 0))
96 (type simple-string string
)
97 (type array-range sstart send additional-space
))
98 (let ((array (make-array (* 2 (+ additional-space
(- send sstart
)))
99 :element-type
'(unsigned-byte 8)
100 :fill-pointer
0 :adjustable t
)))
101 (loop for i from sstart below send
102 do
(char->utf-16be
(char string i
) array string i
))
103 (dotimes (i (* 2 additional-space
))
104 (vector-push-extend 0 array
))
105 (coerce array
'(simple-array (unsigned-byte 8) (*)))))
107 ;; Conversion from UTF-16{LE,BE}
108 (defmacro define-bytes-per-utf16-character
(accessor type
)
109 (declare (ignore type
))
110 (let ((name-le (make-od-name 'bytes-per-utf-16le-character accessor
))
111 (name-be (make-od-name 'bytes-per-utf-16be-character accessor
)))
113 (defun ,name-le
(array pos end
)
114 (let ((remaining (- end pos
)))
115 (when (< remaining
2)
116 (return-from ,name-le
(values remaining
(decoding-error array pos end
:utf-16le
'octet-decoding-error pos
))))
117 (let ((low (dpb (,accessor array
(+ pos
1)) (byte 8 8) (,accessor array pos
))))
118 (if (<= #xd800 low
#xdbff
)
120 (values remaining
(decoding-error array pos end
:utf-16le
'octet-decoding-error pos
))
121 (let ((high (dpb (,accessor array
(+ pos
3)) (byte 8 8) (,accessor array
(+ pos
2)))))
122 (if (<= #xdc00 high
#xdfff
)
123 (let ((code (dpb (ldb (byte 10 0) low
) (byte 10 10) (ldb (byte 10 0) high
))))
124 (if (= (logand code
#xfffe
) #xfffe
)
125 (values 4 (decoding-error array pos
(+ pos
4) :utf-16le
'octet-decoding-error pos
))
127 (values 2 (decoding-error array pos
(+ pos
2) :utf-16le
'octet-decoding-error pos
)))))
128 (if (or (<= #xdc00 low
#xdfff
)
129 (<= #xfdd0 low
#xfdef
)
130 (= (logand low
#xfffe
) #xfffe
))
131 (values 2 (decoding-error array pos
(+ pos
2) :utf-16le
'octet-decoding-error pos
))
133 (defun ,name-be
(array pos end
)
134 (let ((remaining (- end pos
)))
135 (when (< remaining
2)
136 (return-from ,name-be
(values remaining
(decoding-error array pos end
:utf-16le
'octet-decoding-error pos
))))
137 (let ((low (dpb (,accessor array pos
) (byte 8 8) (,accessor array
(+ pos
1)))))
138 (if (<= #xd800 low
#xdbff
)
140 (values remaining
(decoding-error array pos end
:utf-16le
'octet-decoding-error pos
))
141 (let ((high (dpb (,accessor array
(+ pos
2)) (byte 8 8) (,accessor array
(+ pos
3)))))
142 (if (<= #xdc00 high
#xdfff
)
143 (let ((code (dpb (ldb (byte 10 0) low
) (byte 10 10) (ldb (byte 10 0) high
))))
144 (if (= (logand code
#xfffe
) #xfffe
)
145 (values 4 (decoding-error array pos
(+ pos
4) :utf-16le
'octet-decoding-error pos
))
147 (values 2 (decoding-error array pos
(+ pos
2) :utf-16le
'octet-decoding-error pos
)))))
148 (if (or (<= #xdc00 low
#xdfff
)
149 (<= #xfdd0 low
#xfdef
)
150 (= (logand low
#xfffe
) #xfffe
))
151 (values 2 (decoding-error array pos
(+ pos
2) :utf-16le
'octet-decoding-error pos
))
152 (values 2 nil
)))))))))
153 (instantiate-octets-definition define-bytes-per-utf16-character
)
155 (defmacro define-simple-get-utf16-character
(accessor type
)
156 (let ((name-le (make-od-name 'simple-get-utf-16le-char accessor
))
157 (name-be (make-od-name 'simple-get-utf-16be-char accessor
)))
159 (defun ,name-le
(array pos bytes
)
160 (declare (optimize speed
(safety 0))
162 (type array-range pos
)
163 (type (integer 1 4) bytes
)
165 ;; Optimization for SYSTEM-AREA-POINTER: use SAP-REF-16LE that
166 ;; reads two bytes at once on some architectures.
167 (let ((code ,(if (and (eq accessor
'sap-ref-8
)
168 (eq type
'system-area-pointer
))
169 '(sap-ref-16le array pos
)
171 (,accessor array
(the array-range
(+ pos x
)))))
172 (declare (inline cref
))
173 (dpb (cref 1) (byte 8 8) (cref 0))))))
174 (if (<= #xd800 code
#xdbff
)
175 (let ((next ,(if (and (eq accessor
'sap-ref-8
)
176 (eq type
'system-area-pointer
))
177 '(sap-ref-16le array
(+ pos
2))
179 (,accessor array
(the array-range
(+ pos x
)))))
180 (declare (inline cref
))
181 (dpb (cref 3) (byte 8 8) (cref 2))))))
182 (code-char (+ #x10000
(dpb (ldb (byte 10 0) code
) (byte 10 10) (ldb (byte 10 0) next
)))))
184 (defun ,name-be
(array pos bytes
)
185 (declare (optimize speed
(safety 0))
187 (type array-range pos
)
188 (type (integer 1 4) bytes
)
190 ;; Use SAP-REF-16BE even if it is not optimized
191 (let ((code ,(if (and (eq accessor
'sap-ref-8
)
192 (eq type
'system-area-pointer
))
193 '(sap-ref-16be array pos
)
195 (,accessor array
(the array-range
(+ pos x
)))))
196 (declare (inline cref
))
197 (dpb (cref 0) (byte 8 8) (cref 1))))))
198 (if (<= #xd800 code
#xdbff
)
199 (let ((next ,(if (and (eq accessor
'sap-ref-8
)
200 (eq type
'system-area-pointer
))
201 '(sap-ref-16be array
(+ pos
2))
203 (,accessor array
(the array-range
(+ pos x
)))))
204 (declare (inline cref
))
205 (dpb (cref 2) (byte 8 8) (cref 3))))))
206 (code-char (+ #x10000
(dpb (ldb (byte 10 0) code
) (byte 10 10) (ldb (byte 10 0) next
)))))
207 (code-char code
)))))))
209 (instantiate-octets-definition define-simple-get-utf16-character
)
211 (defmacro define-utf-16-
>string
(accessor type
)
212 (let ((name-le (make-od-name 'utf-16le-
>string accessor
))
213 (name-be (make-od-name 'utf-16be-
>string accessor
)))
215 (defun ,name-le
(array astart aend
)
216 (declare (optimize speed
(safety 0))
218 (type array-range astart aend
))
219 (let ((string (make-array 0 :adjustable t
:fill-pointer
0 :element-type
'character
)))
220 (loop with pos
= astart
222 do
(multiple-value-bind (bytes invalid
)
223 (,(make-od-name 'bytes-per-utf-16le-character accessor
) array pos aend
)
224 (declare (type (or null string
) invalid
))
228 (,(make-od-name 'simple-get-utf-16le-char accessor
)
231 (t (dotimes (i (length invalid
))
232 (vector-push-extend (char invalid i
) string
))))
235 (defun ,name-be
(array astart aend
)
236 (declare (optimize speed
(safety 0))
238 (type array-range astart aend
))
239 (let ((string (make-array 0 :adjustable t
:fill-pointer
0 :element-type
'character
)))
240 (loop with pos
= astart
242 do
(multiple-value-bind (bytes invalid
)
243 (,(make-od-name 'bytes-per-utf-16be-character accessor
) array pos aend
)
244 (declare (type (or null string
) invalid
))
248 (,(make-od-name 'simple-get-utf-16be-char accessor
)
251 (t (dotimes (i (length invalid
))
252 (vector-push-extend (char invalid i
) string
))))
256 (instantiate-octets-definition define-utf-16-
>string
)
258 (define-external-format/variable-width
(:utf-16le
:utf16le
) t
260 (let ((bits (char-code byte
)))
261 (if (< bits
#x10000
) 2 4))
264 (if (utf-noncharacter-code-p bits
)
265 (external-format-encoding-error stream bits
)
266 (setf (sap-ref-16le sap tail
) bits
)))
267 (t (if (= (logand bits
#xfffe
) #xfffe
)
268 (external-format-encoding-error stream bits
)
269 (let* ((new-bits (- bits
#x10000
))
270 (high (ldb (byte 10 10) new-bits
))
271 (low (ldb (byte 10 0) new-bits
)))
272 (setf (sap-ref-16le sap tail
) (dpb high
(byte 10 0) #xd800
))
273 (setf (sap-ref-16le sap
(+ tail
2)) (dpb low
(byte 10 0) #xdc00
))))))
274 (2 (if (<= #xd800
(sap-ref-16le sap head
) #xdbff
) 4 2))
275 (let ((bits (sap-ref-16le sap head
)))
277 ((or (<= #xdc00 bits
#xdfff
)
278 (<= #xfdd0 bits
#xfdef
)
279 (= (logand bits
#xfffe
) #xfffe
))
280 (return-from decode-break-reason
2))
281 ((<= #xd800 bits
#xdbff
)
282 (let ((next (sap-ref-16le sap
(+ head
2))))
283 (unless (<= #xdc00 next
#xdfff
)
284 (return-from decode-break-reason
2))
285 (let ((code (dpb (ldb (byte 10 0) bits
) (byte 10 10) (ldb (byte 10 0) next
))))
286 (if (= (logand code
#xfffe
) #xfffe
)
287 (return-from decode-break-reason
4)
288 (code-char (+ #x10000 code
))))))
289 (t (code-char bits
))))
290 utf-16le-
>string-aref
293 (define-external-format/variable-width
(:utf-16be
:utf16be
) t
295 (let ((bits (char-code byte
)))
296 (if (< bits
#x10000
) 2 4))
299 (if (utf-noncharacter-code-p bits
)
300 (external-format-encoding-error stream bits
)
301 (setf (sap-ref-16be sap tail
) bits
)))
302 (t (if (= (logand bits
#xfffe
) #xfffe
)
303 (external-format-encoding-error stream bits
)
304 (let* ((new-bits (- bits
#x10000
))
305 (high (ldb (byte 10 10) new-bits
))
306 (low (ldb (byte 10 0) new-bits
)))
307 (setf (sap-ref-16be sap tail
) (dpb high
(byte 10 0) #xd800
))
308 (setf (sap-ref-16be sap
(+ tail
2)) (dpb low
(byte 10 0) #xdc00
))))))
309 (2 (if (<= #xd800
(sap-ref-16be sap head
) #xdbff
) 4 2))
310 (let ((bits (sap-ref-16be sap head
)))
312 ((or (<= #xdc00 bits
#xdfff
)
313 (<= #xfdd0 bits
#xfdef
)
314 (= (logand bits
#xfffe
) #xfffe
))
315 (return-from decode-break-reason
2))
316 ((<= #xd800 bits
#xdbff
)
317 (let ((next (sap-ref-16be sap
(+ head
2))))
318 (unless (<= #xdc00 next
#xdfff
)
319 (return-from decode-break-reason
2))
320 (let ((code (dpb (ldb (byte 10 0) bits
) (byte 10 10) (ldb (byte 10 0) next
))))
321 (if (= (logand code
#xfffe
) #xfffe
)
322 (return-from decode-break-reason
4)
323 (code-char (+ #x10000 code
))))))
324 (t (code-char bits
))))
325 utf-16be-
>string-aref
328 (declaim (inline char-
>utf-32le
))
329 (defun char->utf-32le
(char dest string pos
)
330 (declare (optimize speed
(safety 0))
331 (type (array (unsigned-byte 8) (*)) dest
))
332 (let ((code (char-code char
)))
333 (if (utf-noncharacter-code-p code
)
334 (let ((replacement (encoding-error :utf-32le string pos
)))
335 (declare (type (simple-array (unsigned-byte 8) (*)) replacement
))
336 (dotimes (i (length replacement
))
337 (vector-push-extend (aref replacement i
) dest
)))
339 (declare (type (unsigned-byte 8) b
))
340 (vector-push-extend b dest
)))
341 (declare (inline add-byte
))
342 (add-byte (ldb (byte 8 0) code
))
343 (add-byte (ldb (byte 8 8) code
))
344 (add-byte (ldb (byte 8 16) code
))
345 (add-byte (ldb (byte 8 24) code
))))))
347 (declaim (inline char-
>utf-32be
))
348 (defun char->utf-32be
(char dest string pos
)
349 (declare (optimize speed
(safety 0))
350 (type (array (unsigned-byte 8) (*)) dest
))
351 (let ((code (char-code char
)))
352 (if (utf-noncharacter-code-p code
)
353 (let ((replacement (encoding-error :utf-32be string pos
)))
354 (declare (type (simple-array (unsigned-byte 8) (*)) replacement
))
355 (dotimes (i (length replacement
))
356 (vector-push-extend (aref replacement i
) dest
)))
358 (declare (type (unsigned-byte 8) b
))
359 (vector-push-extend b dest
)))
360 (declare (inline add-byte
))
361 (add-byte (ldb (byte 8 24) code
))
362 (add-byte (ldb (byte 8 16) code
))
363 (add-byte (ldb (byte 8 8) code
))
364 (add-byte (ldb (byte 8 0) code
))))))
366 (defun string->utf-32le
(string sstart send additional-space
)
367 (declare (optimize speed
(safety 0))
368 (type simple-string string
)
369 (type array-range sstart send additional-space
))
370 (let ((array (make-array (* 4 (+ additional-space
(- send sstart
)))
371 :element-type
'(unsigned-byte 8)
372 :fill-pointer
0 :adjustable t
)))
373 (loop for i from sstart below send
374 do
(char->utf-32le
(char string i
) array string i
))
375 (dotimes (i (* 4 additional-space
))
376 (vector-push-extend 0 array
))
377 (coerce array
'(simple-array (unsigned-byte 8) (*)))))
379 (defun string->utf-32be
(string sstart send additional-space
)
380 (declare (optimize speed
(safety 0))
381 (type simple-string string
)
382 (type array-range sstart send additional-space
))
383 (let ((array (make-array (* 4 (+ additional-space
(- send sstart
)))
384 :element-type
'(unsigned-byte 8)
385 :fill-pointer
0 :adjustable t
)))
386 (loop for i from sstart below send
387 do
(char->utf-32be
(char string i
) array string i
))
388 (dotimes (i (* 4 additional-space
))
389 (vector-push-extend 0 array
))
390 (coerce array
'(simple-array (unsigned-byte 8) (*)))))
392 ;; Conversion from UTF-32{LE,BE}
393 (defmacro define-bytes-per-utf32-character
(accessor type
)
394 (declare (ignore type
))
395 (let ((name-le (make-od-name 'bytes-per-utf-32le-character accessor
))
396 (name-be (make-od-name 'bytes-per-utf-32be-character accessor
)))
398 (defun ,name-le
(array pos end
)
399 (declare (ignore array pos end
))
401 (defun ,name-be
(array pos end
)
402 (declare (ignore array pos end
))
404 (instantiate-octets-definition define-bytes-per-utf32-character
)
406 (defmacro define-simple-get-utf32-character
(accessor type
)
407 (let ((name-le (make-od-name 'simple-get-utf-32le-char accessor
))
408 (name-be (make-od-name 'simple-get-utf-32be-char accessor
)))
410 (defun ,name-le
(array pos bytes
)
411 (declare (optimize speed
(safety 0))
413 (type array-range pos
)
414 (type (integer 1 4) bytes
))
415 ;; Optimization for SYSTEM-AREA-POINTER: use SAP-REF-32LE that
416 ;; reads four bytes at once on some architectures.
417 (let ((code ,(if (and (eq accessor
'sap-ref-8
)
418 (eq type
'system-area-pointer
))
419 '(sap-ref-32le array pos
)
421 (,accessor array
(the array-range
(+ pos x
)))))
422 (declare (inline cref
))
423 (dpb (cref 3) (byte 8 24)
424 (dpb (cref 2) (byte 8 16)
425 (dpb (cref 1) (byte 8 8) (cref 0))))))))
426 (if (and (< code char-code-limit
)
427 (not (utf-noncharacter-code-p code
)))
429 (decoding-error array pos
(+ pos bytes
) :utf-32le
430 'octet-decoding-error pos
))))
431 (defun ,name-be
(array pos bytes
)
432 (declare (optimize speed
(safety 0))
434 (type array-range pos
)
435 (type (integer 1 4) bytes
))
436 ;; Use SAP-REF-32BE even if it is not optimized
437 (let ((code ,(if (and (eq accessor
'sap-ref-8
)
438 (eq type
'system-area-pointer
))
439 '(sap-ref-32be array pos
)
441 (,accessor array
(the array-range
(+ pos x
)))))
442 (declare (inline cref
))
443 (dpb (cref 0) (byte 8 24)
444 (dpb (cref 1) (byte 8 16)
445 (dpb (cref 2) (byte 8 8) (cref 3))))))))
446 (if (and (< code char-code-limit
)
447 (not (utf-noncharacter-code-p code
)))
449 (decoding-error array pos
(+ pos bytes
) :utf-32be
450 'octet-decoding-error pos
)))))))
452 (declaim (muffle-conditions compiler-note
))
453 (instantiate-octets-definition define-simple-get-utf32-character
)
455 (defmacro define-utf-32-
>string
(accessor type
)
456 (let ((name-le (make-od-name 'utf-32le-
>string accessor
))
457 (name-be (make-od-name 'utf-32be-
>string accessor
)))
459 (defun ,name-le
(array astart aend
)
460 (declare (optimize speed
(safety 0))
462 (type array-range astart aend
))
463 (let ((string (make-array 0 :adjustable t
:fill-pointer
0 :element-type
'character
)))
464 (loop with pos
= astart
466 do
(multiple-value-bind (bytes invalid
)
467 (,(make-od-name 'bytes-per-utf-32le-character accessor
) array pos aend
)
468 (declare (type (or null string
) invalid
))
469 (aver (null invalid
))
470 (let ((thing (,(make-od-name 'simple-get-utf-32le-char accessor
) array pos bytes
)))
472 (character (vector-push-extend thing string
))
473 (string (dotimes (i (length thing
))
474 (vector-push-extend (char thing i
) string
)))))
477 (defun ,name-be
(array astart aend
)
478 (declare (optimize speed
(safety 0))
480 (type array-range astart aend
))
481 (let ((string (make-array 0 :adjustable t
:fill-pointer
0 :element-type
'character
)))
482 (loop with pos
= astart
484 do
(multiple-value-bind (bytes invalid
)
485 (,(make-od-name 'bytes-per-utf-32be-character accessor
) array pos aend
)
486 (declare (type (or null string
) invalid
))
487 (aver (null invalid
))
488 (let ((thing (,(make-od-name 'simple-get-utf-32be-char accessor
) array pos bytes
)))
490 (character (vector-push-extend thing string
))
491 (string (dotimes (i (length thing
))
492 (vector-push-extend (char thing i
) string
)))))
496 (instantiate-octets-definition define-utf-32-
>string
)
498 (define-external-format/variable-width
(:utf-32le
:utf32le
) t
501 (if (utf-noncharacter-code-p bits
)
502 (external-format-encoding-error stream bits
)
503 (setf (sap-ref-32le sap tail
) bits
))
505 (let ((code (sap-ref-32le sap head
)))
506 (if (and (< code char-code-limit
)
507 (not (utf-noncharacter-code-p code
)))
509 (return-from decode-break-reason
4)))
510 utf-32le-
>string-aref
513 (define-external-format/variable-width
(:utf-32be
:utf32be
) t
516 (if (utf-noncharacter-code-p bits
)
517 (external-format-encoding-error stream bits
)
518 (setf (sap-ref-32be sap tail
) bits
))
520 (let ((code (sap-ref-32be sap head
)))
521 (if (and (< code char-code-limit
)
522 (not (utf-noncharacter-code-p code
)))
524 (return-from decode-break-reason
4)))
525 utf-32be-
>string-aref