1 ;;;; Universal Character Set (UCS) encodings
3 ;;;; In our interpretation, these are distinct from UTF encodings: the
4 ;;;; UCS encodings are a direct encoding of the code point, in 16- and
5 ;;;; 32-bit variants; by contrast, the UTF encodings handle Unicode
6 ;;;; surrogate code points specially.
8 ;;;; This software is part of the SBCL system. See the README file for
11 ;;;; This software is derived from the CMU CL system, which was
12 ;;;; written at Carnegie Mellon University and released into the
13 ;;;; public domain. The software is in the public domain and is
14 ;;;; provided with absolutely no warranty. See the COPYING and CREDITS
15 ;;;; files for more information.
17 (in-package "SB!IMPL")
19 ;;; TODO Macro for generating different variants:
20 ;;; :ucs-2le (little endian) sap-ref-16le
21 ;;; :ucs-2be (big endian) sap-ref-16be
22 ;;; :ucs-2 (native) sap-ref-16
27 (inline sap-ref-16le
(setf sap-ref-16le
) sap-ref-16be
(setf sap-ref-16be
)
28 sap-ref-32le
(setf sap-ref-32le
) sap-ref-32be
(setf sap-ref-32be
)))
30 ;;; Define feature LITTLE-ENDIAN-AND-MISALIGNED-READ?
31 (defun sap-ref-16le (sap offset
)
33 (sap-ref-16 sap offset
)
35 (dpb (sap-ref-8 sap
(1+ offset
)) (byte 8 8)
36 (sap-ref-8 sap offset
)))
38 (defun (setf sap-ref-16le
) (value sap offset
)
40 (setf (sap-ref-16 sap offset
) value
)
42 (setf (sap-ref-8 sap offset
) (logand value
#xff
)
43 (sap-ref-8 sap
(1+ offset
)) (ldb (byte 8 8) value
)))
45 (defun sap-ref-16be (sap offset
)
46 (dpb (sap-ref-8 sap offset
) (byte 8 8)
47 (sap-ref-8 sap
(1+ offset
))))
49 (defun (setf sap-ref-16be
) (value sap offset
)
50 (setf (sap-ref-8 sap
(1+ offset
)) (logand value
#xff
)
51 (sap-ref-8 sap offset
) (ldb (byte 8 8) value
)))
53 (defun sap-ref-32le (sap offset
)
55 (sap-ref-32 sap offset
)
57 (dpb (sap-ref-8 sap
(+ offset
3)) (byte 8 24)
58 (dpb (sap-ref-8 sap
(+ offset
2)) (byte 8 16)
59 (sap-ref-16le sap offset
))))
61 (defun (setf sap-ref-32le
) (value sap offset
)
63 (setf (sap-ref-32 sap offset
) value
)
65 (setf (sap-ref-8 sap offset
) (logand value
#xff
)
66 (sap-ref-8 sap
(1+ offset
)) (ldb (byte 8 8) value
)
67 (sap-ref-8 sap
(+ offset
2)) (ldb (byte 8 16) value
)
68 (sap-ref-8 sap
(+ offset
3)) (ldb (byte 8 24) value
)))
70 (defun sap-ref-32be (sap offset
)
71 (dpb (sap-ref-8 sap offset
) (byte 8 24)
72 (dpb (sap-ref-8 sap
(1+ offset
)) (byte 8 16)
73 (dpb (sap-ref-8 sap
(+ offset
2)) (byte 8 8)
74 (sap-ref-8 sap
(+ offset
3))))))
76 (defun (setf sap-ref-32be
) (value sap offset
)
77 (setf (sap-ref-8 sap offset
) (ldb (byte 8 24) value
)
78 (sap-ref-8 sap
(1+ offset
)) (ldb (byte 8 16) value
)
79 (sap-ref-8 sap
(+ offset
2)) (ldb (byte 8 8) value
)
80 (sap-ref-8 sap
(+ offset
3)) (logand value
#xff
)))
86 ;;; Conversion to UCS-2{LE,BE}
87 (declaim (inline char-
>ucs-2le
))
88 (defun char->ucs-2le
(char dest string pos
)
89 (declare (optimize speed
(safety 0))
90 (type (array (unsigned-byte 8) (*)) dest
))
91 (let ((code (char-code char
)))
94 (declare (type (unsigned-byte 8) b
))
95 (vector-push-extend b dest
)))
96 (declare (inline add-byte
))
97 (add-byte (ldb (byte 8 0) code
))
98 (add-byte (ldb (byte 8 8) code
)))
99 (let ((replacement (encoding-error :ucs-2le string pos
)))
100 (declare (type (simple-array (unsigned-byte 8) (*)) replacement
))
101 (dotimes (i (length replacement
))
102 (vector-push-extend (aref replacement i
) dest
))))))
104 (declaim (inline char-
>ucs-2be
))
105 (defun char->ucs-2be
(char dest string pos
)
106 (declare (optimize speed
(safety 0))
107 (type (array (unsigned-byte 8) (*)) dest
))
108 (let ((code (char-code char
)))
111 (declare (type (unsigned-byte 8) b
))
112 (vector-push-extend b dest
)))
113 (declare (inline add-byte
))
114 (add-byte (ldb (byte 8 8) code
))
115 (add-byte (ldb (byte 8 0) code
)))
116 (let ((replacement (encoding-error :ucs-2be string pos
)))
117 (declare (type (simple-array (unsigned-byte 8) (*)) replacement
))
118 (dotimes (i (length replacement
))
119 (vector-push-extend (aref replacement i
) dest
))))))
121 (defun string->ucs-2le
(string sstart send additional-space
)
122 (declare (optimize speed
(safety 0))
123 (type simple-string string
)
124 (type array-range sstart send additional-space
))
125 (let ((array (make-array (* 2 (+ additional-space
(- send sstart
)))
126 :element-type
'(unsigned-byte 8)
127 :fill-pointer
0 :adjustable t
)))
128 (loop for i from sstart below send
129 do
(char->ucs-2le
(char string i
) array string i
))
130 (dotimes (i (* 2 additional-space
))
131 (vector-push-extend 0 array
))
132 (coerce array
'(simple-array (unsigned-byte 8) (*)))))
134 (defun string->ucs-2be
(string sstart send additional-space
)
135 (declare (optimize speed
(safety 0))
136 (type simple-string string
)
137 (type array-range sstart send additional-space
))
138 (let ((array (make-array (* 2 (+ additional-space
(- send sstart
)))
139 :element-type
'(unsigned-byte 8)
140 :fill-pointer
0 :adjustable t
)))
141 (loop for i from sstart below send
142 do
(char->ucs-2be
(char string i
) array string i
))
143 (dotimes (i (* 2 additional-space
))
144 (vector-push-extend 0 array
))
145 (coerce array
'(simple-array (unsigned-byte 8) (*)))))
147 ;; Conversion from UCS-2{LE,BE}
148 (defmacro define-bytes-per-ucs2-character
(accessor type
)
149 (declare (ignore type
))
150 (let ((name-le (make-od-name 'bytes-per-ucs-2le-character accessor
))
151 (name-be (make-od-name 'bytes-per-ucs-2be-character accessor
)))
153 (defun ,name-le
(array pos end
)
154 (declare (ignore array pos end
))
156 (defun ,name-be
(array pos end
)
157 (declare (ignore array pos end
))
159 (instantiate-octets-definition define-bytes-per-ucs2-character
)
161 (defmacro define-simple-get-ucs2-character
(accessor type
)
162 (let ((name-le (make-od-name 'simple-get-ucs-2le-char accessor
))
163 (name-be (make-od-name 'simple-get-ucs-2be-char accessor
)))
165 (defun ,name-le
(array pos bytes
)
166 (declare (optimize speed
(safety 0))
168 (type array-range pos
)
169 (type (integer 1 4) bytes
)
171 ;; Optimization for SYSTEM-AREA-POINTER: use SAP-REF-16LE that
172 ;; reads two bytes at once on some architectures.
173 ,(if (and (eq accessor
'sap-ref-8
)
174 (eq type
'system-area-pointer
))
175 '(code-char (sap-ref-16le array pos
))
177 (,accessor array
(the array-range
(+ pos x
)))))
178 (declare (inline cref
))
179 (code-char (dpb (cref 1) (byte 8 8) (cref 0))))))
180 (defun ,name-be
(array pos bytes
)
181 (declare (optimize speed
(safety 0))
183 (type array-range pos
)
184 (type (integer 1 4) bytes
)
186 ;; Use SAP-REF-16BE even if it is not optimized
187 ,(if (and (eq accessor
'sap-ref-8
)
188 (eq type
'system-area-pointer
))
189 '(code-char (sap-ref-16be array pos
))
191 (,accessor array
(the array-range
(+ pos x
)))))
192 (declare (inline cref
))
193 (code-char (dpb (cref 0) (byte 8 8) (cref 1)))))))))
195 (instantiate-octets-definition define-simple-get-ucs2-character
)
197 (defmacro define-ucs-2-
>string
(accessor type
)
198 (let ((name-le (make-od-name 'ucs-2le-
>string accessor
))
199 (name-be (make-od-name 'ucs-2be-
>string accessor
)))
201 (defun ,name-le
(array astart aend
)
202 (declare (optimize speed
(safety 0))
204 (type array-range astart aend
))
205 (let ((string (make-array 0 :adjustable t
:fill-pointer
0 :element-type
'character
)))
206 (loop with pos
= astart
208 do
(multiple-value-bind (bytes invalid
)
209 (,(make-od-name 'bytes-per-ucs-2le-character accessor
) array pos aend
)
210 (declare (type (or null string
) invalid
))
211 (aver (null invalid
))
213 (,(make-od-name 'simple-get-ucs-2le-char accessor
)
218 (defun ,name-be
(array astart aend
)
219 (declare (optimize speed
(safety 0))
221 (type array-range astart aend
))
222 (let ((string (make-array 0 :adjustable t
:fill-pointer
0 :element-type
'character
)))
223 (loop with pos
= astart
225 do
(multiple-value-bind (bytes invalid
)
226 (,(make-od-name 'bytes-per-ucs-2be-character accessor
) array pos aend
)
227 (declare (type (or null string
) invalid
))
228 (aver (null invalid
))
230 (,(make-od-name 'simple-get-ucs-2be-char accessor
)
236 (instantiate-octets-definition define-ucs-2-
>string
)
238 (define-external-format/variable-width
(:ucs-2le
:ucs2le
#!+win32
:ucs2
#!+win32
:ucs-2
) t
242 (setf (sap-ref-16le sap tail
) bits
)
243 (external-format-encoding-error stream bits
))
245 (code-char (sap-ref-16le sap head
))
249 (define-external-format/variable-width
(:ucs-2be
:ucs2be
) t
253 (setf (sap-ref-16be sap tail
) bits
)
254 (external-format-encoding-error stream bits
))
256 (code-char (sap-ref-16be sap head
))
260 (declaim (inline char-
>ucs-4le
))
261 (defun char->ucs-4le
(char dest string pos
)
262 (declare (optimize speed
(safety 0))
263 (type (array (unsigned-byte 8) (*)) dest
)
265 (let ((code (char-code char
)))
267 (declare (type (unsigned-byte 8) b
))
268 (vector-push-extend b dest
)))
269 (declare (inline add-byte
))
270 (add-byte (ldb (byte 8 0) code
))
271 (add-byte (ldb (byte 8 8) code
))
272 (add-byte (ldb (byte 8 16) code
))
273 (add-byte (ldb (byte 8 24) code
)))))
275 (declaim (inline char-
>ucs-4be
))
276 (defun char->ucs-4be
(char dest string pos
)
277 (declare (optimize speed
(safety 0))
278 (type (array (unsigned-byte 8) (*)) dest
)
280 (let ((code (char-code char
)))
282 (declare (type (unsigned-byte 8) b
))
283 (vector-push-extend b dest
)))
284 (declare (inline add-byte
))
285 (add-byte (ldb (byte 8 24) code
))
286 (add-byte (ldb (byte 8 16) code
))
287 (add-byte (ldb (byte 8 8) code
))
288 (add-byte (ldb (byte 8 0) code
)))))
290 (defun string->ucs-4le
(string sstart send additional-space
)
291 (declare (optimize speed
(safety 0))
292 (type simple-string string
)
293 (type array-range sstart send additional-space
))
294 (let ((array (make-array (* 4 (+ additional-space
(- send sstart
)))
295 :element-type
'(unsigned-byte 8)
296 :fill-pointer
0 :adjustable t
)))
297 (loop for i from sstart below send
298 do
(char->ucs-4le
(char string i
) array string i
))
299 (dotimes (i (* 4 additional-space
))
300 (vector-push-extend 0 array
))
301 (coerce array
'(simple-array (unsigned-byte 8) (*)))))
303 (defun string->ucs-4be
(string sstart send additional-space
)
304 (declare (optimize speed
(safety 0))
305 (type simple-string string
)
306 (type array-range sstart send additional-space
))
307 (let ((array (make-array (* 4 (+ additional-space
(- send sstart
)))
308 :element-type
'(unsigned-byte 8)
309 :fill-pointer
0 :adjustable t
)))
310 (loop for i from sstart below send
311 do
(char->ucs-4be
(char string i
) array string i
))
312 (dotimes (i (* 4 additional-space
))
313 (vector-push-extend 0 array
))
314 (coerce array
'(simple-array (unsigned-byte 8) (*)))))
316 ;; Conversion from UCS-4{LE,BE}
317 (defmacro define-bytes-per-ucs4-character
(accessor type
)
318 (declare (ignore type
))
319 (let ((name-le (make-od-name 'bytes-per-ucs-4le-character accessor
))
320 (name-be (make-od-name 'bytes-per-ucs-4be-character accessor
)))
322 (defun ,name-le
(array pos end
)
323 (declare (ignore array pos end
))
325 (defun ,name-be
(array pos end
)
326 (declare (ignore array pos end
))
328 (instantiate-octets-definition define-bytes-per-ucs4-character
)
330 (defmacro define-simple-get-ucs4-character
(accessor type
)
331 (let ((name-le (make-od-name 'simple-get-ucs-4le-char accessor
))
332 (name-be (make-od-name 'simple-get-ucs-4be-char accessor
)))
334 (defun ,name-le
(array pos bytes
)
335 (declare (optimize speed
(safety 0))
337 (type array-range pos
)
338 (type (integer 1 4) bytes
))
339 ;; Optimization for SYSTEM-AREA-POINTER: use SAP-REF-32LE that
340 ;; reads four bytes at once on some architectures.
341 (let ((code ,(if (and (eq accessor
'sap-ref-8
)
342 (eq type
'system-area-pointer
))
343 '(sap-ref-32le array pos
)
345 (,accessor array
(the array-range
(+ pos x
)))))
346 (declare (inline cref
))
347 (dpb (cref 3) (byte 8 24)
348 (dpb (cref 2) (byte 8 16)
349 (dpb (cref 1) (byte 8 8) (cref 0))))))))
350 (if (< code sb
!xc
:char-code-limit
)
352 (decoding-error array pos
(+ pos bytes
) :ucs-4le
353 'octet-decoding-error pos
))))
354 (defun ,name-be
(array pos bytes
)
355 (declare (optimize speed
(safety 0))
357 (type array-range pos
)
358 (type (integer 1 4) bytes
))
359 ;; Use SAP-REF-32BE even if it is not optimized
360 (let ((code ,(if (and (eq accessor
'sap-ref-8
)
361 (eq type
'system-area-pointer
))
362 '(sap-ref-32be array pos
)
364 (,accessor array
(the array-range
(+ pos x
)))))
365 (declare (inline cref
))
366 (dpb (cref 0) (byte 8 24)
367 (dpb (cref 1) (byte 8 16)
368 (dpb (cref 2) (byte 8 8) (cref 3))))))))
369 (if (< code sb
!xc
:char-code-limit
)
371 (decoding-error array pos
(+ pos bytes
) :ucs-4be
372 'octet-decoding-error pos
)))))))
374 (instantiate-octets-definition define-simple-get-ucs4-character
)
376 (defmacro define-ucs-4-
>string
(accessor type
)
377 (let ((name-le (make-od-name 'ucs-4le-
>string accessor
))
378 (name-be (make-od-name 'ucs-4be-
>string accessor
)))
380 (defun ,name-le
(array astart aend
)
381 (declare (optimize speed
(safety 0))
383 (type array-range astart aend
))
384 (let ((string (make-array 0 :adjustable t
:fill-pointer
0 :element-type
'character
)))
385 (loop with pos
= astart
387 do
(multiple-value-bind (bytes invalid
)
388 (,(make-od-name 'bytes-per-ucs-4le-character accessor
) array pos aend
)
389 (declare (type (or null string
) invalid
))
390 (aver (null invalid
))
391 (let ((thing (,(make-od-name 'simple-get-ucs-4le-char accessor
) array pos bytes
)))
393 (character (vector-push-extend thing string
))
394 (string (dotimes (i (length thing
))
395 (vector-push-extend (char thing i
) string
)))))
398 (defun ,name-be
(array astart aend
)
399 (declare (optimize speed
(safety 0))
401 (type array-range astart aend
))
402 (let ((string (make-array 0 :adjustable t
:fill-pointer
0 :element-type
'character
)))
403 (loop with pos
= astart
405 do
(multiple-value-bind (bytes invalid
)
406 (,(make-od-name 'bytes-per-ucs-4be-character accessor
) array pos aend
)
407 (declare (type (or null string
) invalid
))
408 (aver (null invalid
))
409 (let ((thing (,(make-od-name 'simple-get-ucs-4be-char accessor
) array pos bytes
)))
411 (character (vector-push-extend thing string
))
412 (string (dotimes (i (length thing
))
413 (vector-push-extend (char thing i
) string
)))))
417 (instantiate-octets-definition define-ucs-4-
>string
)
419 (define-external-format/variable-width
(:ucs-4le
:ucs4le
) nil
422 (setf (sap-ref-32le sap tail
) bits
)
424 (let ((code (sap-ref-32le sap head
)))
425 (if (< code sb
!xc
:char-code-limit
)
427 (return-from decode-break-reason
4)))
431 (define-external-format/variable-width
(:ucs-4be
:ucs4be
) nil
434 (setf (sap-ref-32be sap tail
) bits
)
436 (let ((code (sap-ref-32be sap head
)))
437 (if (< code sb
!xc
:char-code-limit
)
439 (return-from decode-break-reason
4)))