1 ;;;; tests of octet/character machinery with no side effects
3 ;;;; This software is part of the SBCL system. See the README file for
6 ;;;; While most of SBCL is derived from the CMU CL system, the test
7 ;;;; files (like this one) were written from scratch after the fork
10 ;;;; This software is in the public domain and is provided with
11 ;;;; absolutely no warranty. See the COPYING and CREDITS files for
12 ;;;; more information.
14 (cl:in-package
:cl-user
)
17 (declare (optimize debug
(speed 0)))
19 (labels ((ub8 (len-or-seq)
20 (if (numberp len-or-seq
)
21 (make-array len-or-seq
:element-type
'(unsigned-byte 8) :initial-element
0)
22 (coerce len-or-seq
'(simple-array (unsigned-byte 8) (*)))))
24 (ensure-roundtrip-ascii ()
25 (let ((octets (ub8 128)))
27 (setf (aref octets i
) i
))
28 (let* ((str (octets-to-string octets
:external-format
:ascii
))
29 (oct2 (string-to-octets str
:external-format
:ascii
)))
30 (assert (= (length octets
) (length oct2
)))
31 (assert (every #'= octets oct2
))))
34 (ensure-roundtrip-latin (format)
35 (let ((octets (ub8 256)))
37 (setf (aref octets i
) i
))
38 (let* ((str (octets-to-string octets
:external-format format
))
39 (oct2 (string-to-octets str
:external-format format
)))
40 (assert (= (length octets
) (length oct2
)))
41 (assert (every #'= octets oct2
))))
44 (ensure-roundtrip-latin1 ()
45 (ensure-roundtrip-latin :latin1
))
48 (ensure-roundtrip-latin9 ()
49 (ensure-roundtrip-latin :latin9
))
51 (ensure-roundtrip-utf8 ()
52 (let ((string (make-string char-code-limit
)))
53 (dotimes (i char-code-limit
)
54 (unless (<= #xd800 i
#xdfff
)
55 (setf (char string i
) (code-char i
))))
57 (octets-to-string (string-to-octets string
:external-format
:utf8
)
58 :external-format
:utf8
)))
59 (assert (= (length string2
) (length string
)))
60 (assert (string= string string2
))))
63 (utf8-decode-test (octets expected-results expected-errors
)
64 (let ((error-count 0))
65 (handler-bind ((sb-int:character-decoding-error
69 (assert (string= expected-results
70 (octets-to-string (ub8 octets
)
71 :external-format
:utf-8
)))
72 (assert (= error-count expected-errors
)))))
74 (utf8-decode-tests (octets expected-results
)
75 (let ((expected-errors (count #\? expected-results
)))
76 (utf8-decode-test octets expected-results expected-errors
)
77 (utf8-decode-test (concatenate 'vector
81 (format nil
"\"~A\"" expected-results
)
84 (ensure-roundtrip-ascii)
85 (ensure-roundtrip-latin1)
88 (ensure-roundtrip-latin9)
89 ;; Latin-9 chars; the previous test checked roundtrip from
90 ;; octets->char and back, now test that the latin-9 characters did
91 ;; in fact appear during that trip.
92 (let ((l9c (map 'string
#'code-char
'(8364 352 353 381 382 338 339 376))))
94 (string= (octets-to-string (string-to-octets l9c
:external-format
:latin9
)
95 :external-format
:latin9
)
97 (ensure-roundtrip-utf8)
99 (with-test (:name
(:ascii
:decoding-error use-value
))
100 (let ((non-ascii-bytes (make-array 128
101 :element-type
'(unsigned-byte 8)
102 :initial-contents
(loop for i from
128 below
256 collect i
)))
104 (handler-bind ((sb-int:character-decoding-error
107 (use-value "??" c
))))
108 (assert (string= (octets-to-string non-ascii-bytes
:external-format
:ascii
)
109 (make-string 256 :initial-element
#\?)))
110 (assert (= error-count
128)))))
111 (with-test (:name
(:ascii
:encoding-error use-value
))
112 (let ((non-ascii-chars (make-array 128
113 :element-type
'character
114 :initial-contents
(loop for i from
128 below
256 collect
(code-char i
))))
116 (handler-bind ((sb-int:character-encoding-error
119 (use-value "??" c
))))
120 (assert (equalp (string-to-octets non-ascii-chars
:external-format
:ascii
)
121 (make-array 256 :initial-element
(char-code #\?))))
122 (assert (= error-count
128)))))
124 ;; From Markus Kuhn's UTF-8 test file:
125 ;; http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
127 ;; Too-big characters
130 (utf8-decode-tests #(#xc4
#x80
) "?") ; #x100
131 (utf8-decode-tests #(#xdf
#xbf
) "?") ; #x7ff
132 (utf8-decode-tests #(#xe0
#xa0
#x80
) "?") ; #x800
133 (utf8-decode-tests #(#xef
#xbf
#xbf
) "?") ; #xffff
134 (utf8-decode-tests #(#xf0
#x90
#x80
#x80
) "?")) ; #x10000
135 #+nil
; old, 6-byte UTF-8 definition
137 (utf8-decode-tests #(#xf4
#x90
#x80
#x80
) "?") ; #x110000
138 (utf8-decode-tests #(#xf7
#xbf
#xbf
#xbf
) "?") ; #x1fffff
139 (utf8-decode-tests #(#xf8
#x88
#x80
#x80
#x80
) "?") ; #x200000
140 (utf8-decode-tests #(#xfb
#xbf
#xbf
#xbf
#xbf
) "?") ; #x3ffffff
141 (utf8-decode-tests #(#xfc
#x84
#x80
#x80
#x80
#x80
) "?") ; #x4000000
142 (utf8-decode-tests #(#xfd
#xbf
#xbf
#xbf
#xbf
#xbf
) "?")) ; #x7fffffff
143 (progn ; new, 4-byte (maximum #x10ffff) UTF-8 definition
144 (utf8-decode-tests #(#xf4
#x90
) "??") ; #x110000
145 (utf8-decode-tests #(#xf7
#xbf
#xbf
#xbf
) "????") ; #x1fffff
146 (utf8-decode-tests #(#xf8
#x88
#x80
#x80
#x80
) "?????") ; #x200000
147 (utf8-decode-tests #(#xfb
#xbf
#xbf
#xbf
#xbf
) "?????") ; #x3ffffff
148 (utf8-decode-tests #(#xfc
#x84
#x80
#x80
#x80
#x80
) "??????") ; #x4000000
149 (utf8-decode-tests #(#xfd
#xbf
#xbf
#xbf
#xbf
#xbf
) "??????")) ; #x7fffffff
151 ;; Unexpected continuation bytes
152 (utf8-decode-tests #(#x80
) "?")
153 (utf8-decode-tests #(#xbf
) "?")
154 (utf8-decode-tests #(#x80
#xbf
) "??")
155 (utf8-decode-tests #(#x80
#xbf
#x80
) "???")
156 (utf8-decode-tests #(#x80
#xbf
#x80
#xbf
) "????")
157 (utf8-decode-tests #(#x80
#xbf
#x80
#xbf
#x80
) "?????")
158 (utf8-decode-tests #(#x80
#xbf
#x80
#xbf
#x80
#xbf
) "??????")
159 (utf8-decode-tests #(#x80
#xbf
#x80
#xbf
#x80
#xbf
#x80
) "???????")
161 ;; All 64 continuation bytes in a row
162 (apply #'utf8-decode-tests
163 (loop for i from
#x80 to
#xbf
165 collect
#\? into chars
166 finally
(return (list bytes
167 (coerce chars
'string
)))))
169 ;; Lonely start characters
170 (flet ((lsc (first last
)
171 (apply #'utf8-decode-tests
172 (loop for i from first to last
173 nconc
(list i
32) into bytes
174 nconc
(list #\? #\Space
) into chars
175 finally
(return (list bytes
176 (coerce chars
'string
)))))
177 (apply #'utf8-decode-tests
178 (loop for i from first to last
180 collect
#\? into chars
181 finally
(return (list bytes
182 (coerce chars
'string
)))))))
183 (lsc #xc0
#xdf
) ; 2-byte sequence start chars
184 (lsc #xe0
#xef
) ; 3-byte
185 (lsc #xf0
#xf7
) ; 4-byte
186 (lsc #xf8
#xfb
) ; 5-byte
187 (lsc #xfc
#xfd
)) ; 6-byte
189 ;; Otherwise incomplete sequences (last continuation byte missing)
190 (utf8-decode-tests #0=#(#xc0
) "?")
191 (utf8-decode-tests #1=#(#xe0
#xa0
) "?")
192 (utf8-decode-tests #2=#(#xf0
#x90
#x80
) "?")
194 (utf8-decode-tests #3=#(#xf8
#x80
#x80
#x80
) "?")
196 (utf8-decode-tests #4=#(#xfc
#x80
#x80
#x80
#x80
) "?")
197 (utf8-decode-tests #5=#(#xdf
) "?")
198 (utf8-decode-tests #6=#(#xef
#xbf
) "?")
200 (utf8-decode-tests #7=#(#xf7
#xbf
#xbf
) "?")
202 (utf8-decode-tests #8=#(#xfb
#xbf
#xbf
#xbf
) "?")
204 (utf8-decode-tests #9=#(#xfd
#xbf
#xbf
#xbf
#xbf
) "?")
206 ;; All ten previous tests concatenated
207 (utf8-decode-tests (concatenate 'vector
#0# #1# #2# #5# #6#)
210 ;; Random impossible bytes
211 (utf8-decode-tests #(#xfe
) "?")
212 (utf8-decode-tests #(#xff
) "?")
213 (utf8-decode-tests #(#xfe
#xfe
#xff
#xff
) "????")
215 ;; Overlong sequences - /
216 (utf8-decode-tests #(#xc0
#xaf
) "??")
217 (utf8-decode-tests #(#xe0
#x80
#xaf
) "???")
218 (utf8-decode-tests #(#xf0
#x80
#x80
#xaf
) "????")
219 (utf8-decode-tests #(#xf8
#x80
#x80
#x80
#xaf
) "?????")
220 (utf8-decode-tests #(#xfc
#x80
#x80
#x80
#x80
#xaf
) "??????")
222 ;; Overlong sequences - #\Rubout
223 (utf8-decode-tests #(#xc1
#xbf
) "??")
224 (utf8-decode-tests #(#xe0
#x9f
#xbf
) "???")
225 (utf8-decode-tests #(#xf0
#x8f
#xbf
#xbf
) "????")
226 (utf8-decode-tests #(#xf8
#x87
#xbf
#xbf
#xbf
) "?????")
227 (utf8-decode-tests #(#xfc
#x83
#xbf
#xbf
#xbf
#xbf
) "??????")
229 ;; Overlong sequences - #\Null
230 (utf8-decode-tests #(#xc0
#x80
) "??")
231 (utf8-decode-tests #(#xe0
#x80
#x80
) "???")
232 (utf8-decode-tests #(#xf0
#x80
#x80
#x80
) "????")
233 (utf8-decode-tests #(#xf8
#x80
#x80
#x80
#x80
) "?????")
234 (utf8-decode-tests #(#xfc
#x80
#x80
#x80
#x80
#x80
) "??????")
236 ;; Not testing surrogates & characters #xFFFE, #xFFFF; they're
237 ;; perfectly good sbcl chars even if they're not actually ISO 10646
238 ;; characters, and it's probably a good idea for s-to-o and o-to-s
239 ;; to be inverses of each other as far as possible.
244 ;;; regression test: STRING->UTF8 didn't properly handle a non-zero
246 (assert (equalp #(50) (string-to-octets "42" :start
1 :external-format
:utf-8
)))
248 ;;; STRING->UTF8 should cope with NIL strings if a null range is required
249 (assert (equalp #() (string-to-octets "" :external-format
:utf-8
)))
250 (assert (equalp #() (string-to-octets (make-array 0 :element-type nil
)
251 :external-format
:utf-8
)))
252 (assert (equalp #() (string-to-octets (make-array 5 :element-type nil
)
253 :start
3 :end
3 :external-format
:utf-8
)))
254 (assert (equalp #(0) (string-to-octets (make-array 5 :element-type nil
)
255 :start
3 :end
3 :null-terminate t
256 :external-format
:utf-8
)))
258 ;;; whoops: the iso-8859-2 format referred to an undefined symbol.
260 (assert (equalp #(251) (string-to-octets (string (code-char 369))
261 :external-format
:latin-2
)))
264 (with-test (:name
(:euc-jp
:decoding-errors
))
265 (handler-bind ((sb-int:character-decoding-error
266 (lambda (c) (use-value #\? c
))))
267 (assert (string= "?{?"
269 (coerce #(182 123 253 238) '(vector (unsigned-byte 8)))
270 :external-format
:euc-jp
)))))
273 (with-test (:name
(:utf-8
:surrogates
:encoding-errors
))
274 (handler-bind ((sb-int:character-encoding-error
275 (lambda (c) (use-value #\? c
))))
276 (assert (equalp (string-to-octets (string (code-char #xd800
))
277 :external-format
:utf-8
)
278 (vector (char-code #\?))))))
280 (with-test (:name
(:utf-8
:surrogates
:decoding-errors
))
281 (handler-bind ((sb-int:character-decoding-error
282 (lambda (c) (use-value #\? c
))))
283 (assert (find #\? (octets-to-string
284 (coerce #(237 160 128) '(vector (unsigned-byte 8)))
285 :external-format
:utf-8
)))))
288 (with-test (:name
(:ucs-2
:out-of-range
:encoding-errors
))
289 (handler-bind ((sb-int:character-encoding-error
290 (lambda (c) (use-value "???" c
))))
291 (assert (equalp (string-to-octets (string (code-char #x10001
))
292 :external-format
:ucs-2le
)
294 (handler-bind ((sb-int:character-encoding-error
295 (lambda (c) (use-value "???" c
))))
296 (assert (equalp (string-to-octets (string (code-char #x10001
))
297 :external-format
:ucs-2be
)
298 #(0 63 0 63 0 63)))))
301 (with-test (:name
(:ucs-4
:out-of-range
:decoding-errors
))
302 (handler-bind ((sb-int:character-decoding-error
303 (lambda (c) (use-value "???" c
))))
304 (assert (equalp (octets-to-string (coerce '(1 2 3 4) '(vector (unsigned-byte 8)))
305 :external-format
:ucs-4le
)
307 (assert (equalp (octets-to-string (coerce '(#xff
#xff
#x10
#x00
) '(vector (unsigned-byte 8)))
308 :external-format
:ucs-4le
)
309 (string (code-char #x10ffff
))))
310 (handler-bind ((sb-int:character-decoding-error
311 (lambda (c) (use-value "???" c
))))
312 (assert (equalp (octets-to-string (coerce '(1 2 3 4) '(vector (unsigned-byte 8)))
313 :external-format
:ucs-4be
)
315 (assert (equalp (octets-to-string (coerce '(#x00
#x10
#xff
#xff
) '(vector (unsigned-byte 8)))
316 :external-format
:ucs-4be
)
317 (string (code-char #x10ffff
))))))
320 (with-test (:name
(:utf-16le
:ensure-roundtrip
))
322 (string-to-octets x
:external-format
:utf-16le
))
324 (octets-to-string (coerce x
'(vector (unsigned-byte 8)))
325 :external-format
:utf-16le
)))
326 (let ((string (map 'string
'code-char
327 '(#x20
#x200
#x2000
#x10000
#x10401
#x10fffd
)))
328 (octets #(#x20
0 0 #x2
0 #x20
0 #xd8
0 #xdc
1 #xd8
1 #xdc
#xff
#xdb
#xfd
#xdf
)))
329 (assert (equalp (enc string
) octets
))
330 (assert (equalp (dec octets
) string
)))))
332 (with-test (:name
(:utf-16le
:encoding-error
))
334 (string-to-octets x
:external-format
'(:utf-16le
:replacement
#\?)))
336 (octets-to-string (coerce x
'(vector (unsigned-byte 8)))
337 :external-format
:utf-16le
)))
338 (let ((string (map 'string
'code-char
'(#xd800
#xdc00
#xfffe
#x10ffff
))))
339 (assert (equalp (enc string
) #(63 0 63 0 63 0 63 0))))))
342 (with-test (:name
(:utf-16be
:ensure-roundtrip
))
344 (string-to-octets x
:external-format
:utf-16be
))
346 (octets-to-string (coerce x
'(vector (unsigned-byte 8)))
347 :external-format
:utf-16be
)))
348 (let ((string (map 'string
'code-char
349 '(#x20
#x200
#x2000
#x10000
#x10401
#x10fffd
)))
350 (octets #(0 #x20
#x2
0 #x20
0 #xd8
0 #xdc
0 #xd8
1 #xdc
1 #xdb
#xff
#xdf
#xfd
)))
351 (assert (equalp (enc string
) octets
))
352 (assert (equalp (dec octets
) string
)))))
354 (with-test (:name
(:utf-16be
:encoding-error
))
356 (string-to-octets x
:external-format
'(:utf-16be
:replacement
#\?)))
358 (octets-to-string (coerce x
'(vector (unsigned-byte 8)))
359 :external-format
:utf-16be
)))
360 (let ((string (map 'string
'code-char
'(#xd800
#xdc00
#xfffe
#x10ffff
))))
361 (assert (equalp (enc string
) #(0 63 0 63 0 63 0 63))))))
364 (with-test (:name
(:utf-32le
:ensure-roundtrip
))
366 (string-to-octets x
:external-format
:utf-32le
))
368 (octets-to-string (coerce x
'(vector (unsigned-byte 8)))
369 :external-format
:utf-32le
)))
370 (let ((string (map 'string
'code-char
371 '(#x20
#x200
#x2000
#x10000
#x10401
#x10fffd
)))
372 (octets #(#x20
0 0 0 0 #x2
0 0 0 #x20
0 0 0 0 1 0 1 4 1 0 #xfd
#xff
#x10
0)))
373 (assert (equalp (enc string
) octets
))
374 (assert (equalp (dec octets
) string
)))))
376 (with-test (:name
(:utf-32le
:encoding-error
))
378 (string-to-octets x
:external-format
'(:utf-32le
:replacement
#\?)))
380 (octets-to-string (coerce x
'(vector (unsigned-byte 8)))
381 :external-format
:utf-32le
)))
382 (let ((string (map 'string
'code-char
'(#xd800
#xdc00
#xfffe
#x10ffff
))))
383 (assert (equalp (enc string
) #(63 0 0 0 63 0 0 0 63 0 0 0 63 0 0 0))))))
386 (with-test (:name
(:utf-32be
:ensure-roundtrip
))
388 (string-to-octets x
:external-format
:utf-32be
))
390 (octets-to-string (coerce x
'(vector (unsigned-byte 8)))
391 :external-format
:utf-32be
)))
392 (let ((string (map 'string
'code-char
393 '(#x20
#x200
#x2000
#x10000
#x10401
#x10fffd
)))
394 (octets #(0 0 0 #x20
0 0 #x2
0 0 0 #x20
0 0 1 0 0 0 1 4 1 0 #x10
#xff
#xfd
)))
395 (assert (equalp (enc string
) octets
))
396 (assert (equalp (dec octets
) string
)))))
398 (with-test (:name
(:utf-32be
:encoding-error
))
400 (string-to-octets x
:external-format
'(:utf-32be
:replacement
#\?)))
402 (octets-to-string (coerce x
'(vector (unsigned-byte 8)))
403 :external-format
:utf-32be
)))
404 (let ((string (map 'string
'code-char
'(#xd800
#xdc00
#xfffe
#x10ffff
))))
405 (assert (equalp (enc string
) #(0 0 0 63 0 0 0 63 0 0 0 63 0 0 0 63))))))