A test no longer fails.
[sbcl.git] / tests / octets.pure.lisp
blobf2784c63b516e70d8e3860b3439780583f70508f
1 ;;;; tests of octet/character machinery with no side effects
3 ;;;; This software is part of the SBCL system. See the README file for
4 ;;;; more information.
5 ;;;;
6 ;;;; While most of SBCL is derived from the CMU CL system, the test
7 ;;;; files (like this one) were written from scratch after the fork
8 ;;;; from CMU CL.
9 ;;;;
10 ;;;; This software is in the public domain and is provided with
11 ;;;; absolutely no warranty. See the COPYING and CREDITS files for
12 ;;;; more information.
14 (locally
15 (declare (optimize debug (speed 0)))
17 (labels ((ub8 (len-or-seq)
18 (if (numberp len-or-seq)
19 (make-array len-or-seq :element-type '(unsigned-byte 8) :initial-element 0)
20 (coerce len-or-seq '(simple-array (unsigned-byte 8) (*)))))
22 (ensure-roundtrip-ascii ()
23 (let ((octets (ub8 128)))
24 (dotimes (i 128)
25 (setf (aref octets i) i))
26 (let* ((str (octets-to-string octets :external-format :ascii))
27 (oct2 (string-to-octets str :external-format :ascii)))
28 (assert (= (length octets) (length oct2)))
29 (assert (every #'= octets oct2))))
32 (ensure-roundtrip-latin (format)
33 (let ((octets (ub8 256)))
34 (dotimes (i 256)
35 (setf (aref octets i) i))
36 (let* ((str (octets-to-string octets :external-format format))
37 (oct2 (string-to-octets str :external-format format)))
38 (assert (= (length octets) (length oct2)))
39 (assert (every #'= octets oct2))))
42 (ensure-roundtrip-latin1 ()
43 (ensure-roundtrip-latin :latin1))
45 #+sb-unicode
46 (ensure-roundtrip-latin9 ()
47 (ensure-roundtrip-latin :latin9))
49 (ensure-roundtrip-utf8 ()
50 (let ((string (make-string char-code-limit :initial-element #\nul)))
51 (dotimes (i char-code-limit)
52 (unless (<= #xd800 i #xdfff)
53 (setf (char string i) (code-char i))))
54 (let ((string2
55 (octets-to-string (string-to-octets string :external-format :utf8)
56 :external-format :utf8)))
57 (assert (= (length string2) (length string)))
58 (assert (string= string string2))))
61 (utf8-decode-test (octets expected-results expected-errors)
62 (let ((error-count 0))
63 (handler-bind ((sb-int:character-decoding-error
64 (lambda (c)
65 (incf error-count)
66 (use-value "?" c))))
67 (assert (string= expected-results
68 (octets-to-string (ub8 octets)
69 :external-format :utf-8)))
70 (assert (= error-count expected-errors)))))
72 (utf8-decode-tests (octets expected-results)
73 (let ((expected-errors (count #\? expected-results)))
74 (utf8-decode-test octets expected-results expected-errors)
75 (utf8-decode-test (concatenate 'vector
76 '(34)
77 octets
78 '(34))
79 (format nil "\"~A\"" expected-results)
80 expected-errors))))
82 (ensure-roundtrip-ascii)
83 (ensure-roundtrip-latin1)
84 #+(and sb-unicode (not unicode-lite))
85 (progn
86 (ensure-roundtrip-latin9)
87 ;; Latin-9 chars; the previous test checked roundtrip from
88 ;; octets->char and back, now test that the latin-9 characters did
89 ;; in fact appear during that trip.
90 (let ((l9c (map 'string #'code-char '(8364 352 353 381 382 338 339 376))))
91 (assert
92 (string= (octets-to-string (string-to-octets l9c :external-format :latin9)
93 :external-format :latin9)
94 l9c))))
95 (ensure-roundtrip-utf8)
97 (with-test (:name (:ascii :decoding-error use-value))
98 (let ((non-ascii-bytes (make-array 128
99 :element-type '(unsigned-byte 8)
100 :initial-contents (loop for i from 128 below 256 collect i)))
101 (error-count 0))
102 (handler-bind ((sb-int:character-decoding-error
103 (lambda (c)
104 (incf error-count)
105 (use-value "??" c))))
106 (assert (string= (octets-to-string non-ascii-bytes :external-format :ascii)
107 (make-string 256 :initial-element #\?)))
108 (assert (= error-count 128)))))
109 (with-test (:name (:ascii :encoding-error use-value))
110 (let ((non-ascii-chars (make-array 128
111 :element-type 'character
112 :initial-contents (loop for i from 128 below 256 collect (code-char i))))
113 (error-count 0))
114 (handler-bind ((sb-int:character-encoding-error
115 (lambda (c)
116 (incf error-count)
117 (use-value "??" c))))
118 (assert (equalp (string-to-octets non-ascii-chars :external-format :ascii)
119 (make-array 256 :initial-element (char-code #\?))))
120 (assert (= error-count 128)))))
122 ;; From Markus Kuhn's UTF-8 test file:
123 ;; http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
125 ;; Too-big characters
126 #-sb-unicode
127 (progn
128 (utf8-decode-tests #(#xc4 #x80) "?") ; #x100
129 (utf8-decode-tests #(#xdf #xbf) "?") ; #x7ff
130 (utf8-decode-tests #(#xe0 #xa0 #x80) "?") ; #x800
131 (utf8-decode-tests #(#xef #xbf #xbf) "?") ; #xffff
132 (utf8-decode-tests #(#xf0 #x90 #x80 #x80) "?")) ; #x10000
133 #+nil ; old, 6-byte UTF-8 definition
134 (progn
135 (utf8-decode-tests #(#xf4 #x90 #x80 #x80) "?") ; #x110000
136 (utf8-decode-tests #(#xf7 #xbf #xbf #xbf) "?") ; #x1fffff
137 (utf8-decode-tests #(#xf8 #x88 #x80 #x80 #x80) "?") ; #x200000
138 (utf8-decode-tests #(#xfb #xbf #xbf #xbf #xbf) "?") ; #x3ffffff
139 (utf8-decode-tests #(#xfc #x84 #x80 #x80 #x80 #x80) "?") ; #x4000000
140 (utf8-decode-tests #(#xfd #xbf #xbf #xbf #xbf #xbf) "?")) ; #x7fffffff
141 (progn ; new, 4-byte (maximum #x10ffff) UTF-8 definition
142 (utf8-decode-tests #(#xf4 #x90) "??") ; #x110000
143 (utf8-decode-tests #(#xf7 #xbf #xbf #xbf) "????") ; #x1fffff
144 (utf8-decode-tests #(#xf8 #x88 #x80 #x80 #x80) "?????") ; #x200000
145 (utf8-decode-tests #(#xfb #xbf #xbf #xbf #xbf) "?????") ; #x3ffffff
146 (utf8-decode-tests #(#xfc #x84 #x80 #x80 #x80 #x80) "??????") ; #x4000000
147 (utf8-decode-tests #(#xfd #xbf #xbf #xbf #xbf #xbf) "??????")) ; #x7fffffff
149 ;; Unexpected continuation bytes
150 (utf8-decode-tests #(#x80) "?")
151 (utf8-decode-tests #(#xbf) "?")
152 (utf8-decode-tests #(#x80 #xbf) "??")
153 (utf8-decode-tests #(#x80 #xbf #x80) "???")
154 (utf8-decode-tests #(#x80 #xbf #x80 #xbf) "????")
155 (utf8-decode-tests #(#x80 #xbf #x80 #xbf #x80) "?????")
156 (utf8-decode-tests #(#x80 #xbf #x80 #xbf #x80 #xbf) "??????")
157 (utf8-decode-tests #(#x80 #xbf #x80 #xbf #x80 #xbf #x80) "???????")
159 ;; All 64 continuation bytes in a row
160 (apply #'utf8-decode-tests
161 (loop for i from #x80 to #xbf
162 collect i into bytes
163 collect #\? into chars
164 finally (return (list bytes
165 (coerce chars 'string)))))
167 ;; Lonely start characters
168 (flet ((lsc (first last)
169 (apply #'utf8-decode-tests
170 (loop for i from first to last
171 nconc (list i 32) into bytes
172 nconc (list #\? #\Space) into chars
173 finally (return (list bytes
174 (coerce chars 'string)))))
175 (apply #'utf8-decode-tests
176 (loop for i from first to last
177 collect i into bytes
178 collect #\? into chars
179 finally (return (list bytes
180 (coerce chars 'string)))))))
181 (lsc #xc0 #xdf) ; 2-byte sequence start chars
182 (lsc #xe0 #xef) ; 3-byte
183 (lsc #xf0 #xf7) ; 4-byte
184 (lsc #xf8 #xfb) ; 5-byte
185 (lsc #xfc #xfd)) ; 6-byte
187 ;; Otherwise incomplete sequences (last continuation byte missing)
188 (utf8-decode-tests #0=#(#xc0) "?")
189 (utf8-decode-tests #1=#(#xe0 #xa0) "?")
190 (utf8-decode-tests #2=#(#xf0 #x90 #x80) "?")
191 #+nil
192 (utf8-decode-tests #3=#(#xf8 #x80 #x80 #x80) "?")
193 #+nil
194 (utf8-decode-tests #4=#(#xfc #x80 #x80 #x80 #x80) "?")
195 (utf8-decode-tests #5=#(#xdf) "?")
196 (utf8-decode-tests #6=#(#xef #xbf) "?")
197 #+nil
198 (utf8-decode-tests #7=#(#xf7 #xbf #xbf) "?")
199 #+nil
200 (utf8-decode-tests #8=#(#xfb #xbf #xbf #xbf) "?")
201 #+nil
202 (utf8-decode-tests #9=#(#xfd #xbf #xbf #xbf #xbf) "?")
204 ;; All ten previous tests concatenated
205 (utf8-decode-tests (concatenate 'vector #0# #1# #2# #5# #6#)
206 "?????")
208 ;; Random impossible bytes
209 (utf8-decode-tests #(#xfe) "?")
210 (utf8-decode-tests #(#xff) "?")
211 (utf8-decode-tests #(#xfe #xfe #xff #xff) "????")
213 ;; Overlong sequences - /
214 (utf8-decode-tests #(#xc0 #xaf) "??")
215 (utf8-decode-tests #(#xe0 #x80 #xaf) "???")
216 (utf8-decode-tests #(#xf0 #x80 #x80 #xaf) "????")
217 (utf8-decode-tests #(#xf8 #x80 #x80 #x80 #xaf) "?????")
218 (utf8-decode-tests #(#xfc #x80 #x80 #x80 #x80 #xaf) "??????")
220 ;; Overlong sequences - #\Rubout
221 (utf8-decode-tests #(#xc1 #xbf) "??")
222 (utf8-decode-tests #(#xe0 #x9f #xbf) "???")
223 (utf8-decode-tests #(#xf0 #x8f #xbf #xbf) "????")
224 (utf8-decode-tests #(#xf8 #x87 #xbf #xbf #xbf) "?????")
225 (utf8-decode-tests #(#xfc #x83 #xbf #xbf #xbf #xbf) "??????")
227 ;; Overlong sequences - #\Null
228 (utf8-decode-tests #(#xc0 #x80) "??")
229 (utf8-decode-tests #(#xe0 #x80 #x80) "???")
230 (utf8-decode-tests #(#xf0 #x80 #x80 #x80) "????")
231 (utf8-decode-tests #(#xf8 #x80 #x80 #x80 #x80) "?????")
232 (utf8-decode-tests #(#xfc #x80 #x80 #x80 #x80 #x80) "??????")
234 ;; Not testing surrogates & characters #xFFFE, #xFFFF; they're
235 ;; perfectly good sbcl chars even if they're not actually ISO 10646
236 ;; characters, and it's probably a good idea for s-to-o and o-to-s
237 ;; to be inverses of each other as far as possible.
242 ;;; regression test: STRING->UTF8 didn't properly handle a non-zero
243 ;;; START argument.
244 (assert (equalp #(50) (string-to-octets "42" :start 1 :external-format :utf-8)))
246 (assert (equalp #() (string-to-octets "" :external-format :utf-8)))
247 (assert (equalp #() (string-to-octets (make-string 0)
248 :external-format :utf-8)))
249 (assert (equalp #() (string-to-octets (make-string 5)
250 :start 3 :end 3 :external-format :utf-8)))
251 (assert (equalp #(0) (string-to-octets (make-string 5)
252 :start 3 :end 3 :null-terminate t
253 :external-format :utf-8)))
255 ;;; whoops: the iso-8859-2 format referred to an undefined symbol.
256 #+(and sb-unicode (not unicode-lite))
257 (assert (equalp #(251) (string-to-octets (string (code-char 369))
258 :external-format :latin-2)))
260 (with-test (:name (:euc-jp :encoding-errors) :skipped-on (or (not :sb-unicode) :unicode-lite))
261 (handler-bind ((sb-int:character-encoding-error
262 (lambda (c) (use-value #\? c))))
263 (assert (equalp (string-to-octets (map 'string 'code-char '(#xb2 #x5e #xb3))
264 :external-format :euc-jp)
265 (vector (char-code #\?) (char-code #\^) (char-code #\?))))))
266 (with-test (:name (:euc-jp :decoding-errors) :skipped-on (or (not :sb-unicode) :unicode-lite))
267 (handler-bind ((sb-int:character-decoding-error
268 (lambda (c) (use-value #\? c))))
269 (assert (string= "?{?"
270 (octets-to-string
271 (coerce #(182 123 253 238) '(vector (unsigned-byte 8)))
272 :external-format :euc-jp)))))
274 (with-test (:name (:utf-8 :surrogates :encoding-errors) :skipped-on (not :sb-unicode))
275 (handler-bind ((sb-int:character-encoding-error
276 (lambda (c) (use-value #\? c))))
277 (assert (equalp (string-to-octets (string (code-char #xd800))
278 :external-format :utf-8)
279 (vector (char-code #\?))))))
280 (with-test (:name (:utf-8 :surrogates :decoding-errors) :skipped-on (not :sb-unicode))
281 (handler-bind ((sb-int:character-decoding-error
282 (lambda (c) (use-value #\? c))))
283 (assert (find #\? (octets-to-string
284 (coerce #(237 160 128) '(vector (unsigned-byte 8)))
285 :external-format :utf-8)))))
287 (with-test (:name (:ucs-2 :out-of-range :encoding-errors) :skipped-on (not :sb-unicode))
288 (handler-bind ((sb-int:character-encoding-error
289 (lambda (c) (use-value "???" c))))
290 (assert (equalp (string-to-octets (string (code-char #x10001))
291 :external-format :ucs-2le)
292 #(63 0 63 0 63 0))))
293 (handler-bind ((sb-int:character-encoding-error
294 (lambda (c) (use-value "???" c))))
295 (assert (equalp (string-to-octets (string (code-char #x10001))
296 :external-format :ucs-2be)
297 #(0 63 0 63 0 63)))))
299 (with-test (:name (:ucs-4 :out-of-range :decoding-errors) :skipped-on (not :sb-unicode))
300 (handler-bind ((sb-int:character-decoding-error
301 (lambda (c) (use-value "???" c))))
302 (assert (equalp (octets-to-string (coerce '(1 2 3 4) '(vector (unsigned-byte 8)))
303 :external-format :ucs-4le)
304 "???")))
305 (assert (equalp (octets-to-string (coerce '(#xff #xff #x10 #x00) '(vector (unsigned-byte 8)))
306 :external-format :ucs-4le)
307 (string (code-char #x10ffff))))
308 (handler-bind ((sb-int:character-decoding-error
309 (lambda (c) (use-value "???" c))))
310 (assert (equalp (octets-to-string (coerce '(1 2 3 4) '(vector (unsigned-byte 8)))
311 :external-format :ucs-4be)
312 "???"))
313 (assert (equalp (octets-to-string (coerce '(#x00 #x10 #xff #xff) '(vector (unsigned-byte 8)))
314 :external-format :ucs-4be)
315 (string (code-char #x10ffff))))))
317 (with-test (:name (:utf-16le :ensure-roundtrip) :skipped-on (not :sb-unicode))
318 (flet ((enc (x)
319 (string-to-octets x :external-format :utf-16le))
320 (dec (x)
321 (octets-to-string (coerce x '(vector (unsigned-byte 8)))
322 :external-format :utf-16le)))
323 (let ((string (map 'string 'code-char
324 '(#x20 #x200 #x2000 #x10000 #x10401 #x10fffd)))
325 (octets #(#x20 0 0 #x2 0 #x20 0 #xd8 0 #xdc 1 #xd8 1 #xdc #xff #xdb #xfd #xdf)))
326 (assert (equalp (enc string) octets))
327 (assert (equalp (dec octets) string)))))
329 (with-test (:name (:utf-16le :encoding-error) :skipped-on (not :sb-unicode))
330 (flet ((enc (x)
331 (string-to-octets x :external-format '(:utf-16le :replacement #\?)))
332 (dec (x)
333 (octets-to-string (coerce x '(vector (unsigned-byte 8)))
334 :external-format :utf-16le)))
335 (let ((string (map 'string 'code-char '(#xd800 #xdc00 #xfffe #x10ffff))))
336 (assert (equalp (enc string) #(63 0 63 0 63 0 63 0))))))
338 (with-test (:name (:utf-16be :ensure-roundtrip) :skipped-on (not :sb-unicode))
339 (flet ((enc (x)
340 (string-to-octets x :external-format :utf-16be))
341 (dec (x)
342 (octets-to-string (coerce x '(vector (unsigned-byte 8)))
343 :external-format :utf-16be)))
344 (let ((string (map 'string 'code-char
345 '(#x20 #x200 #x2000 #x10000 #x10401 #x10fffd)))
346 (octets #(0 #x20 #x2 0 #x20 0 #xd8 0 #xdc 0 #xd8 1 #xdc 1 #xdb #xff #xdf #xfd)))
347 (assert (equalp (enc string) octets))
348 (assert (equalp (dec octets) string)))))
350 (with-test (:name (:utf-16be :encoding-error) :skipped-on (not :sb-unicode))
351 (flet ((enc (x)
352 (string-to-octets x :external-format '(:utf-16be :replacement #\?)))
353 (dec (x)
354 (octets-to-string (coerce x '(vector (unsigned-byte 8)))
355 :external-format :utf-16be)))
356 (let ((string (map 'string 'code-char '(#xd800 #xdc00 #xfffe #x10ffff))))
357 (assert (equalp (enc string) #(0 63 0 63 0 63 0 63))))))
360 (with-test (:name (:utf-32le :ensure-roundtrip) :skipped-on (not :sb-unicode))
361 (flet ((enc (x)
362 (string-to-octets x :external-format :utf-32le))
363 (dec (x)
364 (octets-to-string (coerce x '(vector (unsigned-byte 8)))
365 :external-format :utf-32le)))
366 (let ((string (map 'string 'code-char
367 '(#x20 #x200 #x2000 #x10000 #x10401 #x10fffd)))
368 (octets #(#x20 0 0 0 0 #x2 0 0 0 #x20 0 0 0 0 1 0 1 4 1 0 #xfd #xff #x10 0)))
369 (assert (equalp (enc string) octets))
370 (assert (equalp (dec octets) string)))))
372 (with-test (:name (:utf-32le :encoding-error) :skipped-on (not :sb-unicode))
373 (flet ((enc (x)
374 (string-to-octets x :external-format '(:utf-32le :replacement #\?)))
375 (dec (x)
376 (octets-to-string (coerce x '(vector (unsigned-byte 8)))
377 :external-format :utf-32le)))
378 (let ((string (map 'string 'code-char '(#xd800 #xdc00 #xfffe #x10ffff))))
379 (assert (equalp (enc string) #(63 0 0 0 63 0 0 0 63 0 0 0 63 0 0 0))))))
382 (with-test (:name (:utf-32be :ensure-roundtrip) :skipped-on (not :sb-unicode))
383 (flet ((enc (x)
384 (string-to-octets x :external-format :utf-32be))
385 (dec (x)
386 (octets-to-string (coerce x '(vector (unsigned-byte 8)))
387 :external-format :utf-32be)))
388 (let ((string (map 'string 'code-char
389 '(#x20 #x200 #x2000 #x10000 #x10401 #x10fffd)))
390 (octets #(0 0 0 #x20 0 0 #x2 0 0 0 #x20 0 0 1 0 0 0 1 4 1 0 #x10 #xff #xfd)))
391 (assert (equalp (enc string) octets))
392 (assert (equalp (dec octets) string)))))
394 (with-test (:name (:utf-32be :encoding-error) :skipped-on (not :sb-unicode))
395 (flet ((enc (x)
396 (string-to-octets x :external-format '(:utf-32be :replacement #\?)))
397 (dec (x)
398 (octets-to-string (coerce x '(vector (unsigned-byte 8)))
399 :external-format :utf-32be)))
400 (let ((string (map 'string 'code-char '(#xd800 #xdc00 #xfffe #x10ffff))))
401 (assert (equalp (enc string) #(0 0 0 63 0 0 0 63 0 0 0 63 0 0 0 63))))))
403 (with-test (:name :compile-file-position-with-encodings
404 :skipped-on (not :sb-unicode))
405 (with-open-file (f1 "data/compile-file-pos.lisp" :external-format :utf-8)
406 (with-open-file (f2 "data/compile-file-pos-utf16be.lisp"
407 :external-format :utf-16be)
408 (dotimes (i 3) ; skip three lines
409 ;; because a comment line differs, and the function names differ
410 (read-line f1)
411 (read-line f2))
412 (dotimes (i 3) ; compare three lines
413 (assert (string= (read-line f1) (read-line f2))))))
414 (flet ((compile-and-load (file encoding main-fun)
415 (let ((fasl (compile-file file
416 :output-file (scratch-file-name "fasl")
417 :external-format encoding
418 :print nil :verbose nil)))
419 (load fasl)
420 (delete-file fasl)
421 (funcall main-fun))))
422 (multiple-value-bind (a1 b1 c1)
423 (compile-and-load "data/compile-file-pos.lisp" :utf-8 'cfp-foolz1)
424 (multiple-value-bind (a2 b2 c2)
425 (compile-and-load "data/compile-file-pos-utf16be.lisp" :utf-16be
426 'cfp-foolz2)
427 (assert (string= a1 a2))
428 (assert (string= b1 b2))
429 ;; COMPILE-FILE-POSITION is insensitive to file encoding.
430 (assert (string= c1 c2))))))
432 (with-test (:name (:compile-constant-bad-external-format :octets-to-string))
433 (checked-compile-and-assert ()
434 '(lambda (x) (sb-ext:octets-to-string x :external-format 0))
435 (((make-array 0 :element-type '(unsigned-byte 8))) (condition 'error))))
436 (with-test (:name (:compile-constant-bad-external-format :string-to-octets))
437 (checked-compile-and-assert ()
438 '(lambda (x) (sb-ext:octets-to-string x :external-format 0))
439 (("foo") (condition 'error))))