1.0.37.8: add ATOMIC-DECF, fix WAIT-ON-SEMAPHORE-BUGLET
[sbcl/nikodemus.git] / tests / octets.pure.lisp
blobd40b26095877c393b4fb8d0336531dc3ebe84fda
1 ;;;; tests of octet/character machinery with no side effects
3 ;;;; This software is part of the SBCL system. See the README file for
4 ;;;; more information.
5 ;;;;
6 ;;;; While most of SBCL is derived from the CMU CL system, the test
7 ;;;; files (like this one) were written from scratch after the fork
8 ;;;; from CMU CL.
9 ;;;;
10 ;;;; This software is in the public domain and is provided with
11 ;;;; absolutely no warranty. See the COPYING and CREDITS files for
12 ;;;; more information.
14 (cl:in-package :cl-user)
16 (locally
17 (declare (optimize debug (speed 0)))
19 (labels ((ub8 (len-or-seq)
20 (if (numberp len-or-seq)
21 (make-array len-or-seq :element-type '(unsigned-byte 8) :initial-element 0)
22 (coerce len-or-seq '(simple-array (unsigned-byte 8) (*)))))
24 (ensure-roundtrip-ascii ()
25 (let ((octets (ub8 128)))
26 (dotimes (i 128)
27 (setf (aref octets i) i))
28 (let* ((str (octets-to-string octets :external-format :ascii))
29 (oct2 (string-to-octets str :external-format :ascii)))
30 (assert (= (length octets) (length oct2)))
31 (assert (every #'= octets oct2))))
34 (ensure-roundtrip-latin (format)
35 (let ((octets (ub8 256)))
36 (dotimes (i 256)
37 (setf (aref octets i) i))
38 (let* ((str (octets-to-string octets :external-format format))
39 (oct2 (string-to-octets str :external-format format)))
40 (assert (= (length octets) (length oct2)))
41 (assert (every #'= octets oct2))))
44 (ensure-roundtrip-latin1 ()
45 (ensure-roundtrip-latin :latin1))
47 #+sb-unicode
48 (ensure-roundtrip-latin9 ()
49 (ensure-roundtrip-latin :latin9))
51 (ensure-roundtrip-utf8 ()
52 (let ((string (make-string char-code-limit)))
53 (dotimes (i char-code-limit)
54 (unless (<= #xd800 i #xdfff)
55 (setf (char string i) (code-char i))))
56 (let ((string2
57 (octets-to-string (string-to-octets string :external-format :utf8)
58 :external-format :utf8)))
59 (assert (= (length string2) (length string)))
60 (assert (string= string string2))))
63 (utf8-decode-test (octets expected-results expected-errors)
64 (let ((error-count 0))
65 (handler-bind ((sb-int:character-decoding-error
66 (lambda (c)
67 (incf error-count)
68 (use-value "?" c))))
69 (assert (string= expected-results
70 (octets-to-string (ub8 octets)
71 :external-format :utf-8)))
72 (assert (= error-count expected-errors)))))
74 (utf8-decode-tests (octets expected-results)
75 (let ((expected-errors (count #\? expected-results)))
76 (utf8-decode-test octets expected-results expected-errors)
77 (utf8-decode-test (concatenate 'vector
78 '(34)
79 octets
80 '(34))
81 (format nil "\"~A\"" expected-results)
82 expected-errors))))
84 (ensure-roundtrip-ascii)
85 (ensure-roundtrip-latin1)
86 #+sb-unicode
87 (progn
88 (ensure-roundtrip-latin9)
89 ;; Latin-9 chars; the previous test checked roundtrip from
90 ;; octets->char and back, now test that the latin-9 characters did
91 ;; in fact appear during that trip.
92 (let ((l9c (map 'string #'code-char '(8364 352 353 381 382 338 339 376))))
93 (assert
94 (string= (octets-to-string (string-to-octets l9c :external-format :latin9)
95 :external-format :latin9)
96 l9c))))
97 (ensure-roundtrip-utf8)
99 (with-test (:name (:ascii :decoding-error use-value))
100 (let ((non-ascii-bytes (make-array 128
101 :element-type '(unsigned-byte 8)
102 :initial-contents (loop for i from 128 below 256 collect i)))
103 (error-count 0))
104 (handler-bind ((sb-int:character-decoding-error
105 (lambda (c)
106 (incf error-count)
107 (use-value "??" c))))
108 (assert (string= (octets-to-string non-ascii-bytes :external-format :ascii)
109 (make-string 256 :initial-element #\?)))
110 (assert (= error-count 128)))))
111 (with-test (:name (:ascii :encoding-error use-value))
112 (let ((non-ascii-chars (make-array 128
113 :element-type 'character
114 :initial-contents (loop for i from 128 below 256 collect (code-char i))))
115 (error-count 0))
116 (handler-bind ((sb-int:character-encoding-error
117 (lambda (c)
118 (incf error-count)
119 (use-value "??" c))))
120 (assert (equalp (string-to-octets non-ascii-chars :external-format :ascii)
121 (make-array 256 :initial-element (char-code #\?))))
122 (assert (= error-count 128)))))
124 ;; From Markus Kuhn's UTF-8 test file:
125 ;; http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
127 ;; Too-big characters
128 #-sb-unicode
129 (progn
130 (utf8-decode-tests #(#xc4 #x80) "?") ; #x100
131 (utf8-decode-tests #(#xdf #xbf) "?") ; #x7ff
132 (utf8-decode-tests #(#xe0 #xa0 #x80) "?") ; #x800
133 (utf8-decode-tests #(#xef #xbf #xbf) "?") ; #xffff
134 (utf8-decode-tests #(#xf0 #x90 #x80 #x80) "?")) ; #x10000
135 #+nil ; old, 6-byte UTF-8 definition
136 (progn
137 (utf8-decode-tests #(#xf4 #x90 #x80 #x80) "?") ; #x110000
138 (utf8-decode-tests #(#xf7 #xbf #xbf #xbf) "?") ; #x1fffff
139 (utf8-decode-tests #(#xf8 #x88 #x80 #x80 #x80) "?") ; #x200000
140 (utf8-decode-tests #(#xfb #xbf #xbf #xbf #xbf) "?") ; #x3ffffff
141 (utf8-decode-tests #(#xfc #x84 #x80 #x80 #x80 #x80) "?") ; #x4000000
142 (utf8-decode-tests #(#xfd #xbf #xbf #xbf #xbf #xbf) "?")) ; #x7fffffff
143 (progn ; new, 4-byte (maximum #x10ffff) UTF-8 definition
144 (utf8-decode-tests #(#xf4 #x90) "??") ; #x110000
145 (utf8-decode-tests #(#xf7 #xbf #xbf #xbf) "????") ; #x1fffff
146 (utf8-decode-tests #(#xf8 #x88 #x80 #x80 #x80) "?????") ; #x200000
147 (utf8-decode-tests #(#xfb #xbf #xbf #xbf #xbf) "?????") ; #x3ffffff
148 (utf8-decode-tests #(#xfc #x84 #x80 #x80 #x80 #x80) "??????") ; #x4000000
149 (utf8-decode-tests #(#xfd #xbf #xbf #xbf #xbf #xbf) "??????")) ; #x7fffffff
151 ;; Unexpected continuation bytes
152 (utf8-decode-tests #(#x80) "?")
153 (utf8-decode-tests #(#xbf) "?")
154 (utf8-decode-tests #(#x80 #xbf) "??")
155 (utf8-decode-tests #(#x80 #xbf #x80) "???")
156 (utf8-decode-tests #(#x80 #xbf #x80 #xbf) "????")
157 (utf8-decode-tests #(#x80 #xbf #x80 #xbf #x80) "?????")
158 (utf8-decode-tests #(#x80 #xbf #x80 #xbf #x80 #xbf) "??????")
159 (utf8-decode-tests #(#x80 #xbf #x80 #xbf #x80 #xbf #x80) "???????")
161 ;; All 64 continuation bytes in a row
162 (apply #'utf8-decode-tests
163 (loop for i from #x80 to #xbf
164 collect i into bytes
165 collect #\? into chars
166 finally (return (list bytes
167 (coerce chars 'string)))))
169 ;; Lonely start characters
170 (flet ((lsc (first last)
171 (apply #'utf8-decode-tests
172 (loop for i from first to last
173 nconc (list i 32) into bytes
174 nconc (list #\? #\Space) into chars
175 finally (return (list bytes
176 (coerce chars 'string)))))
177 (apply #'utf8-decode-tests
178 (loop for i from first to last
179 collect i into bytes
180 collect #\? into chars
181 finally (return (list bytes
182 (coerce chars 'string)))))))
183 (lsc #xc0 #xdf) ; 2-byte sequence start chars
184 (lsc #xe0 #xef) ; 3-byte
185 (lsc #xf0 #xf7) ; 4-byte
186 (lsc #xf8 #xfb) ; 5-byte
187 (lsc #xfc #xfd)) ; 6-byte
189 ;; Otherwise incomplete sequences (last continuation byte missing)
190 (utf8-decode-tests #0=#(#xc0) "?")
191 (utf8-decode-tests #1=#(#xe0 #xa0) "?")
192 (utf8-decode-tests #2=#(#xf0 #x90 #x80) "?")
193 #+nil
194 (utf8-decode-tests #3=#(#xf8 #x80 #x80 #x80) "?")
195 #+nil
196 (utf8-decode-tests #4=#(#xfc #x80 #x80 #x80 #x80) "?")
197 (utf8-decode-tests #5=#(#xdf) "?")
198 (utf8-decode-tests #6=#(#xef #xbf) "?")
199 #+nil
200 (utf8-decode-tests #7=#(#xf7 #xbf #xbf) "?")
201 #+nil
202 (utf8-decode-tests #8=#(#xfb #xbf #xbf #xbf) "?")
203 #+nil
204 (utf8-decode-tests #9=#(#xfd #xbf #xbf #xbf #xbf) "?")
206 ;; All ten previous tests concatenated
207 (utf8-decode-tests (concatenate 'vector #0# #1# #2# #5# #6#)
208 "?????")
210 ;; Random impossible bytes
211 (utf8-decode-tests #(#xfe) "?")
212 (utf8-decode-tests #(#xff) "?")
213 (utf8-decode-tests #(#xfe #xfe #xff #xff) "????")
215 ;; Overlong sequences - /
216 (utf8-decode-tests #(#xc0 #xaf) "??")
217 (utf8-decode-tests #(#xe0 #x80 #xaf) "???")
218 (utf8-decode-tests #(#xf0 #x80 #x80 #xaf) "????")
219 (utf8-decode-tests #(#xf8 #x80 #x80 #x80 #xaf) "?????")
220 (utf8-decode-tests #(#xfc #x80 #x80 #x80 #x80 #xaf) "??????")
222 ;; Overlong sequences - #\Rubout
223 (utf8-decode-tests #(#xc1 #xbf) "??")
224 (utf8-decode-tests #(#xe0 #x9f #xbf) "???")
225 (utf8-decode-tests #(#xf0 #x8f #xbf #xbf) "????")
226 (utf8-decode-tests #(#xf8 #x87 #xbf #xbf #xbf) "?????")
227 (utf8-decode-tests #(#xfc #x83 #xbf #xbf #xbf #xbf) "??????")
229 ;; Overlong sequences - #\Null
230 (utf8-decode-tests #(#xc0 #x80) "??")
231 (utf8-decode-tests #(#xe0 #x80 #x80) "???")
232 (utf8-decode-tests #(#xf0 #x80 #x80 #x80) "????")
233 (utf8-decode-tests #(#xf8 #x80 #x80 #x80 #x80) "?????")
234 (utf8-decode-tests #(#xfc #x80 #x80 #x80 #x80 #x80) "??????")
236 ;; Not testing surrogates & characters #xFFFE, #xFFFF; they're
237 ;; perfectly good sbcl chars even if they're not actually ISO 10646
238 ;; characters, and it's probably a good idea for s-to-o and o-to-s
239 ;; to be inverses of each other as far as possible.
244 ;;; regression test: STRING->UTF8 didn't properly handle a non-zero
245 ;;; START argument.
246 (assert (equalp #(50) (string-to-octets "42" :start 1 :external-format :utf-8)))
248 ;;; STRING->UTF8 should cope with NIL strings if a null range is required
249 (assert (equalp #() (string-to-octets "" :external-format :utf-8)))
250 (assert (equalp #() (string-to-octets (make-array 0 :element-type nil)
251 :external-format :utf-8)))
252 (assert (equalp #() (string-to-octets (make-array 5 :element-type nil)
253 :start 3 :end 3 :external-format :utf-8)))
254 (assert (equalp #(0) (string-to-octets (make-array 5 :element-type nil)
255 :start 3 :end 3 :null-terminate t
256 :external-format :utf-8)))
258 ;;; whoops: the iso-8859-2 format referred to an undefined symbol.
259 #+sb-unicode
260 (assert (equalp #(251) (string-to-octets (string (code-char 369))
261 :external-format :latin-2)))
263 #+sb-unicode
264 (with-test (:name (:euc-jp :decoding-errors))
265 (handler-bind ((sb-int:character-decoding-error
266 (lambda (c) (use-value #\? c))))
267 (assert (string= "?{?"
268 (octets-to-string
269 (coerce #(182 123 253 238) '(vector (unsigned-byte 8)))
270 :external-format :euc-jp)))))
272 #+sb-unicode
273 (with-test (:name (:utf-8 :surrogates :encoding-errors))
274 (handler-bind ((sb-int:character-encoding-error
275 (lambda (c) (use-value #\? c))))
276 (assert (equalp (string-to-octets (string (code-char #xd800))
277 :external-format :utf-8)
278 (vector (char-code #\?))))))
279 #+sb-unicode
280 (with-test (:name (:utf-8 :surrogates :decoding-errors))
281 (handler-bind ((sb-int:character-decoding-error
282 (lambda (c) (use-value #\? c))))
283 (assert (find #\? (octets-to-string
284 (coerce #(237 160 128) '(vector (unsigned-byte 8)))
285 :external-format :utf-8)))))
287 #+sb-unicode
288 (with-test (:name (:ucs-2 :out-of-range :encoding-errors))
289 (handler-bind ((sb-int:character-encoding-error
290 (lambda (c) (use-value "???" c))))
291 (assert (equalp (string-to-octets (string (code-char #x10001))
292 :external-format :ucs-2le)
293 #(63 0 63 0 63 0))))
294 (handler-bind ((sb-int:character-encoding-error
295 (lambda (c) (use-value "???" c))))
296 (assert (equalp (string-to-octets (string (code-char #x10001))
297 :external-format :ucs-2be)
298 #(0 63 0 63 0 63)))))
300 #+sb-unicode
301 (with-test (:name (:ucs-4 :out-of-range :decoding-errors))
302 (handler-bind ((sb-int:character-decoding-error
303 (lambda (c) (use-value "???" c))))
304 (assert (equalp (octets-to-string (coerce '(1 2 3 4) '(vector (unsigned-byte 8)))
305 :external-format :ucs-4le)
306 "???")))
307 (assert (equalp (octets-to-string (coerce '(#xff #xff #x10 #x00) '(vector (unsigned-byte 8)))
308 :external-format :ucs-4le)
309 (string (code-char #x10ffff))))
310 (handler-bind ((sb-int:character-decoding-error
311 (lambda (c) (use-value "???" c))))
312 (assert (equalp (octets-to-string (coerce '(1 2 3 4) '(vector (unsigned-byte 8)))
313 :external-format :ucs-4be)
314 "???"))
315 (assert (equalp (octets-to-string (coerce '(#x00 #x10 #xff #xff) '(vector (unsigned-byte 8)))
316 :external-format :ucs-4be)
317 (string (code-char #x10ffff))))))
319 #+sb-unicode
320 (with-test (:name (:utf-16le :ensure-roundtrip))
321 (flet ((enc (x)
322 (string-to-octets x :external-format :utf-16le))
323 (dec (x)
324 (octets-to-string (coerce x '(vector (unsigned-byte 8)))
325 :external-format :utf-16le)))
326 (let ((string (map 'string 'code-char
327 '(#x20 #x200 #x2000 #x10000 #x10401 #x10fffd)))
328 (octets #(#x20 0 0 #x2 0 #x20 0 #xd8 0 #xdc 1 #xd8 1 #xdc #xff #xdb #xfd #xdf)))
329 (assert (equalp (enc string) octets))
330 (assert (equalp (dec octets) string)))))
331 #+sb-unicode
332 (with-test (:name (:utf-16le :encoding-error))
333 (flet ((enc (x)
334 (string-to-octets x :external-format '(:utf-16le :replacement #\?)))
335 (dec (x)
336 (octets-to-string (coerce x '(vector (unsigned-byte 8)))
337 :external-format :utf-16le)))
338 (let ((string (map 'string 'code-char '(#xd800 #xdc00 #xfffe #x10ffff))))
339 (assert (equalp (enc string) #(63 0 63 0 63 0 63 0))))))
341 #+sb-unicode
342 (with-test (:name (:utf-16be :ensure-roundtrip))
343 (flet ((enc (x)
344 (string-to-octets x :external-format :utf-16be))
345 (dec (x)
346 (octets-to-string (coerce x '(vector (unsigned-byte 8)))
347 :external-format :utf-16be)))
348 (let ((string (map 'string 'code-char
349 '(#x20 #x200 #x2000 #x10000 #x10401 #x10fffd)))
350 (octets #(0 #x20 #x2 0 #x20 0 #xd8 0 #xdc 0 #xd8 1 #xdc 1 #xdb #xff #xdf #xfd)))
351 (assert (equalp (enc string) octets))
352 (assert (equalp (dec octets) string)))))
353 #+sb-unicode
354 (with-test (:name (:utf-16be :encoding-error))
355 (flet ((enc (x)
356 (string-to-octets x :external-format '(:utf-16be :replacement #\?)))
357 (dec (x)
358 (octets-to-string (coerce x '(vector (unsigned-byte 8)))
359 :external-format :utf-16be)))
360 (let ((string (map 'string 'code-char '(#xd800 #xdc00 #xfffe #x10ffff))))
361 (assert (equalp (enc string) #(0 63 0 63 0 63 0 63))))))
363 #+sb-unicode
364 (with-test (:name (:utf-32le :ensure-roundtrip))
365 (flet ((enc (x)
366 (string-to-octets x :external-format :utf-32le))
367 (dec (x)
368 (octets-to-string (coerce x '(vector (unsigned-byte 8)))
369 :external-format :utf-32le)))
370 (let ((string (map 'string 'code-char
371 '(#x20 #x200 #x2000 #x10000 #x10401 #x10fffd)))
372 (octets #(#x20 0 0 0 0 #x2 0 0 0 #x20 0 0 0 0 1 0 1 4 1 0 #xfd #xff #x10 0)))
373 (assert (equalp (enc string) octets))
374 (assert (equalp (dec octets) string)))))
375 #+sb-unicode
376 (with-test (:name (:utf-32le :encoding-error))
377 (flet ((enc (x)
378 (string-to-octets x :external-format '(:utf-32le :replacement #\?)))
379 (dec (x)
380 (octets-to-string (coerce x '(vector (unsigned-byte 8)))
381 :external-format :utf-32le)))
382 (let ((string (map 'string 'code-char '(#xd800 #xdc00 #xfffe #x10ffff))))
383 (assert (equalp (enc string) #(63 0 0 0 63 0 0 0 63 0 0 0 63 0 0 0))))))
385 #+sb-unicode
386 (with-test (:name (:utf-32be :ensure-roundtrip))
387 (flet ((enc (x)
388 (string-to-octets x :external-format :utf-32be))
389 (dec (x)
390 (octets-to-string (coerce x '(vector (unsigned-byte 8)))
391 :external-format :utf-32be)))
392 (let ((string (map 'string 'code-char
393 '(#x20 #x200 #x2000 #x10000 #x10401 #x10fffd)))
394 (octets #(0 0 0 #x20 0 0 #x2 0 0 0 #x20 0 0 1 0 0 0 1 4 1 0 #x10 #xff #xfd)))
395 (assert (equalp (enc string) octets))
396 (assert (equalp (dec octets) string)))))
397 #+sb-unicode
398 (with-test (:name (:utf-32be :encoding-error))
399 (flet ((enc (x)
400 (string-to-octets x :external-format '(:utf-32be :replacement #\?)))
401 (dec (x)
402 (octets-to-string (coerce x '(vector (unsigned-byte 8)))
403 :external-format :utf-32be)))
404 (let ((string (map 'string 'code-char '(#xd800 #xdc00 #xfffe #x10ffff))))
405 (assert (equalp (enc string) #(0 0 0 63 0 0 0 63 0 0 0 63 0 0 0 63))))))