1 ;;;; tests of octet/character machinery with no side effects
3 ;;;; This software is part of the SBCL system. See the README file for
6 ;;;; While most of SBCL is derived from the CMU CL system, the test
7 ;;;; files (like this one) were written from scratch after the fork
10 ;;;; This software is in the public domain and is provided with
11 ;;;; absolutely no warranty. See the COPYING and CREDITS files for
12 ;;;; more information.
14 (cl:in-package
:cl-user
)
17 (declare (optimize debug
(speed 0)))
19 (labels ((ub8 (len-or-seq)
20 (if (numberp len-or-seq
)
21 (make-array len-or-seq
:element-type
'(unsigned-byte 8) :initial-element
0)
22 (coerce len-or-seq
'(simple-array (unsigned-byte 8) (*)))))
24 (ensure-roundtrip-ascii ()
25 (let ((octets (ub8 128)))
27 (setf (aref octets i
) i
))
28 (let* ((str (octets-to-string octets
:external-format
:ascii
))
29 (oct2 (string-to-octets str
:external-format
:ascii
)))
30 (assert (= (length octets
) (length oct2
)))
31 (assert (every #'= octets oct2
))))
34 (ensure-roundtrip-latin (format)
35 (let ((octets (ub8 256)))
37 (setf (aref octets i
) i
))
38 (let* ((str (octets-to-string octets
:external-format format
))
39 (oct2 (string-to-octets str
:external-format format
)))
40 (assert (= (length octets
) (length oct2
)))
41 (assert (every #'= octets oct2
))))
44 (ensure-roundtrip-latin1 ()
45 (ensure-roundtrip-latin :latin1
))
48 (ensure-roundtrip-latin9 ()
49 (ensure-roundtrip-latin :latin9
))
51 (ensure-roundtrip-utf8 ()
52 (let ((string (make-string char-code-limit
)))
53 (dotimes (i char-code-limit
)
54 (setf (char string i
) (code-char i
)))
56 (octets-to-string (string-to-octets string
:external-format
:utf8
)
57 :external-format
:utf8
)))
58 (assert (= (length string2
) (length string
)))
59 (assert (string= string string2
))))
62 (utf8-decode-test (octets expected-results expected-errors
)
63 (let ((error-count 0))
64 (handler-bind ((sb-int:character-decoding-error
68 (assert (string= expected-results
69 (octets-to-string (ub8 octets
)
70 :external-format
:utf-8
)))
71 (assert (= error-count expected-errors
)))))
73 (utf8-decode-tests (octets expected-results
)
74 (let ((expected-errors (count #\? expected-results
)))
75 (utf8-decode-test octets expected-results expected-errors
)
76 (utf8-decode-test (concatenate 'vector
80 (format nil
"\"~A\"" expected-results
)
83 (ensure-roundtrip-ascii)
84 (ensure-roundtrip-latin1)
87 (ensure-roundtrip-latin9)
88 ;; Latin-9 chars; the previous test checked roundtrip from
89 ;; octets->char and back, now test that the latin-9 characters did
90 ;; in fact appear during that trip.
91 (let ((l9c (map 'string
#'code-char
'(8364 352 353 381 382 338 339 376))))
93 (string= (octets-to-string (string-to-octets l9c
:external-format
:latin9
)
94 :external-format
:latin9
)
96 (ensure-roundtrip-utf8)
98 (let ((non-ascii-bytes (make-array 128
99 :element-type
'(unsigned-byte 8)
100 :initial-contents
(loop for i from
128 below
256
102 (handler-bind ((sb-int:character-decoding-error
104 (use-value "??" c
))))
105 (assert (string= (octets-to-string non-ascii-bytes
:external-format
:ascii
)
106 (make-string 256 :initial-element
#\?)))))
107 (let ((non-ascii-chars (make-array 128
108 :element-type
'character
109 :initial-contents
(loop for i from
128 below
256
110 collect
(code-char i
)))))
111 (handler-bind ((sb-int:character-encoding-error
113 (use-value "??" c
))))
114 (assert (equalp (string-to-octets non-ascii-chars
:external-format
:ascii
)
115 (make-array 256 :initial-element
(char-code #\?))))))
117 ;; From Markus Kuhn's UTF-8 test file:
118 ;; http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
120 ;; Too-big characters
123 (utf8-decode-tests #(#xc4
#x80
) "?") ; #x100
124 (utf8-decode-tests #(#xdf
#xbf
) "?") ; #x7ff
125 (utf8-decode-tests #(#xe0
#xa0
#x80
) "?") ; #x800
126 (utf8-decode-tests #(#xef
#xbf
#xbf
) "?") ; #xffff
127 (utf8-decode-tests #(#xf0
#x90
#x80
#x80
) "?")) ; #x10000
128 (utf8-decode-tests #(#xf4
#x90
#x80
#x80
) "?") ; #x110000
129 (utf8-decode-tests #(#xf7
#xbf
#xbf
#xbf
) "?") ; #x1fffff
130 (utf8-decode-tests #(#xf8
#x88
#x80
#x80
#x80
) "?") ; #x200000
131 (utf8-decode-tests #(#xfb
#xbf
#xbf
#xbf
#xbf
) "?") ; #x3ffffff
132 (utf8-decode-tests #(#xfc
#x84
#x80
#x80
#x80
#x80
) "?") ; #x4000000
133 (utf8-decode-tests #(#xfd
#xbf
#xbf
#xbf
#xbf
#xbf
) "?") ; #x7fffffff
135 ;; Unexpected continuation bytes
136 (utf8-decode-tests #(#x80
) "?")
137 (utf8-decode-tests #(#xbf
) "?")
138 (utf8-decode-tests #(#x80
#xbf
) "??")
139 (utf8-decode-tests #(#x80
#xbf
#x80
) "???")
140 (utf8-decode-tests #(#x80
#xbf
#x80
#xbf
) "????")
141 (utf8-decode-tests #(#x80
#xbf
#x80
#xbf
#x80
) "?????")
142 (utf8-decode-tests #(#x80
#xbf
#x80
#xbf
#x80
#xbf
) "??????")
143 (utf8-decode-tests #(#x80
#xbf
#x80
#xbf
#x80
#xbf
#x80
) "???????")
145 ;; All 64 continuation bytes in a row
146 (apply #'utf8-decode-tests
147 (loop for i from
#x80 to
#xbf
149 collect
#\? into chars
150 finally
(return (list bytes
151 (coerce chars
'string
)))))
153 ;; Lonely start characters
154 (flet ((lsc (first last
)
155 (apply #'utf8-decode-tests
156 (loop for i from first to last
157 nconc
(list i
32) into bytes
158 nconc
(list #\? #\Space
) into chars
159 finally
(return (list bytes
160 (coerce chars
'string
)))))
161 (apply #'utf8-decode-tests
162 (loop for i from first to last
164 collect
#\? into chars
165 finally
(return (list bytes
166 (coerce chars
'string
)))))))
167 (lsc #xc0
#xdf
) ; 2-byte sequence start chars
168 (lsc #xe0
#xef
) ; 3-byte
169 (lsc #xf0
#xf7
) ; 4-byte
170 (lsc #xf8
#xfb
) ; 5-byte
171 (lsc #xfc
#xfd
)) ; 6-byte
173 ;; Otherwise incomplete sequences (last continuation byte missing)
174 (utf8-decode-tests #0=#(#xc0
) "?")
175 (utf8-decode-tests #1=#(#xe0
#x80
) "?")
176 (utf8-decode-tests #2=#(#xf0
#x80
#x80
) "?")
177 (utf8-decode-tests #3=#(#xf8
#x80
#x80
#x80
) "?")
178 (utf8-decode-tests #4=#(#xfc
#x80
#x80
#x80
#x80
) "?")
179 (utf8-decode-tests #5=#(#xdf
) "?")
180 (utf8-decode-tests #6=#(#xef
#xbf
) "?")
181 (utf8-decode-tests #7=#(#xf7
#xbf
#xbf
) "?")
182 (utf8-decode-tests #8=#(#xfb
#xbf
#xbf
#xbf
) "?")
183 (utf8-decode-tests #9=#(#xfd
#xbf
#xbf
#xbf
#xbf
) "?")
185 ;; All ten previous tests concatenated
186 (utf8-decode-tests (concatenate 'vector
#0# #1# #2# #3# #4# #5# #6# #7# #8# #9#)
189 ;; Random impossible bytes
190 (utf8-decode-tests #(#xfe
) "?")
191 (utf8-decode-tests #(#xff
) "?")
192 (utf8-decode-tests #(#xfe
#xfe
#xff
#xff
) "????")
194 ;; Overlong sequences - /
195 (utf8-decode-tests #(#xc0
#xaf
) "?")
196 (utf8-decode-tests #(#xe0
#x80
#xaf
) "?")
197 (utf8-decode-tests #(#xf0
#x80
#x80
#xaf
) "?")
198 (utf8-decode-tests #(#xf8
#x80
#x80
#x80
#xaf
) "?")
199 (utf8-decode-tests #(#xfc
#x80
#x80
#x80
#x80
#xaf
) "?")
201 ;; Overlong sequences - #\Rubout
202 (utf8-decode-tests #(#xc1
#xbf
) "?")
203 (utf8-decode-tests #(#xe0
#x9f
#xbf
) "?")
204 (utf8-decode-tests #(#xf0
#x8f
#xbf
#xbf
) "?")
205 (utf8-decode-tests #(#xf8
#x87
#xbf
#xbf
#xbf
) "?")
206 (utf8-decode-tests #(#xfc
#x83
#xbf
#xbf
#xbf
#xbf
) "?")
208 ;; Overlong sequences - #\Null
209 (utf8-decode-tests #(#xc0
#x80
) "?")
210 (utf8-decode-tests #(#xe0
#x80
#x80
) "?")
211 (utf8-decode-tests #(#xf0
#x80
#x80
#x80
) "?")
212 (utf8-decode-tests #(#xf8
#x80
#x80
#x80
#x80
) "?")
213 (utf8-decode-tests #(#xfc
#x80
#x80
#x80
#x80
#x80
) "?")
215 ;; Not testing surrogates & characters #xFFFE, #xFFFF; they're
216 ;; perfectly good sbcl chars even if they're not actually ISO 10646
217 ;; characters, and it's probably a good idea for s-to-o and o-to-s
218 ;; to be inverses of each other as far as possible.
223 ;;; regression test: STRING->UTF8 didn't properly handle a non-zero
225 (assert (equalp #(50) (string-to-octets "42" :start
1 :external-format
:utf-8
)))
227 ;;; STRING->UTF8 should cope with NIL strings if a null range is required
228 (assert (equalp #() (string-to-octets "" :external-format
:utf-8
)))
229 (assert (equalp #() (string-to-octets (make-array 0 :element-type nil
)
230 :external-format
:utf-8
)))
231 (assert (equalp #() (string-to-octets (make-array 5 :element-type nil
)
232 :start
3 :end
3 :external-format
:utf-8
)))
234 ;;; whoops: the iso-8859-2 format referred to an undefined symbol.
236 (assert (equalp #(251) (string-to-octets (string (code-char 369))
237 :external-format
:latin-2
)))