1.0.27.46: Fix build on systems with "src" in the path.
[sbcl/tcr.git] / src / code / octets.lisp
blob4f725a7300aef85cde047353e3e663ce1d43aab5
1 ;;;; code for string to octet conversion
3 ;;;; This software is part of the SBCL system. See the README file for
4 ;;;; more information.
5 ;;;;
6 ;;;; This software is derived from the CMU CL system, which was
7 ;;;; written at Carnegie Mellon University and released into the
8 ;;;; public domain. The software is in the public domain and is
9 ;;;; provided with absolutely no warranty. See the COPYING and CREDITS
10 ;;;; files for more information.
12 ;;; FIXME: The latin9 stuff is currently #!+sb-unicode, because I
13 ;;; don't like the idea of trying to do CODE-CHAR #x<big>. Is that a
14 ;;; justified fear? Can we arrange that it's caught and converted to
15 ;;; a decoding error error? Or should we just give up on non-Unicode
16 ;;; builds?
18 (in-package "SB!IMPL")
20 ;;; FIXME: don't we have this somewhere else?
21 (deftype array-range ()
22 "A number that can represent an index into a vector, including
23 one-past-the-end"
24 '(integer 0 #.sb!xc:array-dimension-limit))
26 ;;;; conditions
28 ;;; encoding condition
30 (define-condition octets-encoding-error (character-encoding-error)
31 ((string :initarg :string :reader octets-encoding-error-string)
32 (position :initarg :position :reader octets-encoding-error-position)
33 (external-format :initarg :external-format
34 :reader octets-encoding-error-external-format))
35 (:report (lambda (c s)
36 (format s "Unable to encode character ~A as ~S."
37 (char-code (char (octets-encoding-error-string c)
38 (octets-encoding-error-position c)))
39 (octets-encoding-error-external-format c)))))
41 (defun read-replacement-character ()
42 (format *query-io*
43 "Replacement byte, bytes, character, or string (evaluated): ")
44 (finish-output *query-io*)
45 (list (eval (read *query-io*))))
47 (defun encoding-error (external-format string pos)
48 (restart-case
49 (error 'octets-encoding-error
50 :external-format external-format
51 :string string
52 :position pos)
53 (use-value (replacement)
54 :report "Supply a set of bytes to use in place of the invalid one."
55 :interactive read-replacement-character
56 (typecase replacement
57 ((unsigned-byte 8)
58 (make-array 1 :element-type '(unsigned-byte 8) :initial-element replacement))
59 (character
60 (string-to-octets (string replacement)
61 :external-format external-format))
62 (string
63 (string-to-octets replacement
64 :external-format external-format))
66 (coerce replacement '(simple-array (unsigned-byte 8) (*))))))))
68 ;;; decoding condition
70 ;;; for UTF8, the specific condition signalled will be a generalized
71 ;;; instance of one of the following:
72 ;;;
73 ;;; end-of-input-in-character
74 ;;; character-out-of-range
75 ;;; invalid-utf8-starter-byte
76 ;;; invalid-utf8-continuation-byte
77 ;;; overlong-utf8-sequence
78 ;;;
79 ;;; Of these, the only one truly likely to be of interest to calling
80 ;;; code is end-of-input-in-character (in which case it's likely to
81 ;;; want to make a note of octet-decoding-error-start, supply "" as a
82 ;;; replacement string, and then move that last chunk of bytes to the
83 ;;; beginning of its buffer for the next go round) but they're all
84 ;;; provided on the off chance they're of interest. The next most
85 ;;; likely interesting option is overlong-utf8-sequence -- the
86 ;;; application, if it cares to, can decode this itself (taking care
87 ;;; to ensure that the result isn't out of range of CHAR-CODE-LIMIT)
88 ;;; and return that result. This library doesn't provide support for
89 ;;; that as a conforming UTF-8-using program is supposed to treat it
90 ;;; as an error.
92 (define-condition octet-decoding-error (character-decoding-error)
93 ((array :initarg :array :accessor octet-decoding-error-array)
94 (start :initarg :start :accessor octet-decoding-error-start)
95 (end :initarg :end :accessor octet-decoding-error-end)
96 (position :initarg :pos :accessor octet-decoding-bad-byte-position)
97 (external-format :initarg :external-format
98 :accessor octet-decoding-error-external-format))
99 (:report
100 (lambda (condition stream)
101 (format stream "Illegal ~S character starting at byte position ~D."
102 (octet-decoding-error-external-format condition)
103 (octet-decoding-error-start condition)))))
105 (define-condition end-of-input-in-character (octet-decoding-error) ())
106 (define-condition character-out-of-range (octet-decoding-error) ())
107 (define-condition invalid-utf8-starter-byte (octet-decoding-error) ())
108 (define-condition invalid-utf8-continuation-byte (octet-decoding-error) ())
109 (define-condition overlong-utf8-sequence (octet-decoding-error) ())
111 (define-condition malformed-ascii (octet-decoding-error) ())
113 (defun read-replacement-string ()
114 (format *query-io* "Enter a replacement string designator (evaluated): ")
115 (finish-output *query-io*)
116 (list (eval (read *query-io*))))
118 (defun decoding-error (array start end external-format reason pos)
119 (restart-case
120 (error reason
121 :external-format external-format
122 :array array
123 :start start
124 :end end
125 :pos pos)
126 (use-value (s)
127 :report "Supply a replacement string designator."
128 :interactive read-replacement-string
129 (string s))))
131 ;;; Utilities used in both to-string and to-octet conversions
133 (defmacro instantiate-octets-definition (definer)
134 `(progn
135 (,definer aref (simple-array (unsigned-byte 8) (*)))
136 (,definer sap-ref-8 system-area-pointer)))
138 ;;; FIXME: find out why the comment about SYMBOLICATE below is true
139 ;;; and fix it, or else replace with SYMBOLICATE.
141 ;;; FIXME: this is cute, but is going to prevent greps for def.*<name>
142 ;;; from working for (defun ,(make-od-name ...) ...)
143 (eval-when (:compile-toplevel :load-toplevel :execute)
144 (defun make-od-name (sym1 sym2)
145 ;; "MAKE-NAME" is too generic, but this doesn't do quite what
146 ;; SYMBOLICATE does; MAKE-OD-NAME ("octets definition") it is
147 ;; then.
148 (intern (concatenate 'string (symbol-name sym1) "-" (symbol-name sym2))
149 (symbol-package sym1))))
151 ;;;; to-octets conversions
153 ;;; to latin (including ascii)
155 ;;; Converting bytes to character codes is easy: just use a 256-element
156 ;;; lookup table that maps each possible byte to its corresponding
157 ;;; character code.
159 ;;; Converting character codes to bytes is a little harder, since the
160 ;;; codes may be spare (e.g. we use codes 0-127, 3490, and 4598). The
161 ;;; previous version of this macro utilized a gigantic CASE expression
162 ;;; to do the hard work, with the result that the code was huge (since
163 ;;; SBCL's then-current compilation strategy for CASE expressions was
164 ;;; (and still is) converting CASE into COND into if-the-elses--which is
165 ;;; also inefficient unless your code happens to occur very early in the
166 ;;; chain.
168 ;;; The current strategy is to build a table:
170 ;;; [ ... code_1 byte_1 code_2 byte_2 ... code_n byte_n ... ]
172 ;;; such that the codes are sorted in order from lowest to highest. We
173 ;;; can then binary search the table to discover the appropriate byte
174 ;;; for a character code. We also implement an optimization: all unibyte
175 ;;; mappings do not remap ASCII (0-127) and some do not remap part of
176 ;;; the range beyond character code 127. So we check to see if the
177 ;;; character code falls into that range first (a quick check, since
178 ;;; character codes are guaranteed to be positive) and then do the binary
179 ;;; search if not. This optimization also enables us to cut down on the
180 ;;; size of our lookup table.
181 (defmacro define-unibyte-mapper (byte-char-name code-byte-name &rest exceptions)
182 (let* (;; Build a list of (CODE BYTE) pairs
183 (pairs (loop for byte below 256
184 for code = (let ((exception (cdr (assoc byte exceptions))))
185 (cond
186 ((car exception) (car exception))
187 ((null exception) byte)
188 (t nil)))
189 when code collect (list code byte) into elements
190 finally (return elements)))
191 ;; Find the smallest character code such that the corresponding
192 ;; byte is != to the code.
193 (lowest-non-equivalent-code (position-if-not #'(lambda (pair)
194 (apply #'= pair))
195 pairs))
196 ;; Sort them for our lookup table.
197 (sorted-pairs (sort (subseq pairs lowest-non-equivalent-code)
198 #'< :key #'car))
199 ;; Create the lookup table.
200 (sorted-lookup-table
201 (reduce #'append sorted-pairs :from-end t :initial-value nil)))
202 `(progn
203 ; Can't inline it with a non-null lexical environment anyway.
204 ;(declaim (inline ,byte-char-name))
205 (let ((byte-to-code-table
206 ,(make-array 256 :element-type t #+nil 'char-code
207 :initial-contents (loop for byte below 256
208 collect
209 (let ((exception (cadr (assoc byte exceptions))))
210 (if exception
211 exception
212 byte)))))
213 (code-to-byte-table
214 ,(make-array (length sorted-lookup-table)
215 :initial-contents sorted-lookup-table)))
216 (defun ,byte-char-name (byte)
217 (declare (optimize speed (safety 0))
218 (type (unsigned-byte 8) byte))
219 (aref byte-to-code-table byte))
220 (defun ,code-byte-name (code)
221 (declare (optimize speed (safety 0))
222 (type char-code code))
223 (if (< code ,lowest-non-equivalent-code)
224 code
225 ;; We could toss in some TRULY-THEs if we really needed to
226 ;; make this faster...
227 (loop with low = 0
228 with high = (- (length code-to-byte-table) 2)
229 while (< low high)
230 do (let ((mid (logandc2 (truncate (+ low high 2) 2) 1)))
231 (if (< code (aref code-to-byte-table mid))
232 (setf high (- mid 2))
233 (setf low mid)))
234 finally (return (if (eql code (aref code-to-byte-table low))
235 (aref code-to-byte-table (1+ low))
236 nil)))))))))
238 #!+sb-unicode
239 (define-unibyte-mapper
240 latin9->code-mapper
241 code->latin9-mapper
242 (#xA4 #x20AC)
243 (#xA6 #x0160)
244 (#xA8 #x0161)
245 (#xB4 #x017D)
246 (#xB8 #x017E)
247 (#xBC #x0152)
248 (#xBD #x0153)
249 (#xBE #x0178))
251 (declaim (inline get-latin-bytes))
252 (defun get-latin-bytes (mapper external-format string pos)
253 (let ((code (funcall mapper (char-code (char string pos)))))
254 (declare (type (or null char-code) code))
255 (values (cond
256 ((and code (< code 256)) code)
258 (encoding-error external-format string pos)))
259 1)))
261 (declaim (inline code->ascii-mapper))
262 (defun code->ascii-mapper (code)
263 (declare (optimize speed (safety 0))
264 (type char-code code))
265 (if (> code 127)
267 code))
269 (declaim (inline get-ascii-bytes))
270 (defun get-ascii-bytes (string pos)
271 (declare (optimize speed (safety 0))
272 (type simple-string string)
273 (type array-range pos))
274 (get-latin-bytes #'code->ascii-mapper :ascii string pos))
276 (declaim (inline get-latin1-bytes))
277 (defun get-latin1-bytes (string pos)
278 (declare (optimize speed (safety 0))
279 (type simple-string string)
280 (type array-range pos))
281 (get-latin-bytes #'identity :latin-1 string pos))
283 #!+sb-unicode
284 (progn
285 (declaim (inline get-latin9-bytes))
286 (defun get-latin9-bytes (string pos)
287 (declare (optimize speed (safety 0))
288 (type simple-string string)
289 (type array-range pos))
290 (get-latin-bytes #'code->latin9-mapper :latin-9 string pos)))
292 (declaim (inline string->latin%))
293 (defun string->latin% (string sstart send get-bytes null-padding)
294 (declare (optimize speed)
295 (type simple-string string)
296 (type index sstart send)
297 (type (integer 0 1) null-padding)
298 (type function get-bytes))
299 ;; The latin encodings are all unibyte encodings, so just directly
300 ;; compute the number of octets we're going to generate.
301 (let ((octets (make-array (+ (- send sstart) null-padding)
302 ;; This takes care of any null padding the
303 ;; caller requests.
304 :initial-element 0
305 :element-type '(unsigned-byte 8)))
306 (index 0)
307 (error-position 0))
308 (tagbody
309 :no-error
310 (loop for pos of-type index from sstart below send
311 do (let ((byte (funcall get-bytes string pos)))
312 (typecase byte
313 ((unsigned-byte 8)
314 (locally (declare (optimize (sb!c::insert-array-bounds-checks 0)))
315 (setf (aref octets index) byte)))
316 ((simple-array (unsigned-byte 8) (*))
317 ;; KLUDGE: We ran into encoding errors. Bail and do
318 ;; things the slow way (does anybody actually use this
319 ;; functionality besides our own test suite?).
320 (setf error-position pos)
321 (go :error)))
322 (incf index))
323 finally (return-from string->latin% octets))
324 :error
325 ;; We have encoded INDEX octets so far and we ran into an encoding
326 ;; error at ERROR-POSITION.
327 (let ((new-octets (make-array (* index 2)
328 :element-type '(unsigned-byte 8)
329 :adjustable t :fill-pointer index)))
330 (replace new-octets octets)
331 (loop for pos of-type index from error-position below send
332 do (let ((thing (funcall get-bytes string pos)))
333 (typecase thing
334 ((unsigned-byte 8)
335 (vector-push-extend thing new-octets))
336 ((simple-array (unsigned-byte 8) (*))
337 (dotimes (i (length thing))
338 (vector-push-extend (aref thing i) new-octets)))))
339 finally (return-from string->latin%
340 (progn
341 (unless (zerop null-padding)
342 (vector-push-extend 0 new-octets))
343 (copy-seq new-octets))))))))
345 (defun string->ascii (string sstart send null-padding)
346 (declare (optimize speed (safety 0))
347 (type simple-string string)
348 (type array-range sstart send))
349 (values (string->latin% string sstart send #'get-ascii-bytes null-padding)))
351 (defun string->latin1 (string sstart send null-padding)
352 (declare (optimize speed (safety 0))
353 (type simple-string string)
354 (type array-range sstart send))
355 (values (string->latin% string sstart send #'get-latin1-bytes null-padding)))
357 #!+sb-unicode
358 (defun string->latin9 (string sstart send null-padding)
359 (declare (optimize speed (safety 0))
360 (type simple-string string)
361 (type array-range sstart send))
362 (values (string->latin% string sstart send #'get-latin9-bytes null-padding)))
364 ;;; to utf8
366 (declaim (inline char-len-as-utf8))
367 (defun char-len-as-utf8 (code)
368 (declare (optimize speed (safety 0))
369 (type (integer 0 (#.sb!xc:char-code-limit)) code))
370 (cond ((< code 0) (bug "can't happen"))
371 ((< code #x80) 1)
372 ((< code #x800) 2)
373 ((< code #x10000) 3)
374 ((< code #x110000) 4)
375 (t (bug "can't happen"))))
377 (defun string->utf8 (string sstart send null-padding)
378 (declare (optimize (speed 3) (safety 0))
379 (type simple-string string)
380 (type (integer 0 1) null-padding)
381 (type array-range sstart send))
382 (macrolet ((ascii-bash ()
383 '(let ((array (make-array (+ null-padding (- send sstart))
384 :element-type '(unsigned-byte 8))))
385 (loop for i from 0
386 and j from sstart below send
387 do (setf (aref array i) (char-code (char string j))))
388 array)))
389 (etypecase string
390 ((simple-array character (*))
391 (let ((utf8-length 0))
392 ;; Since it has to fit in a vector, it must be a fixnum!
393 (declare (type (and unsigned-byte fixnum) utf8-length))
394 (loop for i of-type index from sstart below send
395 do (incf utf8-length (char-len-as-utf8 (char-code (char string i)))))
396 (if (= utf8-length (- send sstart))
397 (ascii-bash)
398 (let ((array (make-array (+ null-padding utf8-length)
399 :element-type '(unsigned-byte 8)))
400 (index 0))
401 (declare (type index index))
402 (flet ((add-byte (b)
403 (setf (aref array index) b)
404 (incf index)))
405 (declare (inline add-byte))
406 (loop for i of-type index from sstart below send
407 do (let ((code (char-code (char string i))))
408 (case (char-len-as-utf8 code)
410 (add-byte code))
412 (add-byte (logior #b11000000 (ldb (byte 5 6) code)))
413 (add-byte (logior #b10000000 (ldb (byte 6 0) code))))
415 (add-byte (logior #b11100000 (ldb (byte 4 12) code)))
416 (add-byte (logior #b10000000 (ldb (byte 6 6) code)))
417 (add-byte (logior #b10000000 (ldb (byte 6 0) code))))
419 (add-byte (logior #b11110000 (ldb (byte 3 18) code)))
420 (add-byte (logior #b10000000 (ldb (byte 6 12) code)))
421 (add-byte (logior #b10000000 (ldb (byte 6 6) code)))
422 (add-byte (logior #b10000000 (ldb (byte 6 0) code))))))
423 finally (return array)))))))
424 #!+sb-unicode
425 ((simple-array base-char (*))
426 ;; On unicode builds BASE-STRINGs are limited to ASCII range, so we can take
427 ;; a fast path -- and get benefit of the element type information. On non-unicode
428 ;; build BASE-CHAR == CHARACTER.
429 (ascii-bash))
430 ((simple-array nil (*))
431 ;; Just get the error...
432 (aref string sstart)))))
434 ;;;; to-string conversions
436 ;;; from latin (including ascii)
438 (defmacro define-ascii->string (accessor type)
439 (let ((name (make-od-name 'ascii->string accessor)))
440 `(progn
441 (defun ,name (array astart aend)
442 (declare (optimize speed)
443 (type ,type array)
444 (type array-range astart aend))
445 ;; Since there is such a thing as a malformed ascii byte, a
446 ;; simple "make the string, fill it in" won't do.
447 (let ((string (make-array 0 :element-type 'character :fill-pointer 0 :adjustable t)))
448 (loop for apos from astart below aend
449 do (let* ((code (,accessor array apos))
450 (string-content
451 (if (< code 128)
452 (code-char code)
453 (decoding-error array apos (1+ apos) :ascii
454 'malformed-ascii apos))))
455 (if (characterp string-content)
456 (vector-push-extend string-content string)
457 (loop for c across string-content
458 do (vector-push-extend c string))))
459 finally (return (coerce string 'simple-string))))))))
460 (instantiate-octets-definition define-ascii->string)
462 (defmacro define-latin->string* (accessor type)
463 (let ((name (make-od-name 'latin->string* accessor)))
464 `(progn
465 (declaim (inline ,name))
466 (defun ,name (string sstart send array astart aend mapper)
467 (declare (optimize speed (safety 0))
468 (type simple-string string)
469 (type ,type array)
470 (type array-range sstart send astart aend)
471 (function mapper))
472 (loop for spos from sstart below send
473 for apos from astart below aend
474 do (setf (char string spos)
475 (code-char (funcall mapper (,accessor array apos))))
476 finally (return (values string spos apos)))))))
477 (instantiate-octets-definition define-latin->string*)
479 (defmacro define-latin1->string* (accessor type)
480 (declare (ignore type))
481 (let ((name (make-od-name 'latin1->string* accessor)))
482 `(progn
483 (defun ,name (string sstart send array astart aend)
484 (,(make-od-name 'latin->string* accessor) string sstart send array astart aend #'identity)))))
485 (instantiate-octets-definition define-latin1->string*)
487 #!+sb-unicode
488 (progn
489 (defmacro define-latin9->string* (accessor type)
490 (declare (ignore type))
491 (let ((name (make-od-name 'latin9->string* accessor)))
492 `(progn
493 (defun ,name (string sstart send array astart aend)
494 (,(make-od-name 'latin->string* accessor) string sstart send array astart aend #'latin9->code-mapper)))))
495 (instantiate-octets-definition define-latin9->string*))
497 (defmacro define-latin->string (accessor type)
498 (let ((name (make-od-name 'latin->string accessor)))
499 `(progn
500 (declaim (inline ,name))
501 (defun ,name (array astart aend mapper)
502 (declare (optimize speed (safety 0))
503 (type ,type array)
504 (type array-range astart aend)
505 (type function mapper))
506 (let ((length (the array-range (- aend astart))))
507 (values (,(make-od-name 'latin->string* accessor) (make-string length) 0 length
508 array astart aend
509 mapper)))))))
510 (instantiate-octets-definition define-latin->string)
512 (defmacro define-latin1->string (accessor type)
513 (declare (ignore type))
514 `(defun ,(make-od-name 'latin1->string accessor) (array astart aend)
515 (,(make-od-name 'latin->string accessor) array astart aend #'identity)))
516 (instantiate-octets-definition define-latin1->string)
518 #!+sb-unicode
519 (progn
520 (defmacro define-latin9->string (accessor type)
521 (declare (ignore type))
522 `(defun ,(make-od-name 'latin9->string accessor) (array astart aend)
523 (,(make-od-name 'latin->string accessor) array astart aend #'latin9->code-mapper)))
524 (instantiate-octets-definition define-latin9->string))
526 ;;; from utf8
528 (defmacro define-bytes-per-utf8-character (accessor type)
529 (let ((name (make-od-name 'bytes-per-utf8-character accessor)))
530 `(progn
531 ;;(declaim (inline ,name))
532 (let ((lexically-max
533 (string->utf8 (string (code-char ,(1- sb!xc:char-code-limit)))
534 0 1 0)))
535 (declare (type (simple-array (unsigned-byte 8) (#!+sb-unicode 4 #!-sb-unicode 2)) lexically-max))
536 (defun ,name (array pos end)
537 (declare (optimize speed (safety 0))
538 (type ,type array)
539 (type array-range pos end))
540 ;; returns the number of bytes consumed and nil if it's a
541 ;; valid character or the number of bytes consumed and a
542 ;; replacement string if it's not.
543 (let ((initial-byte (,accessor array pos))
544 (reject-reason nil)
545 (reject-position pos)
546 (remaining-bytes (- end pos)))
547 (declare (type array-range reject-position remaining-bytes))
548 (labels ((valid-utf8-starter-byte-p (b)
549 (declare (type (unsigned-byte 8) b))
550 (let ((ok (cond
551 ((zerop (logand b #b10000000)) 1)
552 ((= (logand b #b11100000) #b11000000)
554 ((= (logand b #b11110000) #b11100000)
556 ((= (logand b #b11111000) #b11110000)
558 ((= (logand b #b11111100) #b11111000)
560 ((= (logand b #b11111110) #b11111100)
563 nil))))
564 (unless ok
565 (setf reject-reason 'invalid-utf8-starter-byte))
566 ok))
567 (enough-bytes-left-p (x)
568 (let ((ok (> end (+ pos (1- x)))))
569 (unless ok
570 (setf reject-reason 'end-of-input-in-character))
571 ok))
572 (valid-secondary-p (x)
573 (let* ((idx (the array-range (+ pos x)))
574 (b (,accessor array idx))
575 (ok (= (logand b #b11000000) #b10000000)))
576 (unless ok
577 (setf reject-reason 'invalid-utf8-continuation-byte)
578 (setf reject-position idx))
579 ok))
580 (preliminary-ok-for-length (maybe-len len)
581 (and (eql maybe-len len)
582 ;; Has to be done in this order so that
583 ;; certain broken sequences (e.g., the
584 ;; two-byte sequence `"initial (length 3)"
585 ;; "non-continuation"' -- `#xef #x32')
586 ;; signal only part of that sequence as
587 ;; erroneous.
588 (loop for i from 1 below (min len remaining-bytes)
589 always (valid-secondary-p i))
590 (enough-bytes-left-p len)))
591 (overlong-chk (x y)
592 (let ((ok (or (/= initial-byte x)
593 (/= (logior (,accessor array (the array-range (+ pos 1)))
595 y))))
596 (unless ok
597 (setf reject-reason 'overlong-utf8-sequence))
598 ok))
599 (character-below-char-code-limit-p ()
600 ;; This is only called on a four-byte sequence
601 ;; (two in non-unicode builds) to ensure we
602 ;; don't go over SBCL's character limts.
603 (let ((ok (cond ((< (aref lexically-max 0) (,accessor array pos))
604 nil)
605 ((> (aref lexically-max 0) (,accessor array pos))
607 ((< (aref lexically-max 1) (,accessor array (+ pos 1)))
608 nil)
609 #!+sb-unicode
610 ((> (aref lexically-max 1) (,accessor array (+ pos 1)))
612 #!+sb-unicode
613 ((< (aref lexically-max 2) (,accessor array (+ pos 2)))
614 nil)
615 #!+sb-unicode
616 ((> (aref lexically-max 2) (,accessor array (+ pos 2)))
618 #!+sb-unicode
619 ((< (aref lexically-max 3) (,accessor array (+ pos 3)))
620 nil)
621 (t t))))
622 (unless ok
623 (setf reject-reason 'character-out-of-range))
624 ok)))
625 (declare (inline valid-utf8-starter-byte-p
626 enough-bytes-left-p
627 valid-secondary-p
628 preliminary-ok-for-length
629 overlong-chk))
630 (let ((maybe-len (valid-utf8-starter-byte-p initial-byte)))
631 (cond ((eql maybe-len 1)
632 (values 1 nil))
633 ((and (preliminary-ok-for-length maybe-len 2)
634 (overlong-chk #b11000000 #b10111111)
635 (overlong-chk #b11000001 #b10111111)
636 #!-sb-unicode (character-below-char-code-limit-p))
637 (values 2 nil))
638 ((and (preliminary-ok-for-length maybe-len 3)
639 (overlong-chk #b11100000 #b10011111)
640 #!-sb-unicode (not (setf reject-reason 'character-out-of-range)))
641 (values 3 nil))
642 ((and (preliminary-ok-for-length maybe-len 4)
643 (overlong-chk #b11110000 #b10001111)
644 #!-sb-unicode (not (setf reject-reason 'character-out-of-range))
645 (character-below-char-code-limit-p))
646 (values 4 nil))
647 ((and (preliminary-ok-for-length maybe-len 5)
648 (overlong-chk #b11111000 #b10000111)
649 (not (setf reject-reason 'character-out-of-range)))
650 (bug "can't happen"))
651 ((and (preliminary-ok-for-length maybe-len 6)
652 (overlong-chk #b11111100 #b10000011)
653 (not (setf reject-reason 'character-out-of-range)))
654 (bug "can't happen"))
656 (let* ((bad-end (ecase reject-reason
657 (invalid-utf8-starter-byte
658 (1+ pos))
659 (end-of-input-in-character
660 end)
661 (invalid-utf8-continuation-byte
662 reject-position)
663 ((overlong-utf8-sequence character-out-of-range)
664 (+ pos maybe-len))))
665 (bad-len (- bad-end pos)))
666 (declare (type array-range bad-end bad-len))
667 (let ((replacement (decoding-error array pos bad-end :utf-8 reject-reason reject-position)))
668 (values bad-len replacement)))))))))))))
669 (instantiate-octets-definition define-bytes-per-utf8-character)
671 (defmacro define-simple-get-utf8-char (accessor type)
672 (let ((name (make-od-name 'simple-get-utf8-char accessor)))
673 `(progn
674 (declaim (inline ,name))
675 (defun ,name (array pos bytes)
676 (declare (optimize speed (safety 0))
677 (type ,type array)
678 (type array-range pos)
679 (type (integer 1 4) bytes))
680 (flet ((cref (x)
681 (,accessor array (the array-range (+ pos x)))))
682 (declare (inline cref))
683 (code-char (ecase bytes
684 (1 (cref 0))
685 (2 (logior (ash (ldb (byte 5 0) (cref 0)) 6)
686 (ldb (byte 6 0) (cref 1))))
687 (3 (logior (ash (ldb (byte 4 0) (cref 0)) 12)
688 (ash (ldb (byte 6 0) (cref 1)) 6)
689 (ldb (byte 6 0) (cref 2))))
690 (4 (logior (ash (ldb (byte 3 0) (cref 0)) 18)
691 (ash (ldb (byte 6 0) (cref 1)) 12)
692 (ash (ldb (byte 6 0) (cref 2)) 6)
693 (ldb (byte 6 0) (cref 3)))))))))))
694 (instantiate-octets-definition define-simple-get-utf8-char)
696 (defmacro define-utf8->string (accessor type)
697 (let ((name (make-od-name 'utf8->string accessor)))
698 `(progn
699 (defun ,name (array astart aend)
700 (declare (optimize speed (safety 0))
701 (type ,type array)
702 (type array-range astart aend))
703 (let ((string (make-array 0 :adjustable t :fill-pointer 0 :element-type 'character)))
704 (loop with pos = astart
705 while (< pos aend)
706 do (multiple-value-bind (bytes invalid)
707 (,(make-od-name 'bytes-per-utf8-character accessor) array pos aend)
708 (declare (type (or null string) invalid))
709 (cond
710 ((null invalid)
711 (vector-push-extend (,(make-od-name 'simple-get-utf8-char accessor) array pos bytes) string))
713 (dotimes (i (length invalid))
714 (vector-push-extend (char invalid i) string))))
715 (incf pos bytes)))
716 (coerce string 'simple-string))))))
717 (instantiate-octets-definition define-utf8->string)
719 ;;;; external formats
721 (defvar *default-external-format* nil)
723 (defun default-external-format ()
724 (or *default-external-format*
725 ;; On non-unicode, use iso-8859-1 instead of detecting it from
726 ;; the locale settings. Defaulting to an external-format which
727 ;; can represent characters that the CHARACTER type can't
728 ;; doesn't seem very sensible.
729 #!-sb-unicode
730 (setf *default-external-format* :latin-1)
731 (let ((external-format #!-win32 (intern (or (sb!alien:alien-funcall
732 (extern-alien
733 "nl_langinfo"
734 (function (c-string :external-format :latin-1)
735 int))
736 sb!unix:codeset)
737 "LATIN-1")
738 "KEYWORD")
739 #!+win32 (sb!win32::ansi-codepage)))
740 (/show0 "cold-printing defaulted external-format:")
741 #!+sb-show
742 (cold-print external-format)
743 (/show0 "matching to known aliases")
744 (dolist (entry *external-formats*
745 (progn
746 ;;; FIXME! This WARN would try to do printing
747 ;;; before the streams have been initialized,
748 ;;; causing an infinite erroring loop. We should
749 ;;; either print it by calling to C, or delay the
750 ;;; warning until later. Since we're in freeze
751 ;;; right now, and the warning isn't really
752 ;;; essential, I'm doing what's least likely to
753 ;;; cause damage, and commenting it out. This
754 ;;; should be revisited after 0.9.17. -- JES,
755 ;;; 2006-09-21
756 #+nil
757 (warn "Invalid external-format ~A; using LATIN-1"
758 external-format)
759 (setf external-format :latin-1)))
760 (/show0 "cold printing known aliases:")
761 #!+sb-show
762 (dolist (alias (first entry)) (cold-print alias))
763 (/show0 "done cold-printing known aliases")
764 (when (member external-format (first entry))
765 (/show0 "matched")
766 (return)))
767 (/show0 "/default external format ok")
768 (setf *default-external-format* external-format))))
770 ;;; FIXME: OAOOM here vrt. DEFINE-EXTERNAL-FORMAT in fd-stream.lisp
771 (defparameter *external-format-functions* (make-hash-table))
773 (defun add-external-format-funs (format-names funs)
774 (dolist (name format-names (values))
775 (setf (gethash name *external-format-functions*) funs)))
777 (add-external-format-funs
778 '(:ascii :us-ascii :ansi_x3.4-1968 :iso-646 :iso-646-us :|646|)
779 '(ascii->string-aref string->ascii))
780 (add-external-format-funs
781 '(:latin1 :latin-1 :iso-8859-1 :iso8859-1)
782 '(latin1->string-aref string->latin1))
783 #!+sb-unicode
784 (add-external-format-funs
785 '(:latin9 :latin-9 :iso-8859-15 :iso8859-15)
786 '(latin9->string-aref string->latin9))
787 (add-external-format-funs '(:utf8 :utf-8) '(utf8->string-aref string->utf8))
789 (defun external-formats-funs (external-format)
790 (when (eql external-format :default)
791 (setf external-format (default-external-format)))
792 (or (gethash external-format *external-format-functions*)
793 (error "Unknown external-format ~S" external-format)))
795 ;;;; public interface
797 (defun octets-to-string (vector &key (external-format :default) (start 0) end)
798 (declare (type (vector (unsigned-byte 8)) vector))
799 (with-array-data ((vector vector)
800 (start start)
801 (end end)
802 :check-fill-pointer t)
803 (declare (type (simple-array (unsigned-byte 8) (*)) vector))
804 (funcall (symbol-function (first (external-formats-funs external-format)))
805 vector start end)))
807 (defun string-to-octets (string &key (external-format :default)
808 (start 0) end null-terminate)
809 (declare (type string string))
810 (with-array-data ((string string)
811 (start start)
812 (end end)
813 :check-fill-pointer t)
814 (declare (type simple-string string))
815 (funcall (symbol-function (second (external-formats-funs external-format)))
816 string start end (if null-terminate 1 0))))
818 #!+sb-unicode
819 (defvar +unicode-replacement-character+ (string (code-char #xfffd)))
820 #!+sb-unicode
821 (defun use-unicode-replacement-char (condition)
822 (use-value +unicode-replacement-character+ condition))
824 ;;; Utilities that maybe should be exported
826 #!+sb-unicode
827 (defmacro with-standard-replacement-character (&body body)
828 `(handler-bind ((octet-encoding-error #'use-unicode-replacement-char))
829 ,@body))
831 (defmacro with-default-decoding-replacement ((c) &body body)
832 (let ((cname (gensym)))
833 `(let ((,cname ,c))
834 (handler-bind
835 ((octet-decoding-error (lambda (c)
836 (use-value ,cname c))))
837 ,@body))))