src/code/octets.lisp

   1 ;;;; code for string to octet conversion
   2
   3 ;;;; This software is part of the SBCL system. See the README file for
   4 ;;;; more information.
   5 ;;;;
   6 ;;;; This software is derived from the CMU CL system, which was
   7 ;;;; written at Carnegie Mellon University and released into the
   8 ;;;; public domain. The software is in the public domain and is
   9 ;;;; provided with absolutely no warranty. See the COPYING and CREDITS
  10 ;;;; files for more information.
  11
  12 ;;; FIXME: The latin9 stuff is currently #!+sb-unicode, because I
  13 ;;; don't like the idea of trying to do CODE-CHAR #x<big>.  Is that a
  14 ;;; justified fear?  Can we arrange that it's caught and converted to
  15 ;;; a decoding error error?  Or should we just give up on non-Unicode
  16 ;;; builds?
  17
  18 (in-package "SB!IMPL")
  19
  20 ;;; FIXME: don't we have this somewhere else?
  21 (deftype array-range ()
  22   "A number that can represent an index into a vector, including
  23 one-past-the-end"
  24   '(integer 0 #.sb!xc:array-dimension-limit))
  25 \f
  26 ;;;; conditions
  27
  28 ;;; encoding condition
  29
  30 (define-condition octets-encoding-error (character-encoding-error)
  31   ((string :initarg :string :reader octets-encoding-error-string)
  32    (position :initarg :position :reader octets-encoding-error-position)
  33    (external-format :initarg :external-format
  34                     :reader octets-encoding-error-external-format))
  35   (:report (lambda (c s)
  36              (format s "Unable to encode character ~A as ~S."
  37                      (char-code (char (octets-encoding-error-string c)
  38                                       (octets-encoding-error-position c)))
  39                      (octets-encoding-error-external-format c)))))
  40
  41 (defun read-replacement-character ()
  42   (format *query-io*
  43           "Replacement byte, bytes, character, or string (evaluated): ")
  44   (finish-output *query-io*)
  45   (list (eval (read *query-io*))))
  46
  47 (defun encoding-error (external-format string pos)
  48   (restart-case
  49       (error 'octets-encoding-error
  50              :external-format external-format
  51              :string string
  52              :position pos)
  53     (use-value (replacement)
  54       :report "Supply a set of bytes to use in place of the invalid one."
  55       :interactive read-replacement-character
  56       (typecase replacement
  57         ((unsigned-byte 8)
  58          (make-array 1 :element-type '(unsigned-byte 8) :initial-element replacement))
  59         (character
  60          (string-to-octets (string replacement)
  61                            :external-format external-format))
  62         (string
  63          (string-to-octets replacement
  64                            :external-format external-format))
  65         (t
  66          (coerce replacement '(simple-array (unsigned-byte 8) (*))))))))
  67
  68 ;;; decoding condition
  69
  70 ;;; for UTF8, the specific condition signalled will be a generalized
  71 ;;; instance of one of the following:
  72 ;;;
  73 ;;;   end-of-input-in-character
  74 ;;;   character-out-of-range
  75 ;;;   invalid-utf8-starter-byte
  76 ;;;   invalid-utf8-continuation-byte
  77 ;;;   overlong-utf8-sequence
  78 ;;;
  79 ;;; Of these, the only one truly likely to be of interest to calling
  80 ;;; code is end-of-input-in-character (in which case it's likely to
  81 ;;; want to make a note of octet-decoding-error-start, supply "" as a
  82 ;;; replacement string, and then move that last chunk of bytes to the
  83 ;;; beginning of its buffer for the next go round) but they're all
  84 ;;; provided on the off chance they're of interest.  The next most
  85 ;;; likely interesting option is overlong-utf8-sequence -- the
  86 ;;; application, if it cares to, can decode this itself (taking care
  87 ;;; to ensure that the result isn't out of range of CHAR-CODE-LIMIT)
  88 ;;; and return that result.  This library doesn't provide support for
  89 ;;; that as a conforming UTF-8-using program is supposed to treat it
  90 ;;; as an error.
  91
  92 (define-condition octet-decoding-error (character-decoding-error)
  93   ((array :initarg :array :accessor octet-decoding-error-array)
  94    (start :initarg :start :accessor octet-decoding-error-start)
  95    (end :initarg :end :accessor octet-decoding-error-end)
  96    (position :initarg :pos :accessor octet-decoding-bad-byte-position)
  97    (external-format :initarg :external-format
  98                     :accessor octet-decoding-error-external-format))
  99   (:report
 100    (lambda (condition stream)
 101      (format stream "Illegal ~S character starting at byte position ~D."
 102              (octet-decoding-error-external-format condition)
 103              (octet-decoding-error-start condition)))))
 104
 105 (define-condition end-of-input-in-character (octet-decoding-error) ())
 106 (define-condition character-out-of-range (octet-decoding-error) ())
 107 (define-condition invalid-utf8-starter-byte (octet-decoding-error) ())
 108 (define-condition invalid-utf8-continuation-byte (octet-decoding-error) ())
 109 (define-condition overlong-utf8-sequence (octet-decoding-error) ())
 110
 111 (define-condition malformed-ascii (octet-decoding-error) ())
 112
 113 (defun read-replacement-string ()
 114   (format *query-io* "Enter a replacement string designator (evaluated): ")
 115   (finish-output *query-io*)
 116   (list (eval (read *query-io*))))
 117
 118 (defun decoding-error (array start end external-format reason pos)
 119   (restart-case
 120       (error reason
 121              :external-format external-format
 122              :array array
 123              :start start
 124              :end end
 125              :pos pos)
 126     (use-value (s)
 127       :report "Supply a replacement string designator."
 128       :interactive read-replacement-string
 129       (string s))))
 130
 131 ;;; Utilities used in both to-string and to-octet conversions
 132
 133 (defmacro instantiate-octets-definition (definer)
 134   `(progn
 135     (,definer aref (simple-array (unsigned-byte 8) (*)))
 136     (,definer sap-ref-8 system-area-pointer)))
 137
 138 ;;; FIXME: find out why the comment about SYMBOLICATE below is true
 139 ;;; and fix it, or else replace with SYMBOLICATE.
 140 ;;;
 141 ;;; FIXME: this is cute, but is going to prevent greps for def.*<name>
 142 ;;; from working for (defun ,(make-od-name ...) ...)
 143 (eval-when (:compile-toplevel :load-toplevel :execute)
 144   (defun make-od-name (sym1 sym2)
 145     ;; "MAKE-NAME" is too generic, but this doesn't do quite what
 146     ;; SYMBOLICATE does; MAKE-OD-NAME ("octets definition") it is
 147     ;; then.
 148     (intern (concatenate 'string (symbol-name sym1) "-" (symbol-name sym2))
 149             (symbol-package sym1))))
 150 \f
 151 ;;;; to-octets conversions
 152
 153 ;;; to latin (including ascii)
 154
 155 ;;; Converting bytes to character codes is easy: just use a 256-element
 156 ;;; lookup table that maps each possible byte to its corresponding
 157 ;;; character code.
 158 ;;;
 159 ;;; Converting character codes to bytes is a little harder, since the
 160 ;;; codes may be spare (e.g. we use codes 0-127, 3490, and 4598).  The
 161 ;;; previous version of this macro utilized a gigantic CASE expression
 162 ;;; to do the hard work, with the result that the code was huge (since
 163 ;;; SBCL's then-current compilation strategy for CASE expressions was
 164 ;;; (and still is) converting CASE into COND into if-the-elses--which is
 165 ;;; also inefficient unless your code happens to occur very early in the
 166 ;;; chain.
 167 ;;;
 168 ;;; The current strategy is to build a table:
 169 ;;;
 170 ;;; [ ... code_1 byte_1 code_2 byte_2 ... code_n byte_n ... ]
 171 ;;;
 172 ;;; such that the codes are sorted in order from lowest to highest.  We
 173 ;;; can then binary search the table to discover the appropriate byte
 174 ;;; for a character code.  We also implement an optimization: all unibyte
 175 ;;; mappings do not remap ASCII (0-127) and some do not remap part of
 176 ;;; the range beyond character code 127.  So we check to see if the
 177 ;;; character code falls into that range first (a quick check, since
 178 ;;; character codes are guaranteed to be positive) and then do the binary
 179 ;;; search if not.  This optimization also enables us to cut down on the
 180 ;;; size of our lookup table.
 181 (defmacro define-unibyte-mapper (byte-char-name code-byte-name &rest exceptions)
 182   (let* (;; Build a list of (CODE BYTE) pairs
 183          (pairs (loop for byte below 256
 184                    for code = (let ((exception (cdr (assoc byte exceptions))))
 185                                 (cond
 186                                   ((car exception) (car exception))
 187                                   ((null exception) byte)
 188                                   (t nil)))
 189                    when code collect (list code byte) into elements
 190                    finally (return elements)))
 191          ;; Find the smallest character code such that the corresponding
 192          ;; byte is != to the code.
 193          (lowest-non-equivalent-code (position-if-not #'(lambda (pair)
 194                                                           (apply #'= pair))
 195                                                       pairs))
 196          ;; Sort them for our lookup table.
 197          (sorted-pairs (sort (subseq pairs lowest-non-equivalent-code)
 198                              #'< :key #'car))
 199          ;; Create the lookup table.
 200          (sorted-lookup-table
 201           (reduce #'append sorted-pairs :from-end t :initial-value nil)))
 202     `(progn
 203        ; Can't inline it with a non-null lexical environment anyway.
 204        ;(declaim (inline ,byte-char-name))
 205        (let ((byte-to-code-table
 206               ,(make-array 256 :element-type t #+nil 'char-code
 207                            :initial-contents (loop for byte below 256
 208                                                 collect
 209                                                 (let ((exception (cadr (assoc byte exceptions))))
 210                                                   (if exception
 211                                                       exception
 212                                                       byte)))))
 213              (code-to-byte-table
 214               ,(make-array (length sorted-lookup-table)
 215                            :initial-contents sorted-lookup-table)))
 216          (defun ,byte-char-name (byte)
 217            (declare (optimize speed (safety 0))
 218                     (type (unsigned-byte 8) byte))
 219            (aref byte-to-code-table byte))
 220          (defun ,code-byte-name (code)
 221            (declare (optimize speed (safety 0))
 222                     (type char-code code))
 223            (if (< code ,lowest-non-equivalent-code)
 224                code
 225                ;; We could toss in some TRULY-THEs if we really needed to
 226                ;; make this faster...
 227                (loop with low = 0
 228                   with high = (- (length code-to-byte-table) 2)
 229                   while (< low high)
 230                   do (let ((mid (logandc2 (truncate (+ low high 2) 2) 1)))
 231                        (if (< code (aref code-to-byte-table mid))
 232                            (setf high (- mid 2))
 233                            (setf low mid)))
 234                   finally (return (if (eql code (aref code-to-byte-table low))
 235                                       (aref code-to-byte-table (1+ low))
 236                                       nil)))))))))
 237
 238 #!+sb-unicode
 239 (define-unibyte-mapper
 240     latin9->code-mapper
 241     code->latin9-mapper
 242   (#xA4 #x20AC)
 243   (#xA6 #x0160)
 244   (#xA8 #x0161)
 245   (#xB4 #x017D)
 246   (#xB8 #x017E)
 247   (#xBC #x0152)
 248   (#xBD #x0153)
 249   (#xBE #x0178))
 250
 251 (declaim (inline get-latin-bytes))
 252 (defun get-latin-bytes (mapper external-format string pos)
 253   (let ((code (funcall mapper (char-code (char string pos)))))
 254     (declare (type (or null char-code) code))
 255     (values (cond
 256               ((and code (< code 256)) code)
 257               (t
 258                (encoding-error external-format string pos)))
 259             1)))
 260
 261 (declaim (inline code->ascii-mapper))
 262 (defun code->ascii-mapper (code)
 263   (declare (optimize speed (safety 0))
 264            (type char-code code))
 265   (if (> code 127)
 266       nil
 267       code))
 268
 269 (declaim (inline get-ascii-bytes))
 270 (defun get-ascii-bytes (string pos)
 271   (declare (optimize speed (safety 0))
 272            (type simple-string string)
 273            (type array-range pos))
 274   (get-latin-bytes #'code->ascii-mapper :ascii string pos))
 275
 276 (declaim (inline get-latin1-bytes))
 277 (defun get-latin1-bytes (string pos)
 278   (declare (optimize speed (safety 0))
 279            (type simple-string string)
 280            (type array-range pos))
 281   (get-latin-bytes #'identity :latin-1 string pos))
 282
 283 #!+sb-unicode
 284 (progn
 285   (declaim (inline get-latin9-bytes))
 286   (defun get-latin9-bytes (string pos)
 287     (declare (optimize speed (safety 0))
 288              (type simple-string string)
 289              (type array-range pos))
 290     (get-latin-bytes #'code->latin9-mapper :latin-9 string pos)))
 291
 292 (declaim (inline string->latin%))
 293 (defun string->latin% (string sstart send get-bytes null-padding)
 294   (declare (optimize speed)
 295            (type simple-string string)
 296            (type index sstart send)
 297            (type (integer 0 1) null-padding)
 298            (type function get-bytes))
 299   ;; The latin encodings are all unibyte encodings, so just directly
 300   ;; compute the number of octets we're going to generate.
 301   (let ((octets (make-array (+ (- send sstart) null-padding)
 302                             ;; This takes care of any null padding the
 303                             ;; caller requests.
 304                             :initial-element 0
 305                             :element-type '(unsigned-byte 8)))
 306         (index 0)
 307         (error-position 0))
 308     (tagbody
 309      :no-error
 310        (loop for pos of-type index from sstart below send
 311           do (let ((byte (funcall get-bytes string pos)))
 312                (typecase byte
 313                  ((unsigned-byte 8)
 314                   (locally (declare (optimize (sb!c::insert-array-bounds-checks 0)))
 315                     (setf (aref octets index) byte)))
 316                  ((simple-array (unsigned-byte 8) (*))
 317                   ;; KLUDGE: We ran into encoding errors.  Bail and do
 318                   ;; things the slow way (does anybody actually use this
 319                   ;; functionality besides our own test suite?).
 320                   (setf error-position pos)
 321                   (go :error)))
 322                (incf index))
 323           finally (return-from string->latin% octets))
 324      :error
 325        ;; We have encoded INDEX octets so far and we ran into an encoding
 326        ;; error at ERROR-POSITION.
 327        (let ((new-octets (make-array (* index 2)
 328                                      :element-type '(unsigned-byte 8)
 329                                      :adjustable t :fill-pointer index)))
 330          (replace new-octets octets)
 331          (loop for pos of-type index from error-position below send
 332             do (let ((thing (funcall get-bytes string pos)))
 333                  (typecase thing
 334                    ((unsigned-byte 8)
 335                     (vector-push-extend thing new-octets))
 336                    ((simple-array (unsigned-byte 8) (*))
 337                     (dotimes (i (length thing))
 338                       (vector-push-extend (aref thing i) new-octets)))))
 339             finally (return-from string->latin%
 340                       (progn
 341                         (unless (zerop null-padding)
 342                           (vector-push-extend 0 new-octets))
 343                         (copy-seq new-octets))))))))
 344
 345 (defun string->ascii (string sstart send null-padding)
 346   (declare (optimize speed (safety 0))
 347            (type simple-string string)
 348            (type array-range sstart send))
 349   (values (string->latin% string sstart send #'get-ascii-bytes null-padding)))
 350
 351 (defun string->latin1 (string sstart send null-padding)
 352   (declare (optimize speed (safety 0))
 353            (type simple-string string)
 354            (type array-range sstart send))
 355   (values (string->latin% string sstart send #'get-latin1-bytes null-padding)))
 356
 357 #!+sb-unicode
 358 (defun string->latin9 (string sstart send null-padding)
 359   (declare (optimize speed (safety 0))
 360            (type simple-string string)
 361            (type array-range sstart send))
 362   (values (string->latin% string sstart send #'get-latin9-bytes null-padding)))
 363
 364 ;;; to utf8
 365
 366 (declaim (inline char-len-as-utf8))
 367 (defun char-len-as-utf8 (code)
 368   (declare (optimize speed (safety 0))
 369            (type (integer 0 (#.sb!xc:char-code-limit)) code))
 370   (cond ((< code 0) (bug "can't happen"))
 371         ((< code #x80) 1)
 372         ((< code #x800) 2)
 373         ((< code #x10000) 3)
 374         ((< code #x110000) 4)
 375         (t (bug "can't happen"))))
 376
 377 (defun string->utf8 (string sstart send null-padding)
 378   (declare (optimize (speed 3) (safety 0))
 379            (type simple-string string)
 380            (type (integer 0 1) null-padding)
 381            (type array-range sstart send))
 382   (macrolet ((ascii-bash ()
 383                '(let ((array (make-array (+ null-padding (- send sstart))
 384                                          :element-type '(unsigned-byte 8))))
 385                  (loop for i from 0
 386                        and j from sstart below send
 387                        do (setf (aref array i) (char-code (char string j))))
 388                  array)))
 389     (etypecase string
 390       ((simple-array character (*))
 391        (let ((utf8-length 0))
 392          ;; Since it has to fit in a vector, it must be a fixnum!
 393          (declare (type (and unsigned-byte fixnum) utf8-length))
 394          (loop for i of-type index from sstart below send
 395                do (incf utf8-length (char-len-as-utf8 (char-code (char string i)))))
 396          (if (= utf8-length (- send sstart))
 397              (ascii-bash)
 398              (let ((array (make-array (+ null-padding utf8-length)
 399                                       :element-type '(unsigned-byte 8)))
 400                    (index 0))
 401                (declare (type index index))
 402                (flet ((add-byte (b)
 403                         (setf (aref array index) b)
 404                         (incf index)))
 405                  (declare (inline add-byte))
 406                  (loop for i of-type index from sstart below send
 407                        do (let ((code (char-code (char string i))))
 408                             (case (char-len-as-utf8 code)
 409                               (1
 410                                (add-byte code))
 411                               (2
 412                                (add-byte (logior #b11000000 (ldb (byte 5 6) code)))
 413                                (add-byte (logior #b10000000 (ldb (byte 6 0) code))))
 414                               (3
 415                                (add-byte (logior #b11100000 (ldb (byte 4 12) code)))
 416                                (add-byte (logior #b10000000 (ldb (byte 6 6) code)))
 417                                (add-byte (logior #b10000000 (ldb (byte 6 0) code))))
 418                               (4
 419                                (add-byte (logior #b11110000 (ldb (byte 3 18) code)))
 420                                (add-byte (logior #b10000000 (ldb (byte 6 12) code)))
 421                                (add-byte (logior #b10000000 (ldb (byte 6 6) code)))
 422                                (add-byte (logior #b10000000 (ldb (byte 6 0) code))))))
 423                        finally (return array)))))))
 424       #!+sb-unicode
 425       ((simple-array base-char (*))
 426        ;; On unicode builds BASE-STRINGs are limited to ASCII range, so we can take
 427        ;; a fast path -- and get benefit of the element type information. On non-unicode
 428        ;; build BASE-CHAR == CHARACTER.
 429        (ascii-bash))
 430       ((simple-array nil (*))
 431        ;; Just get the error...
 432        (aref string sstart)))))
 433 \f
 434 ;;;; to-string conversions
 435
 436 ;;; from latin (including ascii)
 437
 438 (defmacro define-ascii->string (accessor type)
 439   (let ((name (make-od-name 'ascii->string accessor)))
 440     `(progn
 441       (defun ,name (array astart aend)
 442         (declare (optimize speed)
 443                  (type ,type array)
 444                  (type array-range astart aend))
 445         ;; Since there is such a thing as a malformed ascii byte, a
 446         ;; simple "make the string, fill it in" won't do.
 447         (let ((string (make-array 0 :element-type 'character :fill-pointer 0 :adjustable t)))
 448           (loop for apos from astart below aend
 449                 do (let* ((code (,accessor array apos))
 450                           (string-content
 451                            (if (< code 128)
 452                                (code-char code)
 453                                (decoding-error array apos (1+ apos) :ascii
 454                                                'malformed-ascii apos))))
 455                      (if (characterp string-content)
 456                          (vector-push-extend string-content string)
 457                          (loop for c across string-content
 458                                do (vector-push-extend c string))))
 459                 finally (return (coerce string 'simple-string))))))))
 460 (instantiate-octets-definition define-ascii->string)
 461
 462 (defmacro define-latin->string* (accessor type)
 463   (let ((name (make-od-name 'latin->string* accessor)))
 464     `(progn
 465       (declaim (inline ,name))
 466       (defun ,name (string sstart send array astart aend mapper)
 467         (declare (optimize speed (safety 0))
 468                  (type simple-string string)
 469                  (type ,type array)
 470                  (type array-range sstart send astart aend)
 471                  (function mapper))
 472         (loop for spos from sstart below send
 473            for apos from astart below aend
 474            do (setf (char string spos)
 475                     (code-char (funcall mapper (,accessor array apos))))
 476            finally (return (values string spos apos)))))))
 477 (instantiate-octets-definition define-latin->string*)
 478
 479 (defmacro define-latin1->string* (accessor type)
 480   (declare (ignore type))
 481   (let ((name (make-od-name 'latin1->string* accessor)))
 482     `(progn
 483       (defun ,name (string sstart send array astart aend)
 484         (,(make-od-name 'latin->string* accessor) string sstart send array astart aend #'identity)))))
 485 (instantiate-octets-definition define-latin1->string*)
 486
 487 #!+sb-unicode
 488 (progn
 489   (defmacro define-latin9->string* (accessor type)
 490     (declare (ignore type))
 491     (let ((name (make-od-name 'latin9->string* accessor)))
 492       `(progn
 493         (defun ,name (string sstart send array astart aend)
 494           (,(make-od-name 'latin->string* accessor) string sstart send array astart aend #'latin9->code-mapper)))))
 495   (instantiate-octets-definition define-latin9->string*))
 496
 497 (defmacro define-latin->string (accessor type)
 498   (let ((name (make-od-name 'latin->string accessor)))
 499     `(progn
 500       (declaim (inline ,name))
 501       (defun ,name (array astart aend mapper)
 502         (declare (optimize speed (safety 0))
 503                  (type ,type array)
 504                  (type array-range astart aend)
 505                  (type function mapper))
 506         (let ((length (the array-range (- aend astart))))
 507           (values (,(make-od-name 'latin->string* accessor) (make-string length) 0 length
 508                                                             array astart aend
 509                                                             mapper)))))))
 510 (instantiate-octets-definition define-latin->string)
 511
 512 (defmacro define-latin1->string (accessor type)
 513   (declare (ignore type))
 514   `(defun ,(make-od-name 'latin1->string accessor) (array astart aend)
 515     (,(make-od-name 'latin->string accessor) array astart aend #'identity)))
 516 (instantiate-octets-definition define-latin1->string)
 517
 518 #!+sb-unicode
 519 (progn
 520   (defmacro define-latin9->string (accessor type)
 521     (declare (ignore type))
 522     `(defun ,(make-od-name 'latin9->string accessor) (array astart aend)
 523       (,(make-od-name 'latin->string accessor) array astart aend #'latin9->code-mapper)))
 524   (instantiate-octets-definition define-latin9->string))
 525
 526 ;;; from utf8
 527
 528 (defmacro define-bytes-per-utf8-character (accessor type)
 529   (let ((name (make-od-name 'bytes-per-utf8-character accessor)))
 530     `(progn
 531       ;;(declaim (inline ,name))
 532       (let ((lexically-max
 533              (string->utf8 (string (code-char ,(1- sb!xc:char-code-limit)))
 534                            0 1 0)))
 535         (declare (type (simple-array (unsigned-byte 8) (#!+sb-unicode 4 #!-sb-unicode 2)) lexically-max))
 536         (defun ,name (array pos end)
 537           (declare (optimize speed (safety 0))
 538                    (type ,type array)
 539                    (type array-range pos end))
 540           ;; returns the number of bytes consumed and nil if it's a
 541           ;; valid character or the number of bytes consumed and a
 542           ;; replacement string if it's not.
 543           (let ((initial-byte (,accessor array pos))
 544                 (reject-reason nil)
 545                 (reject-position pos)
 546                 (remaining-bytes (- end pos)))
 547             (declare (type array-range reject-position remaining-bytes))
 548             (labels ((valid-utf8-starter-byte-p (b)
 549                        (declare (type (unsigned-byte 8) b))
 550                        (let ((ok (cond
 551                                    ((zerop (logand b #b10000000)) 1)
 552                                    ((= (logand b #b11100000) #b11000000)
 553                                     2)
 554                                    ((= (logand b #b11110000) #b11100000)
 555                                     3)
 556                                    ((= (logand b #b11111000) #b11110000)
 557                                     4)
 558                                    ((= (logand b #b11111100) #b11111000)
 559                                     5)
 560                                    ((= (logand b #b11111110) #b11111100)
 561                                     6)
 562                                    (t
 563                                     nil))))
 564                          (unless ok
 565                            (setf reject-reason 'invalid-utf8-starter-byte))
 566                          ok))
 567                      (enough-bytes-left-p (x)
 568                        (let ((ok (> end (+ pos (1- x)))))
 569                          (unless ok
 570                            (setf reject-reason 'end-of-input-in-character))
 571                          ok))
 572                      (valid-secondary-p (x)
 573                        (let* ((idx (the array-range (+ pos x)))
 574                               (b (,accessor array idx))
 575                               (ok (= (logand b #b11000000) #b10000000)))
 576                          (unless ok
 577                            (setf reject-reason 'invalid-utf8-continuation-byte)
 578                            (setf reject-position idx))
 579                          ok))
 580                      (preliminary-ok-for-length (maybe-len len)
 581                        (and (eql maybe-len len)
 582                             ;; Has to be done in this order so that
 583                             ;; certain broken sequences (e.g., the
 584                             ;; two-byte sequence `"initial (length 3)"
 585                             ;; "non-continuation"' -- `#xef #x32')
 586                             ;; signal only part of that sequence as
 587                             ;; erroneous.
 588                             (loop for i from 1 below (min len remaining-bytes)
 589                                   always (valid-secondary-p i))
 590                             (enough-bytes-left-p len)))
 591                      (overlong-chk (x y)
 592                        (let ((ok (or (/= initial-byte x)
 593                                      (/= (logior (,accessor array (the array-range (+ pos 1)))
 594                                                  y)
 595                                          y))))
 596                          (unless ok
 597                            (setf reject-reason 'overlong-utf8-sequence))
 598                          ok))
 599                      (character-below-char-code-limit-p ()
 600                        ;; This is only called on a four-byte sequence
 601                        ;; (two in non-unicode builds) to ensure we
 602                        ;; don't go over SBCL's character limts.
 603                        (let ((ok (cond ((< (aref lexically-max 0) (,accessor array pos))
 604                                         nil)
 605                                        ((> (aref lexically-max 0) (,accessor array pos))
 606                                         t)
 607                                        ((< (aref lexically-max 1) (,accessor array (+ pos 1)))
 608                                         nil)
 609                                        #!+sb-unicode
 610                                        ((> (aref lexically-max 1) (,accessor array (+ pos 1)))
 611                                         t)
 612                                        #!+sb-unicode
 613                                        ((< (aref lexically-max 2) (,accessor array (+ pos 2)))
 614                                         nil)
 615                                        #!+sb-unicode
 616                                        ((> (aref lexically-max 2) (,accessor array (+ pos 2)))
 617                                         t)
 618                                        #!+sb-unicode
 619                                        ((< (aref lexically-max 3) (,accessor array (+ pos 3)))
 620                                         nil)
 621                                        (t t))))
 622                          (unless ok
 623                            (setf reject-reason 'character-out-of-range))
 624                          ok)))
 625               (declare (inline valid-utf8-starter-byte-p
 626                                enough-bytes-left-p
 627                                valid-secondary-p
 628                                preliminary-ok-for-length
 629                                overlong-chk))
 630               (let ((maybe-len (valid-utf8-starter-byte-p initial-byte)))
 631                 (cond ((eql maybe-len 1)
 632                        (values 1 nil))
 633                       ((and (preliminary-ok-for-length maybe-len 2)
 634                             (overlong-chk #b11000000 #b10111111)
 635                             (overlong-chk #b11000001 #b10111111)
 636                             #!-sb-unicode (character-below-char-code-limit-p))
 637                        (values 2 nil))
 638                       ((and (preliminary-ok-for-length maybe-len 3)
 639                             (overlong-chk #b11100000 #b10011111)
 640                             #!-sb-unicode (not (setf reject-reason 'character-out-of-range)))
 641                        (values 3 nil))
 642                       ((and (preliminary-ok-for-length maybe-len 4)
 643                             (overlong-chk #b11110000 #b10001111)
 644                             #!-sb-unicode (not (setf reject-reason 'character-out-of-range))
 645                             (character-below-char-code-limit-p))
 646                        (values 4 nil))
 647                       ((and (preliminary-ok-for-length maybe-len 5)
 648                             (overlong-chk #b11111000 #b10000111)
 649                             (not (setf reject-reason 'character-out-of-range)))
 650                        (bug "can't happen"))
 651                       ((and (preliminary-ok-for-length maybe-len 6)
 652                             (overlong-chk #b11111100 #b10000011)
 653                             (not (setf reject-reason 'character-out-of-range)))
 654                        (bug "can't happen"))
 655                       (t
 656                        (let* ((bad-end (ecase reject-reason
 657                                          (invalid-utf8-starter-byte
 658                                           (1+ pos))
 659                                          (end-of-input-in-character
 660                                           end)
 661                                          (invalid-utf8-continuation-byte
 662                                           reject-position)
 663                                          ((overlong-utf8-sequence character-out-of-range)
 664                                           (+ pos maybe-len))))
 665                               (bad-len (- bad-end pos)))
 666                          (declare (type array-range bad-end bad-len))
 667                          (let ((replacement (decoding-error array pos bad-end :utf-8 reject-reason reject-position)))
 668                            (values bad-len replacement)))))))))))))
 669 (instantiate-octets-definition define-bytes-per-utf8-character)
 670
 671 (defmacro define-simple-get-utf8-char (accessor type)
 672   (let ((name (make-od-name 'simple-get-utf8-char accessor)))
 673     `(progn
 674       (declaim (inline ,name))
 675       (defun ,name (array pos bytes)
 676         (declare (optimize speed (safety 0))
 677                  (type ,type array)
 678                  (type array-range pos)
 679                  (type (integer 1 4) bytes))
 680         (flet ((cref (x)
 681                  (,accessor array (the array-range (+ pos x)))))
 682           (declare (inline cref))
 683           (code-char (ecase bytes
 684                        (1 (cref 0))
 685                        (2 (logior (ash (ldb (byte 5 0) (cref 0)) 6)
 686                                   (ldb (byte 6 0) (cref 1))))
 687                        (3 (logior (ash (ldb (byte 4 0) (cref 0)) 12)
 688                                   (ash (ldb (byte 6 0) (cref 1)) 6)
 689                                   (ldb (byte 6 0) (cref 2))))
 690                        (4 (logior (ash (ldb (byte 3 0) (cref 0)) 18)
 691                                   (ash (ldb (byte 6 0) (cref 1)) 12)
 692                                   (ash (ldb (byte 6 0) (cref 2)) 6)
 693                                   (ldb (byte 6 0) (cref 3)))))))))))
 694 (instantiate-octets-definition define-simple-get-utf8-char)
 695
 696 (defmacro define-utf8->string (accessor type)
 697   (let ((name (make-od-name 'utf8->string accessor)))
 698     `(progn
 699       (defun ,name (array astart aend)
 700         (declare (optimize speed (safety 0))
 701                  (type ,type array)
 702                  (type array-range astart aend))
 703         (let ((string (make-array 0 :adjustable t :fill-pointer 0 :element-type 'character)))
 704           (loop with pos = astart
 705                 while (< pos aend)
 706                 do (multiple-value-bind (bytes invalid)
 707                        (,(make-od-name 'bytes-per-utf8-character accessor) array pos aend)
 708                      (declare (type (or null string) invalid))
 709                      (cond
 710                        ((null invalid)
 711                         (vector-push-extend (,(make-od-name 'simple-get-utf8-char accessor) array pos bytes) string))
 712                        (t
 713                         (dotimes (i (length invalid))
 714                           (vector-push-extend (char invalid i) string))))
 715                      (incf pos bytes)))
 716           (coerce string 'simple-string))))))
 717 (instantiate-octets-definition define-utf8->string)
 718 \f
 719 ;;;; external formats
 720
 721 (defvar *default-external-format* nil)
 722
 723 (defun default-external-format ()
 724   (or *default-external-format*
 725       ;; On non-unicode, use iso-8859-1 instead of detecting it from
 726       ;; the locale settings. Defaulting to an external-format which
 727       ;; can represent characters that the CHARACTER type can't
 728       ;; doesn't seem very sensible.
 729       #!-sb-unicode
 730       (setf *default-external-format* :latin-1)
 731       (let ((external-format #!-win32 (intern (or (sb!alien:alien-funcall
 732                                                     (extern-alien
 733                                                       "nl_langinfo"
 734                                                       (function (c-string :external-format :latin-1)
 735                                                                 int))
 736                                                     sb!unix:codeset)
 737                                                   "LATIN-1")
 738                                               "KEYWORD")
 739                              #!+win32 (sb!win32::ansi-codepage)))
 740         (/show0 "cold-printing defaulted external-format:")
 741         #!+sb-show
 742         (cold-print external-format)
 743         (/show0 "matching to known aliases")
 744         (dolist (entry *external-formats*
 745                  (progn
 746                    ;;; FIXME! This WARN would try to do printing
 747                    ;;; before the streams have been initialized,
 748                    ;;; causing an infinite erroring loop. We should
 749                    ;;; either print it by calling to C, or delay the
 750                    ;;; warning until later. Since we're in freeze
 751                    ;;; right now, and the warning isn't really
 752                    ;;; essential, I'm doing what's least likely to
 753                    ;;; cause damage, and commenting it out. This
 754                    ;;; should be revisited after 0.9.17. -- JES,
 755                    ;;; 2006-09-21
 756                    #+nil
 757                    (warn "Invalid external-format ~A; using LATIN-1"
 758                          external-format)
 759                    (setf external-format :latin-1)))
 760           (/show0 "cold printing known aliases:")
 761           #!+sb-show
 762           (dolist (alias (first entry)) (cold-print alias))
 763           (/show0 "done cold-printing known aliases")
 764           (when (member external-format (first entry))
 765             (/show0 "matched")
 766             (return)))
 767         (/show0 "/default external format ok")
 768         (setf *default-external-format* external-format))))
 769
 770 ;;; FIXME: OAOOM here vrt. DEFINE-EXTERNAL-FORMAT in fd-stream.lisp
 771 (defparameter *external-format-functions* (make-hash-table))
 772
 773 (defun add-external-format-funs (format-names funs)
 774   (dolist (name format-names (values))
 775     (setf (gethash name *external-format-functions*) funs)))
 776
 777 (add-external-format-funs
 778  '(:ascii :us-ascii :ansi_x3.4-1968 :iso-646 :iso-646-us :|646|)
 779  '(ascii->string-aref string->ascii))
 780 (add-external-format-funs
 781  '(:latin1 :latin-1 :iso-8859-1 :iso8859-1)
 782  '(latin1->string-aref string->latin1))
 783 #!+sb-unicode
 784 (add-external-format-funs
 785  '(:latin9 :latin-9 :iso-8859-15 :iso8859-15)
 786  '(latin9->string-aref string->latin9))
 787 (add-external-format-funs '(:utf8 :utf-8) '(utf8->string-aref string->utf8))
 788
 789 (defun external-formats-funs (external-format)
 790   (when (eql external-format :default)
 791     (setf external-format (default-external-format)))
 792   (or (gethash external-format *external-format-functions*)
 793       (error "Unknown external-format ~S" external-format)))
 794 \f
 795 ;;;; public interface
 796
 797 (defun octets-to-string (vector &key (external-format :default) (start 0) end)
 798   (declare (type (vector (unsigned-byte 8)) vector))
 799   (with-array-data ((vector vector)
 800                     (start start)
 801                     (end end)
 802                     :check-fill-pointer t)
 803     (declare (type (simple-array (unsigned-byte 8) (*)) vector))
 804     (funcall (symbol-function (first (external-formats-funs external-format)))
 805              vector start end)))
 806
 807 (defun string-to-octets (string &key (external-format :default)
 808                          (start 0) end null-terminate)
 809   (declare (type string string))
 810   (with-array-data ((string string)
 811                     (start start)
 812                     (end end)
 813                     :check-fill-pointer t)
 814     (declare (type simple-string string))
 815     (funcall (symbol-function (second (external-formats-funs external-format)))
 816              string start end (if null-terminate 1 0))))
 817
 818 #!+sb-unicode
 819 (defvar +unicode-replacement-character+ (string (code-char #xfffd)))
 820 #!+sb-unicode
 821 (defun use-unicode-replacement-char (condition)
 822   (use-value +unicode-replacement-character+ condition))
 823
 824 ;;; Utilities that maybe should be exported
 825
 826 #!+sb-unicode
 827 (defmacro with-standard-replacement-character (&body body)
 828   `(handler-bind ((octet-encoding-error #'use-unicode-replacement-char))
 829     ,@body))
 830
 831 (defmacro with-default-decoding-replacement ((c) &body body)
 832   (let ((cname (gensym)))
 833   `(let ((,cname ,c))
 834     (handler-bind
 835         ((octet-decoding-error (lambda (c)
 836                                  (use-value ,cname c))))
 837       ,@body))))