src/code/target-char.lisp

   1 ;;;; character functions
   2
   3 ;;;; This software is part of the SBCL system. See the README file for
   4 ;;;; more information.
   5 ;;;;
   6 ;;;; This software is derived from the CMU CL system, which was
   7 ;;;; written at Carnegie Mellon University and released into the
   8 ;;;; public domain. The software is in the public domain and is
   9 ;;;; provided with absolutely no warranty. See the COPYING and CREDITS
  10 ;;;; files for more information.
  11
  12 (in-package "SB!IMPL")
  13
  14 ;;; We compile some trivial character operations via inline expansion.
  15 #!-sb-fluid
  16 (declaim (inline standard-char-p graphic-char-p alpha-char-p
  17                  alphanumericp))
  18 (declaim (maybe-inline upper-case-p lower-case-p both-case-p
  19                        digit-char-p))
  20
  21 (deftype char-code ()
  22   `(integer 0 (,sb!xc:char-code-limit)))
  23
  24 (defglobal **unicode-character-name-huffman-tree** ())
  25
  26 (declaim (inline pack-3-codepoints))
  27 (defun pack-3-codepoints (first &optional (second 0) (third 0))
  28   (declare (type (unsigned-byte 21) first second third))
  29   (sb!c::mask-signed-field 63 (logior first (ash second 21) (ash third 42))))
  30
  31 (macrolet ((frob ()
  32              (flet ((coerce-it (array)
  33                       (!coerce-to-specialized array '(unsigned-byte 8)))
  34                     (file (name type)
  35                       (let ((dir (sb-cold:prepend-genfile-path "output/")))
  36                         (make-pathname :directory (pathname-directory (merge-pathnames dir))
  37                                        :name name :type type)))
  38                     (read-ub8-vector (pathname)
  39                       (with-open-file (stream pathname
  40                                               :element-type '(unsigned-byte 8))
  41                         (let* ((length (file-length stream))
  42                                (array (make-array
  43                                        length :element-type '(unsigned-byte 8))))
  44                           (read-sequence array stream)
  45                           array)))
  46                     (init-global (name type &optional length)
  47                       `(progn
  48                          (defglobal ,name
  49                              ,(if (eql type 'hash-table)
  50                                   `(make-hash-table)
  51                                   `(make-array ,length :element-type ',type)))
  52                          (declaim (type ,(if (eql type 'hash-table)
  53                                              'hash-table
  54                                              `(simple-array ,type (,length))) ,name)))))
  55                (let ((misc-database (coerce-it (read-ub8-vector (file "ucdmisc" "dat"))))
  56                      (ucd-high-pages (coerce-it (read-ub8-vector (file "ucdhigh" "dat"))))
  57                      (ucd-low-pages (coerce-it (read-ub8-vector (file "ucdlow" "dat"))))
  58                      (decompositions (coerce-it (read-ub8-vector (file "decomp" "dat"))))
  59                      (primary-compositions (coerce-it (read-ub8-vector (file "comp" "dat"))))
  60                      (case-data (coerce-it (read-ub8-vector (file "case" "dat"))))
  61                      (case-pages (coerce-it (read-ub8-vector (file "casepages" "dat"))))
  62                      (collations (coerce-it (read-ub8-vector (file "collation" "dat")))))
  63                  `(progn
  64                     ;; KLUDGE: All temporary values, fixed up in cold-load
  65                     ,(init-global '**character-misc-database** '(unsigned-byte 8)
  66                                   (length misc-database))
  67                     ,(init-global '**character-high-pages** '(unsigned-byte 16)
  68                                   (/ (length ucd-high-pages) 2))
  69                     ,(init-global '**character-low-pages** '(unsigned-byte 16)
  70                                   (/ (length ucd-low-pages) 2))
  71                     ,(init-global '**character-decompositions** '(unsigned-byte 21)
  72                                   (/ (length decompositions) 3))
  73                     ,(init-global '**character-case-pages** '(unsigned-byte 8)
  74                                   (length case-pages))
  75                     ,(init-global '**character-primary-compositions** 'hash-table)
  76                     ,(init-global '**character-unicode-cases** t
  77                                   (* 64 (1+ (aref case-pages
  78                                                   (1- (length case-pages))))))
  79                     ,(init-global '**character-cases** '(unsigned-byte 32)
  80                                   (* 2 64 (1+ (aref case-pages
  81                                                     (1- (length case-pages))))))
  82                     ,(init-global '**character-collations** 'hash-table)
  83
  84                     (defun !character-database-cold-init ()
  85                       (flet ((make-ubn-vector (raw-bytes n)
  86                                (let ((new-array
  87                                        (make-array
  88                                         (/ (length raw-bytes) n)
  89                                         :element-type (list 'unsigned-byte (* 8 n)))))
  90                                  (loop for i from 0 below (length raw-bytes) by n
  91                                        for element = 0 do
  92                                        (loop for offset from 0 below n do
  93                                              (incf element
  94                                                    (ash (aref raw-bytes (+ i offset))
  95                                                         (* 8 (- n offset 1)))))
  96                                        (setf (aref new-array (/ i n)) element))
  97                                  new-array)))
  98                         (setf **character-misc-database**
  99                               ,misc-database
 100                               **character-high-pages**
 101                               (make-ubn-vector ,ucd-high-pages 2)
 102                               **character-low-pages**
 103                               (make-ubn-vector ,ucd-low-pages 2)
 104                               **character-case-pages**
 105                               ,(coerce-it case-pages)
 106                               **character-decompositions**
 107                               (make-ubn-vector ,decompositions 3))
 108
 109                         (setf **character-primary-compositions**
 110                               (let ((table (make-hash-table))
 111                                     (info (make-ubn-vector ,primary-compositions 3)))
 112                                 (dotimes (i (/ (length info) 3))
 113                                   (setf (gethash (dpb (aref info (* 3 i)) (byte 21 21)
 114                                                       (aref info (1+ (* 3 i))))
 115                                                  table)
 116                                         (aref info (+ (* 3 i) 2))))
 117                                 table))
 118
 119                         (let* ((unicode-table
 120                                  (make-array
 121                                   (* 64 (1+ (aref **character-case-pages**
 122                                                   (1- (length **character-case-pages**)))))))
 123                                (table (make-array
 124                                        (* 2 (length unicode-table))
 125                                        :element-type '(unsigned-byte 32)))
 126                                (info ,case-data)
 127                                (index 0)
 128                                (length (length info)))
 129                           (labels ((read-codepoint ()
 130                                      (let* ((b1 (aref info index))
 131                                             (b2 (aref info (incf index)))
 132                                             (b3 (aref info (incf index))))
 133                                        (incf index)
 134                                        (dpb b1 (byte 8 16)
 135                                             (dpb b2 (byte 8 8) b3))))
 136                                    (read-length-tagged ()
 137                                      (let ((len (aref info index)) ret)
 138                                        (incf index)
 139                                        (cond ((zerop len)
 140                                               (read-codepoint))
 141                                              (t
 142                                               (dotimes (i len)
 143                                                 (push (read-codepoint) ret))
 144                                               (nreverse ret))))))
 145                             (loop until (>= index length)
 146                                   for key = (read-codepoint)
 147                                   for upper = (read-length-tagged)
 148                                   for lower = (read-length-tagged)
 149                                   for page = (aref **character-case-pages** (ash key -6))
 150                                   for i = (+ (ash page 6) (ldb (byte 6 0) key))
 151                                   do
 152                                   (setf (aref unicode-table i) (cons upper lower))
 153                                   when
 154                                   (flet (#!+sb-unicode
 155                                          (both-case-p (code)
 156                                            (logbitp 7 (aref **character-misc-database**
 157                                                             (+ 5 (misc-index (code-char code)))))))
 158                                     (and (atom upper)
 159                                          (atom lower)
 160                                          ;; Some characters are only equal under unicode rules,
 161                                          ;; e.g. #\MICRO_SIGN and #\GREEK_CAPITAL_LETTER_MU
 162                                          #!+sb-unicode
 163                                          (both-case-p lower)
 164                                          #!+sb-unicode
 165                                          (both-case-p upper)))
 166                                   do
 167                                   (setf (aref table (* i 2)) lower
 168                                         (aref table (1+ (* i 2))) upper)))
 169                           (setf **character-unicode-cases** unicode-table)
 170                           (setf **character-cases** table))
 171
 172                         (setf **character-collations**
 173                               (let* ((table (make-hash-table))
 174                                      (index 0)
 175                                      (info (make-ubn-vector ,collations 4))
 176                                      (len (length info)))
 177                                 (loop while (< index len) do
 178                                       (let* ((entry-head (aref info index))
 179                                              (cp-length (ldb (byte 4 28) entry-head))
 180                                              (key-length (ldb (byte 5 23) entry-head))
 181                                              (key (make-array
 182                                                    key-length
 183                                                    :element-type '(unsigned-byte 32)))
 184                                              (codepoints nil))
 185                                         (assert (and (/= cp-length 0) (/= key-length 0)))
 186                                         (loop repeat cp-length do
 187                                               (push (dpb 0 (byte 10 22) (aref info index))
 188                                                     codepoints)
 189                                               (incf index))
 190                                         (setf codepoints (nreverse codepoints))
 191                                         (dotimes (i key-length)
 192                                           (setf (aref key i) (aref info index))
 193                                           (incf index))
 194                                         (setf (gethash
 195                                                (apply #'pack-3-codepoints codepoints)
 196                                                table) (logically-readonlyize key))))
 197                                 table))))
 198
 199                     ,(with-open-file
 200                          (stream (file "ucd-names" "lisp-expr"))
 201                        (with-open-file (u1-stream (file "ucd1-names" "lisp-expr"))
 202                          (flet ((convert-to-double-vector (vector &optional reversed)
 203                                   (let ((result (make-array (* (length vector) 2))))
 204                                     (loop for (code . name) across vector
 205                                           for i by 2
 206                                           do
 207                                           (when reversed
 208                                             (rotatef code name))
 209                                           (setf (aref result i) code
 210                                                 (aref result (1+ i)) name))
 211                                     result)))
 212                            (let ((names (make-hash-table))
 213                                  (u1-names (make-hash-table)))
 214                              (loop
 215                                for code-point = (read stream nil nil)
 216                                for char-name = (string-upcase (read stream nil nil))
 217                                while code-point
 218                                do (setf (gethash code-point names) char-name))
 219                              (loop
 220                                for code-point = (read u1-stream nil nil)
 221                                for char-name = (string-upcase (read u1-stream nil nil))
 222                                while code-point
 223                                do (setf (gethash code-point u1-names) char-name))
 224                              (let ((tree
 225                                      (make-huffman-tree
 226                                       (let (list)
 227                                         (maphash (lambda (code name)
 228                                                    (declare (ignore code))
 229                                                    (push name list))
 230                                                  names)
 231                                         (maphash (lambda (code u1-name)
 232                                                    (declare (ignore code))
 233                                                    (push u1-name list))
 234                                                  u1-names)
 235                                         list)))
 236                                    (code->name
 237                                      (make-array (hash-table-count names)
 238                                                  :fill-pointer 0))
 239                                    (name->code nil)
 240                                    (code->u1-name
 241                                      (make-array (hash-table-count u1-names)
 242                                                  :fill-pointer 0))
 243                                    (u1-name->code nil))
 244                                (maphash (lambda (code name)
 245                                           (vector-push
 246                                            (cons code (huffman-encode name tree))
 247                                            code->name))
 248                                         names)
 249                                (maphash (lambda (code name)
 250                                           (vector-push
 251                                            (cons code (huffman-encode name tree))
 252                                            code->u1-name))
 253                                         u1-names)
 254                                (setf name->code
 255                                      (sort (copy-seq code->name) #'< :key #'cdr)
 256                                      code->name
 257                                      (sort (copy-seq name->code) #'< :key #'car)
 258                                      u1-name->code
 259                                      (sort (copy-seq code->u1-name) #'< :key #'cdr)
 260                                      code->u1-name
 261                                      (sort (copy-seq u1-name->code) #'< :key #'car))
 262                                `(progn
 263                                   ,(init-global '**unicode-char-name-database** t
 264                                                 (* 2 (length code->name)))
 265                                   ,(init-global '**unicode-name-char-database** t
 266                                                 (* 2 (length name->code)))
 267                                   ,(init-global '**unicode-1-char-name-database** t
 268                                                 (* 2 (length code->u1-name)))
 269                                   ,(init-global '**unicode-1-name-char-database** t
 270                                                 (* 2 (length u1-name->code)))
 271                                   (defun !character-name-database-cold-init ()
 272                                     (setf **unicode-character-name-huffman-tree** ',tree
 273                                           **unicode-char-name-database**
 274                                           ',(convert-to-double-vector code->name)
 275                                           **unicode-name-char-database**
 276                                           ',(convert-to-double-vector name->code t)
 277                                           **unicode-1-char-name-database**
 278                                           ',(convert-to-double-vector code->u1-name)
 279                                           **unicode-1-name-char-database**
 280                                           ',(convert-to-double-vector u1-name->code t))))))))))))))
 281
 282   (frob))
 283 #+sb-xc-host (!character-name-database-cold-init)
 284
 285 (defglobal *base-char-name-alist*
 286   ;; Note: The *** markers here indicate character names which are
 287   ;; required by the ANSI specification of #'CHAR-NAME. For the others,
 288   ;; we prefer the ASCII standard name.
 289   '((#x00 "Nul" "Null" "^@")
 290     (#x01 "Soh" "^a")
 291     (#x02 "Stx" "^b")
 292     (#x03 "Etx" "^c")
 293     (#x04 "Eot" "^d")
 294     (#x05 "Enq" "^e")
 295     (#x06 "Ack" "^f")
 296     ;; Don't alias to Bell, another Unicode character has that name.
 297     (#x07 "Bel" "^g")
 298     (#x08 "Backspace" "^h" "Bs") ; *** See Note above
 299     (#x09 "Tab" "^i" "Ht") ; *** See Note above
 300     (#x0A "Newline" "Linefeed" "^j" "Lf" "Nl") ; *** See Note above
 301     (#x0B "Vt" "^k")
 302     (#x0C "Page" "^l" "Form" "Formfeed" "Ff" "Np") ; *** See Note above
 303     (#x0D "Return" "^m" "Cr") ; *** See Note above
 304     (#x0E "So" "^n")
 305     (#x0F "Si" "^o")
 306     (#x10 "Dle" "^p")
 307     (#x11 "Dc1" "^q")
 308     (#x12 "Dc2" "^r")
 309     (#x13 "Dc3" "^s")
 310     (#x14 "Dc4" "^t")
 311     (#x15 "Nak" "^u")
 312     (#x16 "Syn" "^v")
 313     (#x17 "Etb" "^w")
 314     (#x18 "Can" "^x")
 315     (#x19 "Em" "^y")
 316     (#x1A "Sub" "^z")
 317     (#x1B "Esc" "Escape" "^[" "Altmode" "Alt")
 318     (#x1C "Fs" "^\\")
 319     (#x1D "Gs" "^]")
 320     (#x1E "Rs" "^^")
 321     (#x1F "Us" "^_")
 322     (#x20 "Space" "Sp") ; *** See Note above
 323     (#x7f "Rubout" "Delete" "Del")
 324     (#x80 "C80")
 325     (#x81 "C81")
 326     (#x82 "Break-Permitted")
 327     (#x83 "No-Break-Permitted")
 328     (#x84 "C84")
 329     (#x85 "Next-Line")
 330     (#x86 "Start-Selected-Area")
 331     (#x87 "End-Selected-Area")
 332     (#x88 "Character-Tabulation-Set")
 333     (#x89 "Character-Tabulation-With-Justification")
 334     (#x8A "Line-Tabulation-Set")
 335     (#x8B "Partial-Line-Forward")
 336     (#x8C "Partial-Line-Backward")
 337     (#x8D "Reverse-Linefeed")
 338     (#x8E "Single-Shift-Two")
 339     (#x8F "Single-Shift-Three")
 340     (#x90 "Device-Control-String")
 341     (#x91 "Private-Use-One")
 342     (#x92 "Private-Use-Two")
 343     (#x93 "Set-Transmit-State")
 344     (#x94 "Cancel-Character")
 345     (#x95 "Message-Waiting")
 346     (#x96 "Start-Guarded-Area")
 347     (#x97 "End-Guarded-Area")
 348     (#x98 "Start-String")
 349     (#x99 "C99")
 350     (#x9A "Single-Character-Introducer")
 351     (#x9B "Control-Sequence-Introducer")
 352     (#x9C "String-Terminator")
 353     (#x9D "Operating-System-Command")
 354     (#x9E "Privacy-Message")
 355     (#x9F "Application-Program-Command"))) ; *** See Note above
 356 \f
 357 ;;;; UCD accessor functions
 358
 359 ;;; The character database is made of several arrays.
 360 ;;; **CHARACTER-MISC-DATABASE** is an array of bytes that encode character
 361 ;;; attributes. Each entry in the misc database is +misc-width+ (currently 8)
 362 ;;; bytes wide. Within each entry, the bytes represent: general category, BIDI
 363 ;;; class, canonical combining class, digit value, decomposition info, other
 364 ;;; flags, script, line break class, and age, respectively. Several of the
 365 ;;; entries have additional information encoded in them at the bit level. The
 366 ;;; digit value field is equal to 128 (has only its high bit set) if characters
 367 ;;; with that set of attribute are not digits. Bit 6 is set if that entry
 368 ;;; encodes decimal digits, that is, characters that are DIGIT-CHAR-P. The rest
 369 ;;; of the value is the digit value of characters with that entry. Decomposition
 370 ;;; info contains the length of the decomposition of characters with that entry,
 371 ;;; and also sets its high bit if the decompositions are compatibility
 372 ;;; decompositions. The other flags byte encodes boolean properties. Bit 7 is
 373 ;;; set if the entry's characters are BOTH-CASE-P in the Common Lisp sense. Bit
 374 ;;; 6 is set if the entry's characters hav a defined case transformation in
 375 ;;; Unicode. Bit 5 is set if the characters have the property BIDI_Mirrored=Y.
 376 ;;; Bits 3-0 encode the entry's East Asian Width. Bit 4 is unused. Age stores
 377 ;;; the minor version in bits 0-2, and the major version in the remaining 5
 378 ;;; bits.
 379 ;;;
 380 ;;; To find which entry in **CHARACTER-MISC-DATABASE** encodes a character's
 381 ;;; attributes, first index **CHARACTER-HIGH-PAGES** (an array of 16-bit
 382 ;;; values) with the high 13 bits of the character's codepoint. If the result
 383 ;;; value has its high bit set, the character is in a "compressed page". To
 384 ;;; find the misc entry number, simply clear the high bit. If the high bit is
 385 ;;; not set, the misc entry number must be looked up in
 386 ;;; **CHARACTER-LOW-PAGES**, which is an array of 16-bit values. Each entry in
 387 ;;; the array consists of two such values, the misc entry number and the
 388 ;;; decomposition index. To find the misc entry number, index into
 389 ;;; **CHARACTER-LOW-PAGES** using the value retreived from
 390 ;;; **CHARACTER-HIGH-PAGES** (shifted left 8 bits) plus the low 8 bits of the
 391 ;;; codepoint, all times two to account for the widtth of the entries. The
 392 ;;; value in **CHARACTER-LOW-PAGES** at this point is the misc entry number. To
 393 ;;; transform a misc entry number into an index into
 394 ;;; **CHARACTER-MISC-DATABASE**, multiply it by +misc-width*. This gives the
 395 ;;; index of the start of the charater's misc entry in
 396 ;;; **CHARACTER-MISC-DATABASE**.
 397 ;;;
 398 ;;; To look up a character's decomposition, first retreive its
 399 ;;; decomposition-info from the misc database as described above. If the
 400 ;;; decomposition info is not 0, the character has a decomposition with a
 401 ;;; length given by the decomposition info with the high bit (which indicates
 402 ;;; compatibility/canonical status) cleared. To find the decomposition, move
 403 ;;; one value past the character's misc entry number in
 404 ;;; **CHARACTER-LOW-DATABASE**, which gives an index into
 405 ;;; **CHARACTER-DECOMPOSITIONS**. The next LENGTH values in
 406 ;;; **CHARACTER-DECOMPOSITIONS** (an array of codepoints), starting at this
 407 ;;; index, are the decomposition of the character. This proceduce does not
 408 ;;; apply to Hangul syllables, which have their own decomposition algorithm.
 409 ;;;
 410 ;;; Case information is stored in **CHARACTER-UNICODE-CASES**, an array that
 411 ;;; indirectly maps a character's codepoint to (cons uppercase
 412 ;;; lowercase). Uppercase and lowercase are either a single codepoint,
 413 ;;; which is the upper- or lower-case of the given character, or a
 414 ;;; list of codepoints which taken as a whole are the upper- or
 415 ;;; lower-case. These case lists are only used in Unicode case
 416 ;;; transformations, not in Common Lisp ones.
 417 ;;;
 418 ;;; **CHARACTER-CASES** is similar to the above but it stores codes in
 419 ;;; a flat array twice as large, and it includes only the standard casing rules,
 420 ;;; so there's always just two characters.
 421 ;;;
 422 ;;; Similarly, composition information is stored in **CHARACTER-COMPOSITIONS**,
 423 ;;; which is a hash table of codepoints indexed by (+ (ash codepoint1 21)
 424 ;;; codepoint2).
 425
 426 (declaim (inline clear-flag))
 427 (defun clear-flag (bit integer)
 428   (logandc2 integer (ash 1 bit)))
 429
 430 (defconstant +misc-width+ 9)
 431
 432 (declaim (ftype (sfunction (t) (unsigned-byte 16)) misc-index))
 433 (defun misc-index (char)
 434   (let* ((cp (char-code char))
 435          (cp-high (ash cp -8))
 436          (high-index (aref **character-high-pages** cp-high)))
 437     (if (logbitp 15 high-index)
 438         (* +misc-width+ (clear-flag 15 high-index))
 439         (* +misc-width+
 440            (aref **character-low-pages**
 441                  (* 2 (+ (ldb (byte 8 0) cp) (ash high-index 8))))))))
 442
 443 (declaim (ftype (sfunction (t) (unsigned-byte 8)) ucd-general-category)
 444          (inline ucd-general-category))
 445 (defun ucd-general-category (char)
 446   (aref **character-misc-database** (misc-index char)))
 447
 448 (defun ucd-decimal-digit (char)
 449   (let ((digit (aref **character-misc-database**
 450                      (+ 3 (misc-index char)))))
 451     (when (logbitp 6 digit) ; decimalp flag
 452       (ldb (byte 4 0) digit))))
 453
 454 (defun char-code (char)
 455   "Return the integer code of CHAR."
 456   (char-code char))
 457
 458 (defun char-int (char)
 459   "Return the integer code of CHAR. (In SBCL this is the same as CHAR-CODE, as
 460 there are no character bits or fonts.)"
 461   (char-code char))
 462
 463 (defun code-char (code)
 464   "Return the character with the code CODE."
 465   (code-char code))
 466
 467 (defun character (object)
 468   "Coerce OBJECT into a CHARACTER if possible. Legal inputs are characters,
 469 strings and symbols of length 1."
 470   (flet ((do-error (control args)
 471            (declare (optimize allow-non-returning-tail-call))
 472            (error 'simple-type-error
 473                   :datum object
 474                   ;;?? how to express "symbol with name of length 1"?
 475                   :expected-type '(or character (string 1))
 476                   :format-control control
 477                   :format-arguments args)))
 478     (typecase object
 479       (character object)
 480       (string (if (= 1 (length (the string object)))
 481                   (char object 0)
 482                   (do-error
 483                    "String is not of length one: ~S" (list object))))
 484       (symbol (if (= 1 (length (symbol-name object)))
 485                   (schar (symbol-name object) 0)
 486                   (do-error
 487                    "Symbol name is not of length one: ~S" (list object))))
 488       (t (do-error "~S cannot be coerced to a character." (list object))))))
 489
 490 (defun char-name (char)
 491   "Return the name (a STRING) for a CHARACTER object."
 492   (let ((char-code (char-code char)))
 493     (or (second (assoc char-code *base-char-name-alist*))
 494         (let ((h-code (double-vector-binary-search char-code
 495                                                    **unicode-char-name-database**)))
 496           (cond
 497             (h-code
 498              (huffman-decode h-code **unicode-character-name-huffman-tree**))
 499             (t
 500              (format nil "U~X" char-code)))))))
 501
 502 (defun name-char (name)
 503   "Given an argument acceptable to STRING, NAME-CHAR returns a character whose
 504 name is that string, if one exists. Otherwise, NIL is returned."
 505   (let ((char-code (car (rassoc-if (lambda (names)
 506                                      (member name names :test #'string-equal))
 507                                    *base-char-name-alist*))))
 508     (cond (char-code
 509            (code-char char-code))
 510           ((let ((start (cond ((eql (string-not-equal "U+" name) 2)
 511                                2)
 512                               ((eql (string-not-equal "U" name) 1)
 513                                1))))
 514              (and start
 515                   (loop for i from start
 516                         below (length name)
 517                         always (digit-char-p (char name i) 16))
 518                   (code-char (parse-integer name :start start :radix 16)))))
 519           (t
 520            (let ((encoding (huffman-encode (string-upcase name)
 521                                            **unicode-character-name-huffman-tree**)))
 522              (when encoding
 523                (let ((char-code
 524                        (or
 525                         (double-vector-binary-search encoding
 526                                                      **unicode-name-char-database**)
 527                         (double-vector-binary-search encoding
 528                                                      **unicode-1-name-char-database**))))
 529                  (and char-code
 530                       (code-char char-code)))))))))
 531 \f
 532 ;;;; predicates
 533
 534 (defun standard-char-p (char)
 535   "The argument must be a character object. STANDARD-CHAR-P returns T if the
 536 argument is a standard character -- one of the 95 ASCII printing characters or
 537 <return>."
 538   (and (typep char 'base-char)
 539        (let ((n (char-code (the base-char char))))
 540          (or (< 31 n 127)
 541              (= n 10)))))
 542
 543 (defun %standard-char-p (thing)
 544   "Return T if and only if THING is a standard-char. Differs from
 545 STANDARD-CHAR-P in that THING doesn't have to be a character."
 546   (and (characterp thing) (standard-char-p thing)))
 547
 548 (defun graphic-char-p (char)
 549   "The argument must be a character object. GRAPHIC-CHAR-P returns T if the
 550 argument is a printing character (space through ~ in ASCII), otherwise returns
 551 NIL."
 552   (let ((n (char-code char)))
 553     (or (< 31 n 127)
 554         (< 159 n))))
 555
 556 (defun alpha-char-p (char)
 557   "The argument must be a character object. ALPHA-CHAR-P returns T if the
 558 argument is an alphabetic character, A-Z or a-z; otherwise NIL."
 559   (< (ucd-general-category char) 5))
 560
 561 (defmacro with-case-info ((char index-var cases-var
 562                            &key miss-value)
 563                           &body body)
 564   (let ((code-var (gensym "CODE"))
 565         (shifted-var (gensym "SHIFTED"))
 566         (page-var (gensym "PAGE")))
 567     `(block nil
 568        (locally
 569            (declare (optimize (sb!c::insert-array-bounds-checks 0)))
 570          (let ((,code-var (char-code ,char)))
 571            (let* ((,shifted-var (ash ,code-var -6))
 572                   (,page-var (if (>= ,shifted-var (length **character-case-pages**))
 573                                  (return ,miss-value)
 574                                  (aref **character-case-pages** ,shifted-var))))
 575              (if (= ,page-var 255)
 576                  ,miss-value
 577                  (let ((,index-var (* (+ (ash ,page-var 6)
 578                                          (ldb (byte 6 0) ,code-var))
 579                                       2))
 580                        (,cases-var **character-cases**))
 581                    ,@body))))))))
 582
 583 (defun both-case-p (char)
 584   "The argument must be a character object. BOTH-CASE-P returns T if the
 585 argument is an alphabetic character and if the character exists in both upper
 586 and lower case. For ASCII, this is the same as ALPHA-CHAR-P."
 587   (with-case-info (char index cases)
 588     (plusp (aref cases index))))
 589
 590 (defun upper-case-p (char)
 591   "The argument must be a character object; UPPER-CASE-P returns T if the
 592 argument is an upper-case character, NIL otherwise."
 593   (with-case-info (char index cases)
 594     (= (aref cases (1+ index))
 595        (char-code char))))
 596
 597 (defun lower-case-p (char)
 598   "The argument must be a character object; LOWER-CASE-P returns T if the
 599 argument is a lower-case character, NIL otherwise."
 600   (with-case-info (char index cases)
 601     (= (aref cases index)
 602        (char-code char))))
 603
 604 (defun char-upcase (char)
 605   "Return CHAR converted to upper-case if that is possible. Don't convert
 606 lowercase eszet (U+DF)."
 607   (with-case-info (char index cases
 608                    :miss-value char)
 609     (let ((code (aref cases (1+ index))))
 610       (if (zerop code)
 611           char
 612           (code-char code)))))
 613
 614 (defun char-downcase (char)
 615   "Return CHAR converted to lower-case if that is possible."
 616   (with-case-info (char index cases
 617                    :miss-value char)
 618     (let ((code (aref cases index)))
 619       (if (zerop code)
 620           char
 621           (code-char code)))))
 622
 623 (defun alphanumericp (char)
 624   "Given a character-object argument, ALPHANUMERICP returns T if the argument
 625 is either numeric or alphabetic."
 626   (let ((gc (ucd-general-category char)))
 627     (or (< gc 5)
 628         (= gc 13))))
 629
 630 ;;; EQUAL-CHAR-CODE is used by the following functions as a version of CHAR-INT
 631 ;;;  which loses font, bits, and case info.
 632
 633 ;;; Return a cons with (upper-case . lower-case), where it either can
 634 ;;; be a character code or a list of character codes if the character
 635 ;;; donwcases or upcases into multiple characters.
 636 (declaim (inline char-case-info))
 637 (defun char-case-info (character)
 638   (let* ((code (char-code character))
 639          (page (aref **character-case-pages** (ash code -6))))
 640     ;; Pages with 255 means the character is not both-case.
 641     ;; **character-cases** has 0 for those characters.
 642     (aref **character-unicode-cases**
 643           (+ (ash page 6)
 644              (ldb (byte 6 0) code)))))
 645
 646 ;;; Returns the downcased code or the character code
 647 (declaim (inline equal-char-code))
 648 (defun equal-char-code (char)
 649   (let* ((code (char-code char))
 650          (shifted (ash code -6))
 651          (page (if (>= shifted (length **character-case-pages**))
 652                    (return-from equal-char-code code)
 653                    (aref **character-case-pages** shifted))))
 654     (if (= page 255)
 655         code
 656         (let ((down-code
 657                 (aref **character-cases**
 658                       (* (+ (ash page 6)
 659                             (ldb (byte 6 0) code))
 660                          2))))
 661           (if (zerop down-code)
 662               code
 663               down-code)))))
 664
 665 (declaim (inline two-arg-char-equal-inline))
 666 (defun two-arg-char-equal-inline (c1 c2)
 667   (flet ((base-char-equal-p ()
 668            (let* ((code1 (char-code c1))
 669                   (code2 (char-code c2))
 670                   (sum (logxor code1 code2)))
 671              (when (eql sum #x20)
 672                (let ((sum (+ code1 code2)))
 673                  (or (and (< 161 sum 213))
 674                      (and (< 415 sum 461))
 675                      (and (< 463 sum 477))))))))
 676     (declare (inline base-char-equal-p))
 677     (cond ((eq c1 c2))
 678           #!-sb-unicode
 679           (t
 680            (base-char-equal-p))
 681           #!+sb-unicode
 682           ((base-char-p c1)
 683            (and (base-char-p c2)
 684                 (base-char-equal-p)))
 685           #!+sb-unicode
 686           ((base-char-p c2)
 687            nil)
 688           #!+sb-unicode
 689           (t
 690            (with-case-info (c1 index cases)
 691              (or (= (aref cases index) (char-code c2)) ;; lower case
 692                  (= (aref cases (1+ index)) (char-code c2))))))))
 693
 694 ;;; There are transforms on two-arg-char-equal, don't make it inlinable itself.
 695 (defun two-arg-char-equal (c1 c2)
 696   (two-arg-char-equal-inline c1 c2))
 697
 698 (defun two-arg-char-not-equal (c1 c2)
 699   (not (two-arg-char-equal-inline c1 c2)))
 700
 701 (macrolet ((def (name test doc)
 702              `(defun ,name (character &rest more-characters)
 703                 ,doc
 704                 (if more-characters
 705                     (do ((c character (nth i more-characters))
 706                          (i 0 (1+ i)))
 707                         ((>= i (length more-characters)) t)
 708                       (do-rest-arg ((c2) more-characters i)
 709                         (when ,test
 710                           (return-from ,name nil))))
 711                     ;; CHAR-NOT-EQUAL has explicit check attribute
 712                     (progn (the character character) t)))))
 713   (def char/= (eq c (the character c2))
 714        "Return T if no two of the arguments are the same character.")
 715   (def char-not-equal (two-arg-char-equal c c2)
 716        "Return T if no two of the arguments are the same character.
 717 Case is ignored."))
 718
 719 (defun two-arg-char-lessp (c1 c2)
 720   (< (equal-char-code c1) (equal-char-code c2)))
 721
 722 (defun two-arg-char-greaterp (c1 c2)
 723   (> (equal-char-code c1) (equal-char-code c2)))
 724
 725 (defun two-arg-char-not-greaterp (c1 c2)
 726   (<= (equal-char-code c1) (equal-char-code c2)))
 727
 728 (defun two-arg-char-not-lessp (c1 c2)
 729   (>= (equal-char-code c1) (equal-char-code c2)))
 730
 731 (macrolet ((def (op test doc &optional explicit-check)
 732              `(defun ,op (character &rest more-characters)
 733                 ,doc
 734                 ,@(when explicit-check `((declare (explicit-check))))
 735                 (let ((c1 character))
 736                   (declare (character c1))
 737                   (do-rest-arg ((c2 i) more-characters 0 t)
 738                      (if ,test
 739                          (setq c1 c2)
 740                          (return (do-rest-arg ((c) more-characters (1+ i))
 741                                    (the character c))))))))) ; for effect
 742   ;; case-sensitive
 743   (def char= (eq c1 (the character c2))
 744     "Return T if all of the arguments are the same character.")
 745   (def char< (< (char-int c1) (char-int c2))
 746     "Return T if the arguments are in strictly increasing alphabetic order.")
 747   (def char> (> (char-int c1) (char-int c2))
 748     "Return T if the arguments are in strictly decreasing alphabetic order.")
 749   (def char<= (<= (char-int c1) (char-int c2))
 750     "Return T if the arguments are in strictly non-decreasing alphabetic order.")
 751   (def char>= (>= (char-int c1) (char-int c2))
 752     "Return T if the arguments are in strictly non-increasing alphabetic order.")
 753
 754   ;; case-insensitive
 755   (def char-equal (two-arg-char-equal c1 c2)
 756     "Return T if all of the arguments are the same character.
 757 Case is ignored." t)
 758   (def char-lessp (two-arg-char-lessp c1 c2)
 759     "Return T if the arguments are in strictly increasing alphabetic order.
 760 Case is ignored." t)
 761   (def char-greaterp (two-arg-char-greaterp c1 c2)
 762     "Return T if the arguments are in strictly decreasing alphabetic order.
 763 Case is ignored." t)
 764   (def char-not-greaterp (two-arg-char-not-greaterp c1 c2)
 765     "Return T if the arguments are in strictly non-decreasing alphabetic order.
 766 Case is ignored." t)
 767   (def char-not-lessp (two-arg-char-not-lessp c1 c2)
 768     "Return T if the arguments are in strictly non-increasing alphabetic order.
 769 Case is ignored." t))
 770
 771 \f
 772 (defun digit-char-p (char &optional (radix 10.))
 773   "If char is a digit in the specified radix, returns the fixnum for which
 774 that digit stands, else returns NIL."
 775   (if (<= (char-code char) 127)
 776       (let ((weight (- (char-code char) 48)))
 777         (cond ((minusp weight) nil)
 778               ((<= radix 10.)
 779                ;; Special-case ASCII digits in decimal and smaller radices.
 780                (if (< weight radix) weight nil))
 781               ;; Digits 0 - 9 are used as is, since radix is larger.
 782               ((< weight 10) weight)
 783               ;; Check for upper case A - Z.
 784               ((and (>= (decf weight 7) 10) (< weight radix)) weight)
 785               ;; Also check lower case a - z.
 786               ((and (>= (decf weight 32) 10) (< weight radix)) weight)))
 787       (let ((number (ucd-decimal-digit char)))
 788         (when (and number (< (truly-the fixnum number) radix))
 789           number))))
 790
 791 (defun digit-char (weight &optional (radix 10))
 792   "All arguments must be integers. Returns a character object that represents
 793 a digit of the given weight in the specified radix. Returns NIL if no such
 794 character exists."
 795   (and (typep weight 'fixnum)
 796        (>= weight 0) (< weight radix) (< weight 36)
 797        (code-char (if (< weight 10) (+ 48 weight) (+ 55 weight)))))