Inline expansion of two-arg-char-equal without using notinline.
[sbcl.git] / src / code / target-char.lisp
blobfc58891b362f4e9878a83214596f4f1bb183baa2
1 ;;;; character functions
3 ;;;; This software is part of the SBCL system. See the README file for
4 ;;;; more information.
5 ;;;;
6 ;;;; This software is derived from the CMU CL system, which was
7 ;;;; written at Carnegie Mellon University and released into the
8 ;;;; public domain. The software is in the public domain and is
9 ;;;; provided with absolutely no warranty. See the COPYING and CREDITS
10 ;;;; files for more information.
12 (in-package "SB!IMPL")
14 ;;; We compile some trivial character operations via inline expansion.
15 #!-sb-fluid
16 (declaim (inline standard-char-p graphic-char-p alpha-char-p
17 alphanumericp))
18 (declaim (maybe-inline upper-case-p lower-case-p both-case-p
19 digit-char-p))
21 (deftype char-code ()
22 `(integer 0 (,sb!xc:char-code-limit)))
24 (defglobal **unicode-character-name-huffman-tree** ())
26 (declaim (inline pack-3-codepoints))
27 (defun pack-3-codepoints (first &optional (second 0) (third 0))
28 (declare (type (unsigned-byte 21) first second third))
29 (sb!c::mask-signed-field 63 (logior first (ash second 21) (ash third 42))))
31 (macrolet ((frob ()
32 (flet ((coerce-it (array)
33 (!coerce-to-specialized array '(unsigned-byte 8)))
34 (file (name type)
35 (let ((dir (sb-cold:prepend-genfile-path "output/")))
36 (make-pathname :directory (pathname-directory (merge-pathnames dir))
37 :name name :type type)))
38 (read-ub8-vector (pathname)
39 (with-open-file (stream pathname
40 :element-type '(unsigned-byte 8))
41 (let* ((length (file-length stream))
42 (array (make-array
43 length :element-type '(unsigned-byte 8))))
44 (read-sequence array stream)
45 array)))
46 (init-global (name type &optional length)
47 `(progn
48 (defglobal ,name
49 ,(if (eql type 'hash-table)
50 `(make-hash-table)
51 `(make-array ,length :element-type ',type)))
52 (declaim (type ,(if (eql type 'hash-table)
53 'hash-table
54 `(simple-array ,type (,length))) ,name)))))
55 (let ((misc-database (coerce-it (read-ub8-vector (file "ucdmisc" "dat"))))
56 (ucd-high-pages (coerce-it (read-ub8-vector (file "ucdhigh" "dat"))))
57 (ucd-low-pages (coerce-it (read-ub8-vector (file "ucdlow" "dat"))))
58 (decompositions (coerce-it (read-ub8-vector (file "decomp" "dat"))))
59 (primary-compositions (coerce-it (read-ub8-vector (file "comp" "dat"))))
60 (case-data (coerce-it (read-ub8-vector (file "case" "dat"))))
61 (case-pages (coerce-it (read-ub8-vector (file "casepages" "dat"))))
62 (collations (coerce-it (read-ub8-vector (file "collation" "dat")))))
63 `(progn
64 ;; KLUDGE: All temporary values, fixed up in cold-load
65 ,(init-global '**character-misc-database** '(unsigned-byte 8)
66 (length misc-database))
67 ,(init-global '**character-high-pages** '(unsigned-byte 16)
68 (/ (length ucd-high-pages) 2))
69 ,(init-global '**character-low-pages** '(unsigned-byte 16)
70 (/ (length ucd-low-pages) 2))
71 ,(init-global '**character-decompositions** '(unsigned-byte 21)
72 (/ (length decompositions) 3))
73 ,(init-global '**character-case-pages** '(unsigned-byte 8)
74 (length case-pages))
75 ,(init-global '**character-primary-compositions** 'hash-table)
76 ,(init-global '**character-unicode-cases** t
77 (* 64 (1+ (aref case-pages
78 (1- (length case-pages))))))
79 ,(init-global '**character-cases** '(unsigned-byte 32)
80 (* 2 64 (1+ (aref case-pages
81 (1- (length case-pages))))))
82 ,(init-global '**character-collations** 'hash-table)
84 (defun !character-database-cold-init ()
85 (flet ((make-ubn-vector (raw-bytes n)
86 (let ((new-array
87 (make-array
88 (/ (length raw-bytes) n)
89 :element-type (list 'unsigned-byte (* 8 n)))))
90 (loop for i from 0 below (length raw-bytes) by n
91 for element = 0 do
92 (loop for offset from 0 below n do
93 (incf element
94 (ash (aref raw-bytes (+ i offset))
95 (* 8 (- n offset 1)))))
96 (setf (aref new-array (/ i n)) element))
97 new-array)))
98 (setf **character-misc-database**
99 ,misc-database
100 **character-high-pages**
101 (make-ubn-vector ,ucd-high-pages 2)
102 **character-low-pages**
103 (make-ubn-vector ,ucd-low-pages 2)
104 **character-case-pages**
105 ,(coerce-it case-pages)
106 **character-decompositions**
107 (make-ubn-vector ,decompositions 3))
109 (setf **character-primary-compositions**
110 (let ((table (make-hash-table))
111 (info (make-ubn-vector ,primary-compositions 3)))
112 (dotimes (i (/ (length info) 3))
113 (setf (gethash (dpb (aref info (* 3 i)) (byte 21 21)
114 (aref info (1+ (* 3 i))))
115 table)
116 (aref info (+ (* 3 i) 2))))
117 table))
119 (let* ((unicode-table
120 (make-array
121 (* 64 (1+ (aref **character-case-pages**
122 (1- (length **character-case-pages**)))))))
123 (table (make-array
124 (* 2 (length unicode-table))
125 :element-type '(unsigned-byte 32)))
126 (info ,case-data)
127 (index 0)
128 (length (length info)))
129 (labels ((read-codepoint ()
130 (let* ((b1 (aref info index))
131 (b2 (aref info (incf index)))
132 (b3 (aref info (incf index))))
133 (incf index)
134 (dpb b1 (byte 8 16)
135 (dpb b2 (byte 8 8) b3))))
136 (read-length-tagged ()
137 (let ((len (aref info index)) ret)
138 (incf index)
139 (cond ((zerop len)
140 (read-codepoint))
142 (dotimes (i len)
143 (push (read-codepoint) ret))
144 (nreverse ret))))))
145 (loop until (>= index length)
146 for key = (read-codepoint)
147 for upper = (read-length-tagged)
148 for lower = (read-length-tagged)
149 for page = (aref **character-case-pages** (ash key -6))
150 for i = (+ (ash page 6) (ldb (byte 6 0) key))
152 (setf (aref unicode-table i) (cons upper lower))
153 when
154 (flet (#!+sb-unicode
155 (both-case-p (code)
156 (logbitp 7 (aref **character-misc-database**
157 (+ 5 (misc-index (code-char code)))))))
158 (and (atom upper)
159 (atom lower)
160 ;; Some characters are only equal under unicode rules,
161 ;; e.g. #\MICRO_SIGN and #\GREEK_CAPITAL_LETTER_MU
162 #!+sb-unicode
163 (both-case-p lower)
164 #!+sb-unicode
165 (both-case-p upper)))
167 (setf (aref table (* i 2)) lower
168 (aref table (1+ (* i 2))) upper)))
169 (setf **character-unicode-cases** unicode-table)
170 (setf **character-cases** table))
172 (setf **character-collations**
173 (let* ((table (make-hash-table))
174 (index 0)
175 (info (make-ubn-vector ,collations 4))
176 (len (length info)))
177 (loop while (< index len) do
178 (let* ((entry-head (aref info index))
179 (cp-length (ldb (byte 4 28) entry-head))
180 (key-length (ldb (byte 5 23) entry-head))
181 (key (make-array
182 key-length
183 :element-type '(unsigned-byte 32)))
184 (codepoints nil))
185 (assert (and (/= cp-length 0) (/= key-length 0)))
186 (loop repeat cp-length do
187 (push (dpb 0 (byte 10 22) (aref info index))
188 codepoints)
189 (incf index))
190 (setf codepoints (nreverse codepoints))
191 (dotimes (i key-length)
192 (setf (aref key i) (aref info index))
193 (incf index))
194 (setf (gethash
195 (apply #'pack-3-codepoints codepoints)
196 table) (logically-readonlyize key))))
197 table))))
199 ,(with-open-file
200 (stream (file "ucd-names" "lisp-expr"))
201 (with-open-file (u1-stream (file "ucd1-names" "lisp-expr"))
202 (flet ((convert-to-double-vector (vector &optional reversed)
203 (let ((result (make-array (* (length vector) 2))))
204 (loop for (code . name) across vector
205 for i by 2
207 (when reversed
208 (rotatef code name))
209 (setf (aref result i) code
210 (aref result (1+ i)) name))
211 result)))
212 (let ((names (make-hash-table))
213 (u1-names (make-hash-table)))
214 (loop
215 for code-point = (read stream nil nil)
216 for char-name = (string-upcase (read stream nil nil))
217 while code-point
218 do (setf (gethash code-point names) char-name))
219 (loop
220 for code-point = (read u1-stream nil nil)
221 for char-name = (string-upcase (read u1-stream nil nil))
222 while code-point
223 do (setf (gethash code-point u1-names) char-name))
224 (let ((tree
225 (make-huffman-tree
226 (let (list)
227 (maphash (lambda (code name)
228 (declare (ignore code))
229 (push name list))
230 names)
231 (maphash (lambda (code u1-name)
232 (declare (ignore code))
233 (push u1-name list))
234 u1-names)
235 list)))
236 (code->name
237 (make-array (hash-table-count names)
238 :fill-pointer 0))
239 (name->code nil)
240 (code->u1-name
241 (make-array (hash-table-count u1-names)
242 :fill-pointer 0))
243 (u1-name->code nil))
244 (maphash (lambda (code name)
245 (vector-push
246 (cons code (huffman-encode name tree))
247 code->name))
248 names)
249 (maphash (lambda (code name)
250 (vector-push
251 (cons code (huffman-encode name tree))
252 code->u1-name))
253 u1-names)
254 (setf name->code
255 (sort (copy-seq code->name) #'< :key #'cdr)
256 code->name
257 (sort (copy-seq name->code) #'< :key #'car)
258 u1-name->code
259 (sort (copy-seq code->u1-name) #'< :key #'cdr)
260 code->u1-name
261 (sort (copy-seq u1-name->code) #'< :key #'car))
262 `(progn
263 ,(init-global '**unicode-char-name-database** t
264 (* 2 (length code->name)))
265 ,(init-global '**unicode-name-char-database** t
266 (* 2 (length name->code)))
267 ,(init-global '**unicode-1-char-name-database** t
268 (* 2 (length code->u1-name)))
269 ,(init-global '**unicode-1-name-char-database** t
270 (* 2 (length u1-name->code)))
271 (defun !character-name-database-cold-init ()
272 (setf **unicode-character-name-huffman-tree** ',tree
273 **unicode-char-name-database**
274 ',(convert-to-double-vector code->name)
275 **unicode-name-char-database**
276 ',(convert-to-double-vector name->code t)
277 **unicode-1-char-name-database**
278 ',(convert-to-double-vector code->u1-name)
279 **unicode-1-name-char-database**
280 ',(convert-to-double-vector u1-name->code t))))))))))))))
282 (frob))
283 #+sb-xc-host (!character-name-database-cold-init)
285 (defglobal *base-char-name-alist*
286 ;; Note: The *** markers here indicate character names which are
287 ;; required by the ANSI specification of #'CHAR-NAME. For the others,
288 ;; we prefer the ASCII standard name.
289 '((#x00 "Nul" "Null" "^@")
290 (#x01 "Soh" "^a")
291 (#x02 "Stx" "^b")
292 (#x03 "Etx" "^c")
293 (#x04 "Eot" "^d")
294 (#x05 "Enq" "^e")
295 (#x06 "Ack" "^f")
296 ;; Don't alias to Bell, another Unicode character has that name.
297 (#x07 "Bel" "^g")
298 (#x08 "Backspace" "^h" "Bs") ; *** See Note above
299 (#x09 "Tab" "^i" "Ht") ; *** See Note above
300 (#x0A "Newline" "Linefeed" "^j" "Lf" "Nl") ; *** See Note above
301 (#x0B "Vt" "^k")
302 (#x0C "Page" "^l" "Form" "Formfeed" "Ff" "Np") ; *** See Note above
303 (#x0D "Return" "^m" "Cr") ; *** See Note above
304 (#x0E "So" "^n")
305 (#x0F "Si" "^o")
306 (#x10 "Dle" "^p")
307 (#x11 "Dc1" "^q")
308 (#x12 "Dc2" "^r")
309 (#x13 "Dc3" "^s")
310 (#x14 "Dc4" "^t")
311 (#x15 "Nak" "^u")
312 (#x16 "Syn" "^v")
313 (#x17 "Etb" "^w")
314 (#x18 "Can" "^x")
315 (#x19 "Em" "^y")
316 (#x1A "Sub" "^z")
317 (#x1B "Esc" "Escape" "^[" "Altmode" "Alt")
318 (#x1C "Fs" "^\\")
319 (#x1D "Gs" "^]")
320 (#x1E "Rs" "^^")
321 (#x1F "Us" "^_")
322 (#x20 "Space" "Sp") ; *** See Note above
323 (#x7f "Rubout" "Delete" "Del")
324 (#x80 "C80")
325 (#x81 "C81")
326 (#x82 "Break-Permitted")
327 (#x83 "No-Break-Permitted")
328 (#x84 "C84")
329 (#x85 "Next-Line")
330 (#x86 "Start-Selected-Area")
331 (#x87 "End-Selected-Area")
332 (#x88 "Character-Tabulation-Set")
333 (#x89 "Character-Tabulation-With-Justification")
334 (#x8A "Line-Tabulation-Set")
335 (#x8B "Partial-Line-Forward")
336 (#x8C "Partial-Line-Backward")
337 (#x8D "Reverse-Linefeed")
338 (#x8E "Single-Shift-Two")
339 (#x8F "Single-Shift-Three")
340 (#x90 "Device-Control-String")
341 (#x91 "Private-Use-One")
342 (#x92 "Private-Use-Two")
343 (#x93 "Set-Transmit-State")
344 (#x94 "Cancel-Character")
345 (#x95 "Message-Waiting")
346 (#x96 "Start-Guarded-Area")
347 (#x97 "End-Guarded-Area")
348 (#x98 "Start-String")
349 (#x99 "C99")
350 (#x9A "Single-Character-Introducer")
351 (#x9B "Control-Sequence-Introducer")
352 (#x9C "String-Terminator")
353 (#x9D "Operating-System-Command")
354 (#x9E "Privacy-Message")
355 (#x9F "Application-Program-Command"))) ; *** See Note above
357 ;;;; UCD accessor functions
359 ;;; The character database is made of several arrays.
360 ;;; **CHARACTER-MISC-DATABASE** is an array of bytes that encode character
361 ;;; attributes. Each entry in the misc database is +misc-width+ (currently 8)
362 ;;; bytes wide. Within each entry, the bytes represent: general category, BIDI
363 ;;; class, canonical combining class, digit value, decomposition info, other
364 ;;; flags, script, line break class, and age, respectively. Several of the
365 ;;; entries have additional information encoded in them at the bit level. The
366 ;;; digit value field is equal to 128 (has only its high bit set) if characters
367 ;;; with that set of attribute are not digits. Bit 6 is set if that entry
368 ;;; encodes decimal digits, that is, characters that are DIGIT-CHAR-P. The rest
369 ;;; of the value is the digit value of characters with that entry. Decomposition
370 ;;; info contains the length of the decomposition of characters with that entry,
371 ;;; and also sets its high bit if the decompositions are compatibility
372 ;;; decompositions. The other flags byte encodes boolean properties. Bit 7 is
373 ;;; set if the entry's characters are BOTH-CASE-P in the Common Lisp sense. Bit
374 ;;; 6 is set if the entry's characters hav a defined case transformation in
375 ;;; Unicode. Bit 5 is set if the characters have the property BIDI_Mirrored=Y.
376 ;;; Bits 3-0 encode the entry's East Asian Width. Bit 4 is unused. Age stores
377 ;;; the minor version in bits 0-2, and the major version in the remaining 5
378 ;;; bits.
380 ;;; To find which entry in **CHARACTER-MISC-DATABASE** encodes a character's
381 ;;; attributes, first index **CHARACTER-HIGH-PAGES** (an array of 16-bit
382 ;;; values) with the high 13 bits of the character's codepoint. If the result
383 ;;; value has its high bit set, the character is in a "compressed page". To
384 ;;; find the misc entry number, simply clear the high bit. If the high bit is
385 ;;; not set, the misc entry number must be looked up in
386 ;;; **CHARACTER-LOW-PAGES**, which is an array of 16-bit values. Each entry in
387 ;;; the array consists of two such values, the misc entry number and the
388 ;;; decomposition index. To find the misc entry number, index into
389 ;;; **CHARACTER-LOW-PAGES** using the value retreived from
390 ;;; **CHARACTER-HIGH-PAGES** (shifted left 8 bits) plus the low 8 bits of the
391 ;;; codepoint, all times two to account for the widtth of the entries. The
392 ;;; value in **CHARACTER-LOW-PAGES** at this point is the misc entry number. To
393 ;;; transform a misc entry number into an index into
394 ;;; **CHARACTER-MISC-DATABASE**, multiply it by +misc-width*. This gives the
395 ;;; index of the start of the charater's misc entry in
396 ;;; **CHARACTER-MISC-DATABASE**.
398 ;;; To look up a character's decomposition, first retreive its
399 ;;; decomposition-info from the misc database as described above. If the
400 ;;; decomposition info is not 0, the character has a decomposition with a
401 ;;; length given by the decomposition info with the high bit (which indicates
402 ;;; compatibility/canonical status) cleared. To find the decomposition, move
403 ;;; one value past the character's misc entry number in
404 ;;; **CHARACTER-LOW-DATABASE**, which gives an index into
405 ;;; **CHARACTER-DECOMPOSITIONS**. The next LENGTH values in
406 ;;; **CHARACTER-DECOMPOSITIONS** (an array of codepoints), starting at this
407 ;;; index, are the decomposition of the character. This proceduce does not
408 ;;; apply to Hangul syllables, which have their own decomposition algorithm.
410 ;;; Case information is stored in **CHARACTER-UNICODE-CASES**, an array that
411 ;;; indirectly maps a character's codepoint to (cons uppercase
412 ;;; lowercase). Uppercase and lowercase are either a single codepoint,
413 ;;; which is the upper- or lower-case of the given character, or a
414 ;;; list of codepoints which taken as a whole are the upper- or
415 ;;; lower-case. These case lists are only used in Unicode case
416 ;;; transformations, not in Common Lisp ones.
418 ;;; **CHARACTER-CASES** is similar to the above but it stores codes in
419 ;;; a flat array twice as large, and it includes only the standard casing rules,
420 ;;; so there's always just two characters.
422 ;;; Similarly, composition information is stored in **CHARACTER-COMPOSITIONS**,
423 ;;; which is a hash table of codepoints indexed by (+ (ash codepoint1 21)
424 ;;; codepoint2).
426 (declaim (inline clear-flag))
427 (defun clear-flag (bit integer)
428 (logandc2 integer (ash 1 bit)))
430 (defconstant +misc-width+ 9)
432 (declaim (ftype (sfunction (t) (unsigned-byte 16)) misc-index))
433 (defun misc-index (char)
434 (let* ((cp (char-code char))
435 (cp-high (ash cp -8))
436 (high-index (aref **character-high-pages** cp-high)))
437 (if (logbitp 15 high-index)
438 (* +misc-width+ (clear-flag 15 high-index))
439 (* +misc-width+
440 (aref **character-low-pages**
441 (* 2 (+ (ldb (byte 8 0) cp) (ash high-index 8))))))))
443 (declaim (ftype (sfunction (t) (unsigned-byte 8)) ucd-general-category)
444 (inline ucd-general-category))
445 (defun ucd-general-category (char)
446 (aref **character-misc-database** (misc-index char)))
448 (defun ucd-decimal-digit (char)
449 (let ((digit (aref **character-misc-database**
450 (+ 3 (misc-index char)))))
451 (when (logbitp 6 digit) ; decimalp flag
452 (ldb (byte 4 0) digit))))
454 (defun char-code (char)
455 "Return the integer code of CHAR."
456 (char-code char))
458 (defun char-int (char)
459 "Return the integer code of CHAR. (In SBCL this is the same as CHAR-CODE, as
460 there are no character bits or fonts.)"
461 (char-code char))
463 (defun code-char (code)
464 "Return the character with the code CODE."
465 (code-char code))
467 (defun character (object)
468 "Coerce OBJECT into a CHARACTER if possible. Legal inputs are characters,
469 strings and symbols of length 1."
470 (flet ((do-error (control args)
471 (declare (optimize allow-non-returning-tail-call))
472 (error 'simple-type-error
473 :datum object
474 ;;?? how to express "symbol with name of length 1"?
475 :expected-type '(or character (string 1))
476 :format-control control
477 :format-arguments args)))
478 (typecase object
479 (character object)
480 (string (if (= 1 (length (the string object)))
481 (char object 0)
482 (do-error
483 "String is not of length one: ~S" (list object))))
484 (symbol (if (= 1 (length (symbol-name object)))
485 (schar (symbol-name object) 0)
486 (do-error
487 "Symbol name is not of length one: ~S" (list object))))
488 (t (do-error "~S cannot be coerced to a character." (list object))))))
490 (defun char-name (char)
491 "Return the name (a STRING) for a CHARACTER object."
492 (let ((char-code (char-code char)))
493 (or (second (assoc char-code *base-char-name-alist*))
494 (let ((h-code (double-vector-binary-search char-code
495 **unicode-char-name-database**)))
496 (cond
497 (h-code
498 (huffman-decode h-code **unicode-character-name-huffman-tree**))
500 (format nil "U~X" char-code)))))))
502 (defun name-char (name)
503 "Given an argument acceptable to STRING, NAME-CHAR returns a character whose
504 name is that string, if one exists. Otherwise, NIL is returned."
505 (let ((char-code (car (rassoc-if (lambda (names)
506 (member name names :test #'string-equal))
507 *base-char-name-alist*))))
508 (cond (char-code
509 (code-char char-code))
510 ((let ((start (cond ((eql (string-not-equal "U+" name) 2)
512 ((eql (string-not-equal "U" name) 1)
513 1))))
514 (and start
515 (loop for i from start
516 below (length name)
517 always (digit-char-p (char name i) 16))
518 (code-char (parse-integer name :start start :radix 16)))))
520 (let ((encoding (huffman-encode (string-upcase name)
521 **unicode-character-name-huffman-tree**)))
522 (when encoding
523 (let ((char-code
525 (double-vector-binary-search encoding
526 **unicode-name-char-database**)
527 (double-vector-binary-search encoding
528 **unicode-1-name-char-database**))))
529 (and char-code
530 (code-char char-code)))))))))
532 ;;;; predicates
534 (defun standard-char-p (char)
535 "The argument must be a character object. STANDARD-CHAR-P returns T if the
536 argument is a standard character -- one of the 95 ASCII printing characters or
537 <return>."
538 (and (typep char 'base-char)
539 (let ((n (char-code (the base-char char))))
540 (or (< 31 n 127)
541 (= n 10)))))
543 (defun %standard-char-p (thing)
544 "Return T if and only if THING is a standard-char. Differs from
545 STANDARD-CHAR-P in that THING doesn't have to be a character."
546 (and (characterp thing) (standard-char-p thing)))
548 (defun graphic-char-p (char)
549 "The argument must be a character object. GRAPHIC-CHAR-P returns T if the
550 argument is a printing character (space through ~ in ASCII), otherwise returns
551 NIL."
552 (let ((n (char-code char)))
553 (or (< 31 n 127)
554 (< 159 n))))
556 (defun alpha-char-p (char)
557 "The argument must be a character object. ALPHA-CHAR-P returns T if the
558 argument is an alphabetic character, A-Z or a-z; otherwise NIL."
559 (< (ucd-general-category char) 5))
561 (defmacro with-case-info ((char index-var cases-var
562 &key miss-value)
563 &body body)
564 (let ((code-var (gensym "CODE"))
565 (shifted-var (gensym "SHIFTED"))
566 (page-var (gensym "PAGE")))
567 `(block nil
568 (locally
569 (declare (optimize (sb!c::insert-array-bounds-checks 0)))
570 (let ((,code-var (char-code ,char)))
571 (let* ((,shifted-var (ash ,code-var -6))
572 (,page-var (if (>= ,shifted-var (length **character-case-pages**))
573 (return ,miss-value)
574 (aref **character-case-pages** ,shifted-var))))
575 (if (= ,page-var 255)
576 ,miss-value
577 (let ((,index-var (* (+ (ash ,page-var 6)
578 (ldb (byte 6 0) ,code-var))
580 (,cases-var **character-cases**))
581 ,@body))))))))
583 (defun both-case-p (char)
584 "The argument must be a character object. BOTH-CASE-P returns T if the
585 argument is an alphabetic character and if the character exists in both upper
586 and lower case. For ASCII, this is the same as ALPHA-CHAR-P."
587 (with-case-info (char index cases)
588 (plusp (aref cases index))))
590 (defun upper-case-p (char)
591 "The argument must be a character object; UPPER-CASE-P returns T if the
592 argument is an upper-case character, NIL otherwise."
593 (with-case-info (char index cases)
594 (= (aref cases (1+ index))
595 (char-code char))))
597 (defun lower-case-p (char)
598 "The argument must be a character object; LOWER-CASE-P returns T if the
599 argument is a lower-case character, NIL otherwise."
600 (with-case-info (char index cases)
601 (= (aref cases index)
602 (char-code char))))
604 (defun char-upcase (char)
605 "Return CHAR converted to upper-case if that is possible. Don't convert
606 lowercase eszet (U+DF)."
607 (with-case-info (char index cases
608 :miss-value char)
609 (let ((code (aref cases (1+ index))))
610 (if (zerop code)
611 char
612 (code-char code)))))
614 (defun char-downcase (char)
615 "Return CHAR converted to lower-case if that is possible."
616 (with-case-info (char index cases
617 :miss-value char)
618 (let ((code (aref cases index)))
619 (if (zerop code)
620 char
621 (code-char code)))))
623 (defun alphanumericp (char)
624 "Given a character-object argument, ALPHANUMERICP returns T if the argument
625 is either numeric or alphabetic."
626 (let ((gc (ucd-general-category char)))
627 (or (< gc 5)
628 (= gc 13))))
630 ;;; EQUAL-CHAR-CODE is used by the following functions as a version of CHAR-INT
631 ;;; which loses font, bits, and case info.
633 ;;; Return a cons with (upper-case . lower-case), where it either can
634 ;;; be a character code or a list of character codes if the character
635 ;;; donwcases or upcases into multiple characters.
636 (declaim (inline char-case-info))
637 (defun char-case-info (character)
638 (let* ((code (char-code character))
639 (page (aref **character-case-pages** (ash code -6))))
640 ;; Pages with 255 means the character is not both-case.
641 ;; **character-cases** has 0 for those characters.
642 (aref **character-unicode-cases**
643 (+ (ash page 6)
644 (ldb (byte 6 0) code)))))
646 ;;; Returns the downcased code or the character code
647 (declaim (inline equal-char-code))
648 (defun equal-char-code (char)
649 (let* ((code (char-code char))
650 (shifted (ash code -6))
651 (page (if (>= shifted (length **character-case-pages**))
652 (return-from equal-char-code code)
653 (aref **character-case-pages** shifted))))
654 (if (= page 255)
655 code
656 (let ((down-code
657 (aref **character-cases**
658 (* (+ (ash page 6)
659 (ldb (byte 6 0) code))
660 2))))
661 (if (zerop down-code)
662 code
663 down-code)))))
665 (declaim (inline two-arg-char-equal-inline))
666 (defun two-arg-char-equal-inline (c1 c2)
667 (flet ((base-char-equal-p ()
668 (let* ((code1 (char-code c1))
669 (code2 (char-code c2))
670 (sum (logxor code1 code2)))
671 (when (eql sum #x20)
672 (let ((sum (+ code1 code2)))
673 (or (and (< 161 sum 213))
674 (and (< 415 sum 461))
675 (and (< 463 sum 477))))))))
676 (declare (inline base-char-equal-p))
677 (cond ((eq c1 c2))
678 #!-sb-unicode
680 (base-char-equal-p))
681 #!+sb-unicode
682 ((base-char-p c1)
683 (and (base-char-p c2)
684 (base-char-equal-p)))
685 #!+sb-unicode
686 ((base-char-p c2)
687 nil)
688 #!+sb-unicode
690 (with-case-info (c1 index cases)
691 (or (= (aref cases index) (char-code c2)) ;; lower case
692 (= (aref cases (1+ index)) (char-code c2))))))))
694 ;;; There are transforms on two-arg-char-equal, don't make it inlinable itself.
695 (defun two-arg-char-equal (c1 c2)
696 (two-arg-char-equal-inline c1 c2))
698 (defun two-arg-char-not-equal (c1 c2)
699 (not (two-arg-char-equal-inline c1 c2)))
701 (macrolet ((def (name test doc)
702 `(defun ,name (character &rest more-characters)
703 ,doc
704 (if more-characters
705 (do ((c character (nth i more-characters))
706 (i 0 (1+ i)))
707 ((>= i (length more-characters)) t)
708 (do-rest-arg ((c2) more-characters i)
709 (when ,test
710 (return-from ,name nil))))
711 ;; CHAR-NOT-EQUAL has explicit check attribute
712 (progn (the character character) t)))))
713 (def char/= (eq c (the character c2))
714 "Return T if no two of the arguments are the same character.")
715 (def char-not-equal (two-arg-char-equal c c2)
716 "Return T if no two of the arguments are the same character.
717 Case is ignored."))
719 (defun two-arg-char-lessp (c1 c2)
720 (< (equal-char-code c1) (equal-char-code c2)))
722 (defun two-arg-char-greaterp (c1 c2)
723 (> (equal-char-code c1) (equal-char-code c2)))
725 (defun two-arg-char-not-greaterp (c1 c2)
726 (<= (equal-char-code c1) (equal-char-code c2)))
728 (defun two-arg-char-not-lessp (c1 c2)
729 (>= (equal-char-code c1) (equal-char-code c2)))
731 (macrolet ((def (op test doc &optional explicit-check)
732 `(defun ,op (character &rest more-characters)
733 ,doc
734 ,@(when explicit-check `((declare (explicit-check))))
735 (let ((c1 character))
736 (declare (character c1))
737 (do-rest-arg ((c2 i) more-characters 0 t)
738 (if ,test
739 (setq c1 c2)
740 (return (do-rest-arg ((c) more-characters (1+ i))
741 (the character c))))))))) ; for effect
742 ;; case-sensitive
743 (def char= (eq c1 (the character c2))
744 "Return T if all of the arguments are the same character.")
745 (def char< (< (char-int c1) (char-int c2))
746 "Return T if the arguments are in strictly increasing alphabetic order.")
747 (def char> (> (char-int c1) (char-int c2))
748 "Return T if the arguments are in strictly decreasing alphabetic order.")
749 (def char<= (<= (char-int c1) (char-int c2))
750 "Return T if the arguments are in strictly non-decreasing alphabetic order.")
751 (def char>= (>= (char-int c1) (char-int c2))
752 "Return T if the arguments are in strictly non-increasing alphabetic order.")
754 ;; case-insensitive
755 (def char-equal (two-arg-char-equal c1 c2)
756 "Return T if all of the arguments are the same character.
757 Case is ignored." t)
758 (def char-lessp (two-arg-char-lessp c1 c2)
759 "Return T if the arguments are in strictly increasing alphabetic order.
760 Case is ignored." t)
761 (def char-greaterp (two-arg-char-greaterp c1 c2)
762 "Return T if the arguments are in strictly decreasing alphabetic order.
763 Case is ignored." t)
764 (def char-not-greaterp (two-arg-char-not-greaterp c1 c2)
765 "Return T if the arguments are in strictly non-decreasing alphabetic order.
766 Case is ignored." t)
767 (def char-not-lessp (two-arg-char-not-lessp c1 c2)
768 "Return T if the arguments are in strictly non-increasing alphabetic order.
769 Case is ignored." t))
772 (defun digit-char-p (char &optional (radix 10.))
773 "If char is a digit in the specified radix, returns the fixnum for which
774 that digit stands, else returns NIL."
775 (if (<= (char-code char) 127)
776 (let ((weight (- (char-code char) 48)))
777 (cond ((minusp weight) nil)
778 ((<= radix 10.)
779 ;; Special-case ASCII digits in decimal and smaller radices.
780 (if (< weight radix) weight nil))
781 ;; Digits 0 - 9 are used as is, since radix is larger.
782 ((< weight 10) weight)
783 ;; Check for upper case A - Z.
784 ((and (>= (decf weight 7) 10) (< weight radix)) weight)
785 ;; Also check lower case a - z.
786 ((and (>= (decf weight 32) 10) (< weight radix)) weight)))
787 (let ((number (ucd-decimal-digit char)))
788 (when (and number (< (truly-the fixnum number) radix))
789 number))))
791 (defun digit-char (weight &optional (radix 10))
792 "All arguments must be integers. Returns a character object that represents
793 a digit of the given weight in the specified radix. Returns NIL if no such
794 character exists."
795 (and (typep weight 'fixnum)
796 (>= weight 0) (< weight radix) (< weight 36)
797 (code-char (if (< weight 10) (+ 48 weight) (+ 55 weight)))))