1 ;;;; character functions
3 ;;;; This software is part of the SBCL system. See the README file for
6 ;;;; This software is derived from the CMU CL system, which was
7 ;;;; written at Carnegie Mellon University and released into the
8 ;;;; public domain. The software is in the public domain and is
9 ;;;; provided with absolutely no warranty. See the COPYING and CREDITS
10 ;;;; files for more information.
12 (in-package "SB-IMPL")
14 ;;; We compile some trivial character operations via inline expansion.
15 (declaim (inline standard-char-p graphic-char-p alpha-char-p
17 (declaim (maybe-inline upper-case-p lower-case-p both-case-p
20 (declaim (inline clear-flag
))
21 (eval-when (:compile-toplevel
:load-toplevel
:execute
)
22 (defun clear-flag (bit integer
)
23 (logandc2 integer
(ash 1 bit
))))
25 (eval-when (:compile-toplevel
)
26 (defconstant +misc-width
+ 9)
27 (defmacro misc-index-from-char-code
(codepoint high-pages low-pages
)
28 (let ((high-pages-value (and (boundp high-pages
)
29 (symbol-value high-pages
)))
30 (low-pages-value (and (boundp low-pages
)
31 (symbol-value low-pages
))))
32 `(let* ((cp ,codepoint
)
34 (high-index (aref ,high-pages cp-high
)))
36 (if (logbitp 15 high-index
)
40 (loop for x across high-pages-value
42 maximize
(clear-flag 15 x
))
44 (clear-flag 15 high-index
))
48 (loop for i below
(length low-pages-value
) by
2
50 (aref low-pages-value i
))
53 (* 2 (+ (ldb (byte 8 0) cp
)
57 (loop for x across high-pages-value
63 (setf (sb-xc:macro-function
'misc-index-from-char-code
)
65 (declare (ignore env
))
66 (funcall (cl:macro-function
'misc-index-from-char-code
) form nil
))))
69 (flet ((file (name type
)
70 (sb-cold:find-bootstrap-file
(format nil
"output/ucd/~A.~A" name type
)))
71 (make-ubn-vector (raw-bytes n
)
72 (aver (member n
'(1 2)))
73 (ubN-array-from-octets raw-bytes
`(unsigned-byte ,(* 8 n
)) n
)))
74 (let* ((misc-database (read-ub8-vector (file "ucdmisc" "dat")))
75 (ucd-high-pages (read-ub8-vector (file "ucdhigh" "dat")))
76 (ucd-low-pages (read-ub8-vector (file "ucdlow" "dat")))
77 (case-data (read-ub8-vector (file "case" "dat")))
78 (case-pages (read-ub8-vector (file "casepages" "dat")))
79 (high-pages (make-ubn-vector ucd-high-pages
2))
80 (low-pages (make-ubn-vector ucd-low-pages
2))
81 (%
*character-case-pages
*%
(make-ubn-vector case-pages
1)))
84 (defconstant-eqx sb-unicode
::+character-misc-database
+ ,misc-database
#'equalp
)
85 (defconstant-eqx sb-unicode
::+character-high-pages
+ ,high-pages
#'equalp
)
86 (defconstant-eqx sb-unicode
::+character-low-pages
+ ,low-pages
#'equalp
)
87 (defconstant-eqx +character-case-pages
+ ,%
*character-case-pages
*%
#'equalp
)
88 ,@(let* ((unicode-table
90 (* 64 (1+ (aref %
*character-case-pages
*%
91 (1- (length %
*character-case-pages
*%
)))))
93 (table (sb-xc:make-array
94 (* 2 (length unicode-table
))
95 :retain-specialization-for-after-xc-core t
96 :element-type
'(unsigned-byte 32)))
99 (length (length info
)))
100 (labels ((read-codepoint ()
101 (let* ((b1 (aref info index
))
102 (b2 (aref info
(incf index
)))
103 (b3 (aref info
(incf index
))))
106 (dpb b2
(byte 8 8) b3
))))
107 (read-length-tagged ()
108 (let ((len (aref info index
)) ret
)
114 (push (read-codepoint) ret
))
117 (both-case-p-local (code)
118 (logbitp 7 (aref misc-database
119 (+ 5 (misc-index-from-char-code
120 code high-pages low-pages
))))))
122 until
(>= index length
)
123 do
(let* ((key (read-codepoint))
124 (upper (read-length-tagged))
125 (lower (read-length-tagged))
126 (page (aref %
*character-case-pages
*%
(ash key -
6)))
127 (i (+ (ash page
6) (ldb (byte 6 0) key
))))
128 (setf (aref unicode-table i
)
129 (if (or (consp upper
)
132 (dpb upper
(byte 21 21) lower
)))
133 (when (and (atom upper
)
135 ;; Some characters are only equal under unicode rules,
136 ;; e.g. #\MICRO_SIGN and #\GREEK_CAPITAL_LETTER_MU
138 (both-case-p-local lower
)
140 (both-case-p-local upper
))
141 (setf (aref table
(* i
2)) lower
142 (aref table
(1+ (* i
2))) upper
)))))
143 `((defconstant-eqx +character-unicode-cases
+ ,unicode-table
#'equalp
)
144 (defconstant-eqx +character-cases
+ ,table
#'equalp
))))))))
147 ;;;; UCD accessor functions
149 ;;; The character database is made of several arrays.
150 ;;; +CHARACTER-MISC-DATABASE+ is an array of bytes that encode character
151 ;;; attributes. Each entry in the misc database is +MISC-WIDTH+ (currently 9)
152 ;;; bytes wide. Within each entry, the bytes represent: general category, BIDI
153 ;;; class, canonical combining class, digit value, decomposition info, other
154 ;;; flags, script, line break class, and age, respectively. Several of the
155 ;;; entries have additional information encoded in them at the bit level. The
156 ;;; digit value field is equal to 128 (has only its high bit set) if characters
157 ;;; with that set of attribute are not digits. Bit 6 is set if that entry
158 ;;; encodes decimal digits, that is, characters that are DIGIT-CHAR-P. The rest
159 ;;; of the value is the digit value of characters with that entry. Decomposition
160 ;;; info contains the length of the decomposition of characters with that entry,
161 ;;; and also sets its high bit if the decompositions are compatibility
162 ;;; decompositions. The other flags byte encodes boolean properties. Bit 7 is
163 ;;; set if the entry's characters are BOTH-CASE-P in the Common Lisp sense. Bit
164 ;;; 6 is set if the entry's characters have a defined case transformation in
165 ;;; Unicode. Bit 5 is set if the characters have the property BIDI_Mirrored=Y.
166 ;;; Bits 3-0 encode the entry's East Asian Width. Bit 4 is unused. Age stores
167 ;;; the minor version in bits 0-2, and the major version in the remaining 5
170 ;;; To find which entry in +CHARACTER-MISC-DATABASE+ encodes a
171 ;;; character's attributes, first index +CHARACTER-HIGH-PAGES+ (an
172 ;;; array of 16-bit values) with the high 13 bits of the character's
173 ;;; codepoint. If the result value has its high bit set, the character
174 ;;; is in a "compressed page", where all characters on that
175 ;;; 256-character page have the same misc entry. To find the misc
176 ;;; entry number, simply clear the high bit. If the high bit is not
177 ;;; set, the misc entry number must be looked up in
178 ;;; +CHARACTER-LOW-PAGES+, which is an array of 16-bit values. Each
179 ;;; entry in the array consists of two such values, the misc entry
180 ;;; number and the decomposition index. To find the misc entry number,
181 ;;; index into +CHARACTER-LOW-PAGES+ using the value retreived from
182 ;;; +CHARACTER-HIGH-PAGES+ (shifted left 8 bits) plus the low 8 bits
183 ;;; of the codepoint, all times two to account for the widtth of the
184 ;;; entries. The value in +CHARACTER-LOW-PAGES+ at this point is the
185 ;;; misc entry number. To transform a misc entry number into an index
186 ;;; into +CHARACTER-MISC-DATABASE+, multiply it by +MISC-WIDTH+. This
187 ;;; gives the index of the start of the charater's misc entry in
188 ;;; +CHARACTER-MISC-DATABASE+.
190 ;;; To look up a character's decomposition, first retreive its
191 ;;; decomposition-info from the misc database as described above. If the
192 ;;; decomposition info is not 0, the character has a decomposition with a
193 ;;; length given by the decomposition info with the high bit (which indicates
194 ;;; compatibility/canonical status) cleared. To find the decomposition, move
195 ;;; one value past the character's misc entry number in
196 ;;; **CHARACTER-LOW-DATABASE**, which gives an index into
197 ;;; **CHARACTER-DECOMPOSITIONS**. The next LENGTH values in
198 ;;; **CHARACTER-DECOMPOSITIONS** (an array of codepoints), starting at this
199 ;;; index, are the decomposition of the character. This proceduce does not
200 ;;; apply to Hangul syllables, which have their own decomposition algorithm.
202 ;;; Case information is stored in +CHARACTER-UNICODE-CASES+, an array that
203 ;;; indirectly maps a character's codepoint to (cons uppercase
204 ;;; lowercase). Uppercase and lowercase are either a single codepoint,
205 ;;; which is the upper- or lower-case of the given character, or a
206 ;;; list of codepoints which taken as a whole are the upper- or
207 ;;; lower-case. These case lists are only used in Unicode case
208 ;;; transformations, not in Common Lisp ones.
210 ;;; +CHARACTER-CASES+ is similar to the above but it stores codes in
211 ;;; a flat array twice as large, and it includes only the standard casing rules,
212 ;;; so there's always just two characters.
214 ;;; Primary composition information is stored in a hash table local to
215 ;;; PRIMARY-COMPOSITION, with (+ (ash codepoint1 21) codepoint2) as
216 ;;; keys and the composition as the value
218 (defun misc-index (char)
219 (misc-index-from-char-code (char-code char
)
220 sb-unicode
::+character-high-pages
+
221 sb-unicode
::+character-low-pages
+))
223 (aver (csubtypep (global-ftype 'misc-index
)
224 (specifier-type '(sfunction (t) (unsigned-byte 16)))))
225 (proclaim `(ftype ,(type-specifier (global-ftype 'misc-index
)) misc-index
))
227 (declaim (ftype (sfunction (t) (unsigned-byte 8)) ucd-general-category
)
228 (inline ucd-general-category
))
229 (defun ucd-general-category (char)
230 (aref sb-unicode
::+character-misc-database
+ (misc-index char
)))
232 (defun ucd-decimal-digit (char)
233 (let ((digit (aref sb-unicode
::+character-misc-database
+
234 (+ 3 (misc-index char
)))))
235 (when (logbitp 6 digit
) ; decimalp flag
236 (ldb (byte 4 0) digit
))))
237 (proclaim `(ftype ,(type-specifier (global-ftype 'ucd-decimal-digit
)) ucd-decimal-digit
))
239 (defun char-code (char)
240 "Return the integer code of CHAR."
243 (defun char-int (char)
244 "Return the integer code of CHAR. (In SBCL this is the same as CHAR-CODE, as
245 there are no character bits or fonts.)"
248 (defun code-char (code)
249 "Return the character with the code CODE."
252 (defun character (object)
253 "Coerce OBJECT into a CHARACTER if possible. Legal inputs are characters,
254 strings and symbols of length 1."
255 (flet ((do-error (control args
)
256 (error 'simple-type-error
258 ;;?? how to express "symbol with name of length 1"?
259 :expected-type
'(or character
(string 1))
260 :format-control control
261 :format-arguments args
)))
264 (string (if (= 1 (length (the string object
)))
267 "String is not of length one: ~S" (list object
))))
268 (symbol (if (= 1 (length (symbol-name object
)))
269 (schar (symbol-name object
) 0)
271 "Symbol name is not of length one: ~S" (list object
))))
272 (t (do-error "~S cannot be coerced to a character." (list object
))))))
276 (defun standard-char-p (char)
277 "The argument must be a character object. STANDARD-CHAR-P returns T if the
278 argument is a standard character -- one of the 95 ASCII printing characters or
280 (let ((n (char-code char
)))
284 (defun %standard-char-p
(thing)
285 "Return T if and only if THING is a standard-char. Differs from
286 STANDARD-CHAR-P in that THING doesn't have to be a character."
287 (and (characterp thing
) (standard-char-p thing
)))
289 (defun graphic-char-p (char)
290 "The argument must be a character object. GRAPHIC-CHAR-P returns T if the
291 argument is a printing character (space through ~ in ASCII), otherwise returns
293 (let ((n (char-code char
)))
297 (defun alpha-char-p (char)
298 "The argument must be a character object. ALPHA-CHAR-P returns T if the
299 argument is an alphabetic character, A-Z or a-z; otherwise NIL."
300 (< (ucd-general-category char
) 5))
302 (defmacro with-case-info
((char index-var cases-var
304 (cases +character-cases
+))
306 (let ((code-var (gensym "CODE"))
307 (shifted-var (gensym "SHIFTED"))
308 (page-var (gensym "PAGE")))
311 (declare (optimize (sb-c:insert-array-bounds-checks
0)))
312 (let ((,code-var
(char-code ,char
)))
313 (let* ((,shifted-var
(ash ,code-var -
6))
314 (,page-var
(if (>= ,shifted-var
,(length +character-case-pages
+))
316 (aref ,+character-case-pages
+ ,shifted-var
))))
317 (if (= ,page-var
255)
319 (let ((,index-var
(* (+ (ash ,page-var
6)
320 (ldb (byte 6 0) ,code-var
))
325 (defun both-case-p (char)
326 "The argument must be a character object. BOTH-CASE-P returns T if the
327 argument is an alphabetic character and if the character exists in both upper
328 and lower case. For ASCII, this is the same as ALPHA-CHAR-P."
329 (with-case-info (char index cases
)
330 (plusp (aref cases index
))))
332 (defun upper-case-p (char)
333 "The argument must be a character object; UPPER-CASE-P returns T if the
334 argument is an upper-case character, NIL otherwise."
335 (with-case-info (char index cases
)
336 (= (aref cases
(1+ index
))
339 (defun lower-case-p (char)
340 "The argument must be a character object; LOWER-CASE-P returns T if the
341 argument is a lower-case character, NIL otherwise."
342 (with-case-info (char index cases
)
343 (= (aref cases index
)
346 (defun char-upcase (char)
347 "Return CHAR converted to upper-case if that is possible. Don't convert
348 lowercase eszet (U+DF)."
349 (with-case-info (char index cases
351 (let ((code (aref cases
(1+ index
))))
356 (defun char-downcase (char)
357 "Return CHAR converted to lower-case if that is possible."
358 (with-case-info (char index cases
360 (let ((code (aref cases index
)))
365 (defun alphanumericp (char)
366 "Given a character-object argument, ALPHANUMERICP returns T if the argument
367 is either numeric or alphabetic."
368 (let ((gc (ucd-general-category char
)))
372 ;;; EQUAL-CHAR-CODE is used by the following functions as a version of CHAR-INT
373 ;;; which loses font, bits, and case info.
375 ;;; Return a cons with (upper-case . lower-case), where it either can
376 ;;; be a character code or a list of character codes if the character
377 ;;; donwcases or upcases into multiple characters.
378 (declaim (inline char-case-info
))
379 (defun char-case-info (character)
380 (let* ((code (char-code character
))
381 (page (aref +character-case-pages
+ (ash code -
6))))
382 ;; Pages with 255 means the character is not both-case.
383 ;; +CHARACTER-CASES+ has 0 for those characters.
384 (aref +character-unicode-cases
+
386 (ldb (byte 6 0) code
)))))
388 ;;; Returns the downcased code or the character code
389 (declaim (inline equal-char-code
))
390 (defun equal-char-code (char)
391 (let* ((code (char-code char
))
392 (shifted (ash code -
6))
393 (page (if (>= shifted
(length +character-case-pages
+))
394 (return-from equal-char-code code
)
395 (aref #.
+character-case-pages
+ shifted
))))
398 (let* ((page (truly-the (integer 0
399 #.
(loop for x across
+character-case-pages
+
404 (aref #.
+character-cases
+
406 (ldb (byte 6 0) code
))
408 (if (zerop down-code
)
412 (declaim (inline two-arg-char-equal-inline
))
413 (defun two-arg-char-equal-inline (c1 c2
)
414 (flet ((base-char-equal-p ()
415 (let* ((code1 (char-code c1
))
416 (code2 (char-code c2
))
417 (sum (logxor code1 code2
)))
419 (let ((sum (+ code1 code2
)))
420 (or (and (< 161 sum
213))
421 (and (< 415 sum
461))
422 (and (< 463 sum
477))))))))
423 (declare (inline base-char-equal-p
))
430 (and (base-char-p c2
)
431 (base-char-equal-p)))
437 (with-case-info (c1 index cases
)
438 (or (= (aref cases index
) (char-code c2
)) ;; lower case
439 (= (aref cases
(1+ index
)) (char-code c2
))))))))
441 ;;; There are transforms on two-arg-char-equal, don't make it inlinable itself.
442 (defun two-arg-char-equal (c1 c2
)
443 (two-arg-char-equal-inline c1 c2
))
445 (defun two-arg-char-not-equal (c1 c2
)
446 (not (two-arg-char-equal-inline c1 c2
)))
448 (macrolet ((def (name test doc
)
449 `(defun ,name
(character &rest more-characters
)
452 (do ((c character
(nth i more-characters
))
454 ((>= i
(length more-characters
)) t
)
455 (do-rest-arg ((c2) more-characters i
)
457 (return-from ,name nil
))))
458 ;; CHAR-NOT-EQUAL has explicit check attribute
459 (progn (the character character
) t
)))))
460 (def char
/= (eq c
(the character c2
))
461 "Return T if no two of the arguments are the same character.")
462 (def char-not-equal
(two-arg-char-equal c c2
)
463 "Return T if no two of the arguments are the same character.
466 (defun two-arg-char-lessp (c1 c2
)
467 (< (equal-char-code c1
) (equal-char-code c2
)))
469 (defun two-arg-char-greaterp (c1 c2
)
470 (> (equal-char-code c1
) (equal-char-code c2
)))
472 (defun two-arg-char-not-greaterp (c1 c2
)
473 (<= (equal-char-code c1
) (equal-char-code c2
)))
475 (defun two-arg-char-not-lessp (c1 c2
)
476 (>= (equal-char-code c1
) (equal-char-code c2
)))
478 (macrolet ((def (op test doc
&optional explicit-check
)
479 `(defun ,op
(character &rest more-characters
)
481 ,@(when explicit-check
`((declare (explicit-check))))
482 (let ((c1 character
))
483 (declare (character c1
))
484 (do-rest-arg ((c2 i
) more-characters
0 t
)
487 (return (do-rest-arg ((c) more-characters
(1+ i
))
488 (the character c
))))))))) ; for effect
490 (def char
= (eq c1
(the character c2
))
491 "Return T if all of the arguments are the same character.")
492 (def char
< (< (char-int c1
) (char-int c2
))
493 "Return T if the arguments are in strictly increasing alphabetic order.")
494 (def char
> (> (char-int c1
) (char-int c2
))
495 "Return T if the arguments are in strictly decreasing alphabetic order.")
496 (def char
<= (<= (char-int c1
) (char-int c2
))
497 "Return T if the arguments are in strictly non-decreasing alphabetic order.")
498 (def char
>= (>= (char-int c1
) (char-int c2
))
499 "Return T if the arguments are in strictly non-increasing alphabetic order.")
502 (def char-equal
(two-arg-char-equal c1 c2
)
503 "Return T if all of the arguments are the same character.
505 (def char-lessp
(two-arg-char-lessp c1 c2
)
506 "Return T if the arguments are in strictly increasing alphabetic order.
508 (def char-greaterp
(two-arg-char-greaterp c1 c2
)
509 "Return T if the arguments are in strictly decreasing alphabetic order.
511 (def char-not-greaterp
(two-arg-char-not-greaterp c1 c2
)
512 "Return T if the arguments are in strictly non-decreasing alphabetic order.
514 (def char-not-lessp
(two-arg-char-not-lessp c1 c2
)
515 "Return T if the arguments are in strictly non-increasing alphabetic order.
516 Case is ignored." t
))
519 (defun digit-char-p (char &optional
(radix 10.
))
520 "If char is a digit in the specified radix, returns the fixnum for which
521 that digit stands, else returns NIL."
522 (let ((code (char-code char
)))
523 (if (<= code
1632) ;; (loop for code from 127 when (digit-char-p (code-char code)) return code)
524 (let ((weight (- code
48)))
525 (cond ((minusp weight
) nil
)
527 ;; Special-case ASCII digits in decimal and smaller radices.
528 (if (< weight radix
) weight nil
))
529 ;; Digits 0 - 9 are used as is, since radix is larger.
530 ((< weight
10) weight
)
532 (let ((weight (logior #x20 code
))) ;; downcase ASCII characters.
533 (when (and (>= (decf weight
(- (char-code #\a) 10)) 10)
536 (let ((number (ucd-decimal-digit char
)))
537 (when (and number
(< number radix
))
540 (defun digit-char (weight &optional
(radix 10))
541 "All arguments must be integers. Returns a character object that represents
542 a digit of the given weight in the specified radix. Returns NIL if no such
544 (declare (explicit-check weight
))
545 (cond ((typep weight
'(and unsigned-byte fixnum
))
546 (and (< weight radix
)
547 (code-char (if (< weight
10) (+ 48 weight
) (+ 55 weight
)))))
549 (the unsigned-byte weight
)