Fix cross build.
[sbcl.git] / src / code / target-char.lisp
blob5cc824531cb7043d1a7ccb5f82853ad8853669c7
1 ;;;; character functions
3 ;;;; This software is part of the SBCL system. See the README file for
4 ;;;; more information.
5 ;;;;
6 ;;;; This software is derived from the CMU CL system, which was
7 ;;;; written at Carnegie Mellon University and released into the
8 ;;;; public domain. The software is in the public domain and is
9 ;;;; provided with absolutely no warranty. See the COPYING and CREDITS
10 ;;;; files for more information.
12 (in-package "SB-IMPL")
14 ;;; We compile some trivial character operations via inline expansion.
15 (declaim (inline standard-char-p graphic-char-p alpha-char-p
16 alphanumericp))
17 (declaim (maybe-inline upper-case-p lower-case-p both-case-p
18 digit-char-p))
20 (declaim (inline clear-flag))
21 (eval-when (:compile-toplevel :load-toplevel :execute)
22 (defun clear-flag (bit integer)
23 (logandc2 integer (ash 1 bit))))
25 (eval-when (:compile-toplevel)
26 (defconstant +misc-width+ 9)
27 (defmacro misc-index-from-char-code (codepoint high-pages low-pages)
28 (let ((high-pages-value (and (boundp high-pages)
29 (symbol-value high-pages)))
30 (low-pages-value (and (boundp low-pages)
31 (symbol-value low-pages))))
32 `(let* ((cp ,codepoint)
33 (cp-high (ash cp -8))
34 (high-index (aref ,high-pages cp-high)))
35 (* ,+misc-width+
36 (if (logbitp 15 high-index)
37 (truly-the
38 (integer 0
39 ,(if high-pages-value
40 (loop for x across high-pages-value
41 when (logbitp 15 x)
42 maximize (clear-flag 15 x))
43 '*))
44 (clear-flag 15 high-index))
45 (truly-the
46 (integer 0
47 ,(if low-pages-value
48 (loop for i below (length low-pages-value) by 2
49 maximize
50 (aref low-pages-value i))
51 '*))
52 (aref ,low-pages
53 (* 2 (+ (ldb (byte 8 0) cp)
54 (ash (truly-the
55 (integer 0
56 ,(if high-pages-value
57 (loop for x across high-pages-value
58 unless (logbitp 15 x)
59 maximize x)
60 '*))
61 high-index)
62 8))))))))))
63 (setf (sb-xc:macro-function 'misc-index-from-char-code)
64 (lambda (form env)
65 (declare (ignore env))
66 (funcall (cl:macro-function 'misc-index-from-char-code) form nil))))
68 (macrolet ((frob ()
69 (flet ((file (name type)
70 (sb-cold:find-bootstrap-file (format nil "output/ucd/~A.~A" name type)))
71 (make-ubn-vector (raw-bytes n)
72 (aver (member n '(1 2)))
73 (ubN-array-from-octets raw-bytes `(unsigned-byte ,(* 8 n)) n)))
74 (let* ((misc-database (read-ub8-vector (file "ucdmisc" "dat")))
75 (ucd-high-pages (read-ub8-vector (file "ucdhigh" "dat")))
76 (ucd-low-pages (read-ub8-vector (file "ucdlow" "dat")))
77 (case-data (read-ub8-vector (file "case" "dat")))
78 (case-pages (read-ub8-vector (file "casepages" "dat")))
79 (high-pages (make-ubn-vector ucd-high-pages 2))
80 (low-pages (make-ubn-vector ucd-low-pages 2))
81 (%*character-case-pages*% (make-ubn-vector case-pages 1)))
83 `(progn
84 (defconstant-eqx sb-unicode::+character-misc-database+ ,misc-database #'equalp)
85 (defconstant-eqx sb-unicode::+character-high-pages+ ,high-pages #'equalp)
86 (defconstant-eqx sb-unicode::+character-low-pages+ ,low-pages #'equalp)
87 (defconstant-eqx +character-case-pages+ ,%*character-case-pages*% #'equalp)
88 ,@(let* ((unicode-table
89 (make-array
90 (* 64 (1+ (aref %*character-case-pages*%
91 (1- (length %*character-case-pages*%)))))
92 :initial-element 0))
93 (table (sb-xc:make-array
94 (* 2 (length unicode-table))
95 :retain-specialization-for-after-xc-core t
96 :element-type '(unsigned-byte 32)))
97 (info case-data)
98 (index 0)
99 (length (length info)))
100 (labels ((read-codepoint ()
101 (let* ((b1 (aref info index))
102 (b2 (aref info (incf index)))
103 (b3 (aref info (incf index))))
104 (incf index)
105 (dpb b1 (byte 8 16)
106 (dpb b2 (byte 8 8) b3))))
107 (read-length-tagged ()
108 (let ((len (aref info index)) ret)
109 (incf index)
110 (cond ((zerop len)
111 (read-codepoint))
113 (dotimes (i len)
114 (push (read-codepoint) ret))
115 (nreverse ret)))))
116 #+sb-unicode
117 (both-case-p-local (code)
118 (logbitp 7 (aref misc-database
119 (+ 5 (misc-index-from-char-code
120 code high-pages low-pages))))))
121 (loop
122 until (>= index length)
123 do (let* ((key (read-codepoint))
124 (upper (read-length-tagged))
125 (lower (read-length-tagged))
126 (page (aref %*character-case-pages*% (ash key -6)))
127 (i (+ (ash page 6) (ldb (byte 6 0) key))))
128 (setf (aref unicode-table i)
129 (if (or (consp upper)
130 (consp lower))
131 (cons upper lower)
132 (dpb upper (byte 21 21) lower)))
133 (when (and (atom upper)
134 (atom lower)
135 ;; Some characters are only equal under unicode rules,
136 ;; e.g. #\MICRO_SIGN and #\GREEK_CAPITAL_LETTER_MU
137 #+sb-unicode
138 (both-case-p-local lower)
139 #+sb-unicode
140 (both-case-p-local upper))
141 (setf (aref table (* i 2)) lower
142 (aref table (1+ (* i 2))) upper)))))
143 `((defconstant-eqx +character-unicode-cases+ ,unicode-table #'equalp)
144 (defconstant-eqx +character-cases+ ,table #'equalp))))))))
145 (frob))
147 ;;;; UCD accessor functions
149 ;;; The character database is made of several arrays.
150 ;;; +CHARACTER-MISC-DATABASE+ is an array of bytes that encode character
151 ;;; attributes. Each entry in the misc database is +MISC-WIDTH+ (currently 9)
152 ;;; bytes wide. Within each entry, the bytes represent: general category, BIDI
153 ;;; class, canonical combining class, digit value, decomposition info, other
154 ;;; flags, script, line break class, and age, respectively. Several of the
155 ;;; entries have additional information encoded in them at the bit level. The
156 ;;; digit value field is equal to 128 (has only its high bit set) if characters
157 ;;; with that set of attribute are not digits. Bit 6 is set if that entry
158 ;;; encodes decimal digits, that is, characters that are DIGIT-CHAR-P. The rest
159 ;;; of the value is the digit value of characters with that entry. Decomposition
160 ;;; info contains the length of the decomposition of characters with that entry,
161 ;;; and also sets its high bit if the decompositions are compatibility
162 ;;; decompositions. The other flags byte encodes boolean properties. Bit 7 is
163 ;;; set if the entry's characters are BOTH-CASE-P in the Common Lisp sense. Bit
164 ;;; 6 is set if the entry's characters have a defined case transformation in
165 ;;; Unicode. Bit 5 is set if the characters have the property BIDI_Mirrored=Y.
166 ;;; Bits 3-0 encode the entry's East Asian Width. Bit 4 is unused. Age stores
167 ;;; the minor version in bits 0-2, and the major version in the remaining 5
168 ;;; bits.
170 ;;; To find which entry in +CHARACTER-MISC-DATABASE+ encodes a
171 ;;; character's attributes, first index +CHARACTER-HIGH-PAGES+ (an
172 ;;; array of 16-bit values) with the high 13 bits of the character's
173 ;;; codepoint. If the result value has its high bit set, the character
174 ;;; is in a "compressed page", where all characters on that
175 ;;; 256-character page have the same misc entry. To find the misc
176 ;;; entry number, simply clear the high bit. If the high bit is not
177 ;;; set, the misc entry number must be looked up in
178 ;;; +CHARACTER-LOW-PAGES+, which is an array of 16-bit values. Each
179 ;;; entry in the array consists of two such values, the misc entry
180 ;;; number and the decomposition index. To find the misc entry number,
181 ;;; index into +CHARACTER-LOW-PAGES+ using the value retreived from
182 ;;; +CHARACTER-HIGH-PAGES+ (shifted left 8 bits) plus the low 8 bits
183 ;;; of the codepoint, all times two to account for the widtth of the
184 ;;; entries. The value in +CHARACTER-LOW-PAGES+ at this point is the
185 ;;; misc entry number. To transform a misc entry number into an index
186 ;;; into +CHARACTER-MISC-DATABASE+, multiply it by +MISC-WIDTH+. This
187 ;;; gives the index of the start of the charater's misc entry in
188 ;;; +CHARACTER-MISC-DATABASE+.
190 ;;; To look up a character's decomposition, first retreive its
191 ;;; decomposition-info from the misc database as described above. If the
192 ;;; decomposition info is not 0, the character has a decomposition with a
193 ;;; length given by the decomposition info with the high bit (which indicates
194 ;;; compatibility/canonical status) cleared. To find the decomposition, move
195 ;;; one value past the character's misc entry number in
196 ;;; **CHARACTER-LOW-DATABASE**, which gives an index into
197 ;;; **CHARACTER-DECOMPOSITIONS**. The next LENGTH values in
198 ;;; **CHARACTER-DECOMPOSITIONS** (an array of codepoints), starting at this
199 ;;; index, are the decomposition of the character. This proceduce does not
200 ;;; apply to Hangul syllables, which have their own decomposition algorithm.
202 ;;; Case information is stored in +CHARACTER-UNICODE-CASES+, an array that
203 ;;; indirectly maps a character's codepoint to (cons uppercase
204 ;;; lowercase). Uppercase and lowercase are either a single codepoint,
205 ;;; which is the upper- or lower-case of the given character, or a
206 ;;; list of codepoints which taken as a whole are the upper- or
207 ;;; lower-case. These case lists are only used in Unicode case
208 ;;; transformations, not in Common Lisp ones.
210 ;;; +CHARACTER-CASES+ is similar to the above but it stores codes in
211 ;;; a flat array twice as large, and it includes only the standard casing rules,
212 ;;; so there's always just two characters.
214 ;;; Primary composition information is stored in a hash table local to
215 ;;; PRIMARY-COMPOSITION, with (+ (ash codepoint1 21) codepoint2) as
216 ;;; keys and the composition as the value
218 (defun misc-index (char)
219 (misc-index-from-char-code (char-code char)
220 sb-unicode::+character-high-pages+
221 sb-unicode::+character-low-pages+))
223 (aver (csubtypep (global-ftype 'misc-index)
224 (specifier-type '(sfunction (t) (unsigned-byte 16)))))
225 (proclaim `(ftype ,(type-specifier (global-ftype 'misc-index)) misc-index))
227 (declaim (ftype (sfunction (t) (unsigned-byte 8)) ucd-general-category)
228 (inline ucd-general-category))
229 (defun ucd-general-category (char)
230 (aref sb-unicode::+character-misc-database+ (misc-index char)))
232 (defun ucd-decimal-digit (char)
233 (let ((digit (aref sb-unicode::+character-misc-database+
234 (+ 3 (misc-index char)))))
235 (when (logbitp 6 digit) ; decimalp flag
236 (ldb (byte 4 0) digit))))
237 (proclaim `(ftype ,(type-specifier (global-ftype 'ucd-decimal-digit)) ucd-decimal-digit))
239 (defun char-code (char)
240 "Return the integer code of CHAR."
241 (char-code char))
243 (defun char-int (char)
244 "Return the integer code of CHAR. (In SBCL this is the same as CHAR-CODE, as
245 there are no character bits or fonts.)"
246 (char-code char))
248 (defun code-char (code)
249 "Return the character with the code CODE."
250 (code-char code))
252 (defun character (object)
253 "Coerce OBJECT into a CHARACTER if possible. Legal inputs are characters,
254 strings and symbols of length 1."
255 (flet ((do-error (control args)
256 (error 'simple-type-error
257 :datum object
258 ;;?? how to express "symbol with name of length 1"?
259 :expected-type '(or character (string 1))
260 :format-control control
261 :format-arguments args)))
262 (typecase object
263 (character object)
264 (string (if (= 1 (length (the string object)))
265 (char object 0)
266 (do-error
267 "String is not of length one: ~S" (list object))))
268 (symbol (if (= 1 (length (symbol-name object)))
269 (schar (symbol-name object) 0)
270 (do-error
271 "Symbol name is not of length one: ~S" (list object))))
272 (t (do-error "~S cannot be coerced to a character." (list object))))))
274 ;;;; predicates
276 (defun standard-char-p (char)
277 "The argument must be a character object. STANDARD-CHAR-P returns T if the
278 argument is a standard character -- one of the 95 ASCII printing characters or
279 <return>."
280 (let ((n (char-code char)))
281 (or (< 31 n 127)
282 (= n 10))))
284 (defun %standard-char-p (thing)
285 "Return T if and only if THING is a standard-char. Differs from
286 STANDARD-CHAR-P in that THING doesn't have to be a character."
287 (and (characterp thing) (standard-char-p thing)))
289 (defun graphic-char-p (char)
290 "The argument must be a character object. GRAPHIC-CHAR-P returns T if the
291 argument is a printing character (space through ~ in ASCII), otherwise returns
292 NIL."
293 (let ((n (char-code char)))
294 (or (< 31 n 127)
295 (< 159 n))))
297 (defun alpha-char-p (char)
298 "The argument must be a character object. ALPHA-CHAR-P returns T if the
299 argument is an alphabetic character, A-Z or a-z; otherwise NIL."
300 (< (ucd-general-category char) 5))
302 (defmacro with-case-info ((char index-var cases-var
303 &key miss-value
304 (cases +character-cases+))
305 &body body)
306 (let ((code-var (gensym "CODE"))
307 (shifted-var (gensym "SHIFTED"))
308 (page-var (gensym "PAGE")))
309 `(block nil
310 (locally
311 (declare (optimize (sb-c:insert-array-bounds-checks 0)))
312 (let ((,code-var (char-code ,char)))
313 (let* ((,shifted-var (ash ,code-var -6))
314 (,page-var (if (>= ,shifted-var ,(length +character-case-pages+))
315 (return ,miss-value)
316 (aref ,+character-case-pages+ ,shifted-var))))
317 (if (= ,page-var 255)
318 ,miss-value
319 (let ((,index-var (* (+ (ash ,page-var 6)
320 (ldb (byte 6 0) ,code-var))
322 (,cases-var ,cases))
323 ,@body))))))))
325 (defun both-case-p (char)
326 "The argument must be a character object. BOTH-CASE-P returns T if the
327 argument is an alphabetic character and if the character exists in both upper
328 and lower case. For ASCII, this is the same as ALPHA-CHAR-P."
329 (with-case-info (char index cases)
330 (plusp (aref cases index))))
332 (defun upper-case-p (char)
333 "The argument must be a character object; UPPER-CASE-P returns T if the
334 argument is an upper-case character, NIL otherwise."
335 (with-case-info (char index cases)
336 (= (aref cases (1+ index))
337 (char-code char))))
339 (defun lower-case-p (char)
340 "The argument must be a character object; LOWER-CASE-P returns T if the
341 argument is a lower-case character, NIL otherwise."
342 (with-case-info (char index cases)
343 (= (aref cases index)
344 (char-code char))))
346 (defun char-upcase (char)
347 "Return CHAR converted to upper-case if that is possible. Don't convert
348 lowercase eszet (U+DF)."
349 (with-case-info (char index cases
350 :miss-value char)
351 (let ((code (aref cases (1+ index))))
352 (if (zerop code)
353 char
354 (code-char code)))))
356 (defun char-downcase (char)
357 "Return CHAR converted to lower-case if that is possible."
358 (with-case-info (char index cases
359 :miss-value char)
360 (let ((code (aref cases index)))
361 (if (zerop code)
362 char
363 (code-char code)))))
365 (defun alphanumericp (char)
366 "Given a character-object argument, ALPHANUMERICP returns T if the argument
367 is either numeric or alphabetic."
368 (let ((gc (ucd-general-category char)))
369 (or (< gc 5)
370 (= gc 13))))
372 ;;; EQUAL-CHAR-CODE is used by the following functions as a version of CHAR-INT
373 ;;; which loses font, bits, and case info.
375 ;;; Return a cons with (upper-case . lower-case), where it either can
376 ;;; be a character code or a list of character codes if the character
377 ;;; donwcases or upcases into multiple characters.
378 (declaim (inline char-case-info))
379 (defun char-case-info (character)
380 (let* ((code (char-code character))
381 (page (aref +character-case-pages+ (ash code -6))))
382 ;; Pages with 255 means the character is not both-case.
383 ;; +CHARACTER-CASES+ has 0 for those characters.
384 (aref +character-unicode-cases+
385 (+ (ash page 6)
386 (ldb (byte 6 0) code)))))
388 ;;; Returns the downcased code or the character code
389 (declaim (inline equal-char-code))
390 (defun equal-char-code (char)
391 (let* ((code (char-code char))
392 (shifted (ash code -6))
393 (page (if (>= shifted (length +character-case-pages+))
394 (return-from equal-char-code code)
395 (aref #.+character-case-pages+ shifted))))
396 (if (= page 255)
397 code
398 (let* ((page (truly-the (integer 0
399 #.(loop for x across +character-case-pages+
400 unless (= x 255)
401 maximize x))
402 page))
403 (down-code
404 (aref #.+character-cases+
405 (* (+ (ash page 6)
406 (ldb (byte 6 0) code))
407 2))))
408 (if (zerop down-code)
409 code
410 down-code)))))
412 (declaim (inline two-arg-char-equal-inline))
413 (defun two-arg-char-equal-inline (c1 c2)
414 (flet ((base-char-equal-p ()
415 (let* ((code1 (char-code c1))
416 (code2 (char-code c2))
417 (sum (logxor code1 code2)))
418 (when (eql sum #x20)
419 (let ((sum (+ code1 code2)))
420 (or (and (< 161 sum 213))
421 (and (< 415 sum 461))
422 (and (< 463 sum 477))))))))
423 (declare (inline base-char-equal-p))
424 (cond ((eq c1 c2))
425 #-sb-unicode
427 (base-char-equal-p))
428 #+sb-unicode
429 ((base-char-p c1)
430 (and (base-char-p c2)
431 (base-char-equal-p)))
432 #+sb-unicode
433 ((base-char-p c2)
434 nil)
435 #+sb-unicode
437 (with-case-info (c1 index cases)
438 (or (= (aref cases index) (char-code c2)) ;; lower case
439 (= (aref cases (1+ index)) (char-code c2))))))))
441 ;;; There are transforms on two-arg-char-equal, don't make it inlinable itself.
442 (defun two-arg-char-equal (c1 c2)
443 (two-arg-char-equal-inline c1 c2))
445 (defun two-arg-char-not-equal (c1 c2)
446 (not (two-arg-char-equal-inline c1 c2)))
448 (macrolet ((def (name test doc)
449 `(defun ,name (character &rest more-characters)
450 ,doc
451 (if more-characters
452 (do ((c character (nth i more-characters))
453 (i 0 (1+ i)))
454 ((>= i (length more-characters)) t)
455 (do-rest-arg ((c2) more-characters i)
456 (when ,test
457 (return-from ,name nil))))
458 ;; CHAR-NOT-EQUAL has explicit check attribute
459 (progn (the character character) t)))))
460 (def char/= (eq c (the character c2))
461 "Return T if no two of the arguments are the same character.")
462 (def char-not-equal (two-arg-char-equal c c2)
463 "Return T if no two of the arguments are the same character.
464 Case is ignored."))
466 (defun two-arg-char-lessp (c1 c2)
467 (< (equal-char-code c1) (equal-char-code c2)))
469 (defun two-arg-char-greaterp (c1 c2)
470 (> (equal-char-code c1) (equal-char-code c2)))
472 (defun two-arg-char-not-greaterp (c1 c2)
473 (<= (equal-char-code c1) (equal-char-code c2)))
475 (defun two-arg-char-not-lessp (c1 c2)
476 (>= (equal-char-code c1) (equal-char-code c2)))
478 (macrolet ((def (op test doc &optional explicit-check)
479 `(defun ,op (character &rest more-characters)
480 ,doc
481 ,@(when explicit-check `((declare (explicit-check))))
482 (let ((c1 character))
483 (declare (character c1))
484 (do-rest-arg ((c2 i) more-characters 0 t)
485 (if ,test
486 (setq c1 c2)
487 (return (do-rest-arg ((c) more-characters (1+ i))
488 (the character c))))))))) ; for effect
489 ;; case-sensitive
490 (def char= (eq c1 (the character c2))
491 "Return T if all of the arguments are the same character.")
492 (def char< (< (char-int c1) (char-int c2))
493 "Return T if the arguments are in strictly increasing alphabetic order.")
494 (def char> (> (char-int c1) (char-int c2))
495 "Return T if the arguments are in strictly decreasing alphabetic order.")
496 (def char<= (<= (char-int c1) (char-int c2))
497 "Return T if the arguments are in strictly non-decreasing alphabetic order.")
498 (def char>= (>= (char-int c1) (char-int c2))
499 "Return T if the arguments are in strictly non-increasing alphabetic order.")
501 ;; case-insensitive
502 (def char-equal (two-arg-char-equal c1 c2)
503 "Return T if all of the arguments are the same character.
504 Case is ignored." t)
505 (def char-lessp (two-arg-char-lessp c1 c2)
506 "Return T if the arguments are in strictly increasing alphabetic order.
507 Case is ignored." t)
508 (def char-greaterp (two-arg-char-greaterp c1 c2)
509 "Return T if the arguments are in strictly decreasing alphabetic order.
510 Case is ignored." t)
511 (def char-not-greaterp (two-arg-char-not-greaterp c1 c2)
512 "Return T if the arguments are in strictly non-decreasing alphabetic order.
513 Case is ignored." t)
514 (def char-not-lessp (two-arg-char-not-lessp c1 c2)
515 "Return T if the arguments are in strictly non-increasing alphabetic order.
516 Case is ignored." t))
519 (defun digit-char-p (char &optional (radix 10.))
520 "If char is a digit in the specified radix, returns the fixnum for which
521 that digit stands, else returns NIL."
522 (let ((code (char-code char)))
523 (if (<= code 1632) ;; (loop for code from 127 when (digit-char-p (code-char code)) return code)
524 (let ((weight (- code 48)))
525 (cond ((minusp weight) nil)
526 ((<= radix 10.)
527 ;; Special-case ASCII digits in decimal and smaller radices.
528 (if (< weight radix) weight nil))
529 ;; Digits 0 - 9 are used as is, since radix is larger.
530 ((< weight 10) weight)
532 (let ((weight (logior #x20 code))) ;; downcase ASCII characters.
533 (when (and (>= (decf weight (- (char-code #\a) 10)) 10)
534 (< weight radix))
535 weight) ))))
536 (let ((number (ucd-decimal-digit char)))
537 (when (and number (< number radix))
538 number)))))
540 (defun digit-char (weight &optional (radix 10))
541 "All arguments must be integers. Returns a character object that represents
542 a digit of the given weight in the specified radix. Returns NIL if no such
543 character exists."
544 (declare (explicit-check weight))
545 (cond ((typep weight '(and unsigned-byte fixnum))
546 (and (< weight radix)
547 (code-char (if (< weight 10) (+ 48 weight) (+ 55 weight)))))
549 (the unsigned-byte weight)
550 nil)))