Remove some test noise. A drop in the ocean unfortunately.
[sbcl.git] / src / code / target-unicode.lisp
blob5faed41ba792159043ddbce5a3584e913b780418
1 ;;;; Unicode functions
3 ;;;; This software is part of the SBCL system. See the README file for
4 ;;;; more information.
5 ;;;;
6 ;;;; This software is derived from the CMU CL system, which was
7 ;;;; written at Carnegie Mellon University and released into the
8 ;;;; public domain. The software is in the public domain and is
9 ;;;; provided with absolutely no warranty. See the COPYING and CREDITS
10 ;;;; files for more information.
12 (in-package "SB!UNICODE")
;;; Vector consulted by NUMERIC-VALUE.  Its contents are read, at read
;;; time, from output/numerics.lisp-expr, located two directories above
;;; the file being compiled.
(declaim (type simple-vector **special-numerics**))
(sb!impl::defglobal **special-numerics**
  #.(let ((table-path (merge-pathnames
                       (make-pathname :directory '(:relative :up :up "output")
                                      :name "numerics"
                                      :type "lisp-expr")
                       sb!xc:*compile-file-truename*)))
      (with-open-file (in table-path
                          :direction :input
                          :element-type 'character)
        (read in))))
;;; Flat vector of (unsigned-byte 32) range bounds searched by CHAR-BLOCK
;;; through ORDERED-RANGES-POSITION.  Read at read time from
;;; output/blocks.lisp-expr and coerced to a specialized array.
(declaim (type (simple-array (unsigned-byte 32) (*)) **block-ranges**))
(sb!impl::defglobal **block-ranges**
  #.(let ((ranges-path (merge-pathnames
                        (make-pathname :directory '(:relative :up :up "output")
                                       :name "blocks"
                                       :type "lisp-expr")
                        sb!xc:*compile-file-truename*)))
      (sb!int:!coerce-to-specialized
       (with-open-file (in ranges-path
                           :direction :input
                           :element-type 'character)
         (read in))
       '(unsigned-byte 32))))
;;; Defines **PROPLIST-PROPERTIES**, **CONFUSABLES** and
;;; **BIDI-MIRRORING-GLYPHS** from tables read, at macroexpansion time,
;;; out of output/*.lisp-expr, and defines !UNICODE-PROPERTIES-COLD-INIT,
;;; which replaces each global's initial list value with a hash table
;;; built from that list.
(macrolet ((unicode-property-init ()
             (flet ((read-output-table (name)
                      ;; Read the first form stored in
                      ;; ../../output/NAME.lisp-expr, resolved against
                      ;; the file being compiled.
                      (with-open-file (stream
                                       (merge-pathnames
                                        (make-pathname
                                         :directory
                                         '(:relative :up :up "output")
                                         :name name :type "lisp-expr")
                                        sb!xc:*compile-file-truename*)
                                       :direction :input
                                       :element-type 'character)
                        (read stream))))
               (let ((proplist-dump (read-output-table "misc-properties"))
                     (confusable-sets (read-output-table "confusables"))
                     (bidi-mirroring-list (read-output-table "bidi-mirrors")))
                 `(progn
                    (sb!impl::defglobal **proplist-properties** ',proplist-dump)
                    (sb!impl::defglobal **confusables** ',confusable-sets)
                    (sb!impl::defglobal **bidi-mirroring-glyphs** ',bidi-mirroring-list)
                    (defun !unicode-properties-cold-init ()
                      ;; The property dump is walked two elements at a
                      ;; time: key, then value.
                      (let ((hash (make-hash-table)) (list ',proplist-dump))
                        (do ((k (car list) (car list)) (v (cadr list) (cadr list)))
                            ((not list) hash)
                          (setf (gethash k hash) v)
                          (setf list (cddr list)))
                        (setf **proplist-properties** hash))
                      ;; Every string of a confusable set maps to the
                      ;; first string of that set.
                      (let ((hash (make-hash-table :test #'equal)))
                        (loop for set in ',confusable-sets
                              for items = (mapcar #'(lambda (item)
                                                      (map 'simple-string
                                                           #'code-char item))
                                                  #!+sb-unicode set
                                                  #!-sb-unicode
                                                  ;; Drop items containing codes
                                                  ;; at or above CHAR-CODE-LIMIT.
                                                  (remove-if-not
                                                   #'(lambda (item)
                                                       (every
                                                        #'(lambda (x)
                                                            (< x sb!xc:char-code-limit))
                                                        item)) set))
                              do (dolist (i items)
                                   (setf (gethash i hash) (first items))))
                        (setf **confusables** hash))
                      ;; The mirroring list holds (code mirrored-code) pairs.
                      (let ((hash (make-hash-table)) (list ',bidi-mirroring-list))
                        (loop for (k v) in list do
                          (setf (gethash k hash) v))
                        (setf **bidi-mirroring-glyphs** hash))))))))
  (unicode-property-init))
110 ;;; Unicode property access
;;; Binary search over VECTOR, a simple-vector holding inclusive ranges
;;; laid out flat as low0 high0 low1 high1 ...; the search assumes the
;;; ranges are sorted in ascending order.
;;; Returns ITEM if it lies inside one of the ranges, NIL otherwise.
(defun ordered-ranges-member (item vector)
  (declare (type simple-vector vector)
           (type fixnum item)
           (optimize speed))
  (labels ((recurse (start end)
             ;; START and END count ranges (pairs), not vector elements.
             (declare (type index start end))
             (when (< start end)
               (let* ((i (+ start (truncate (- end start) 2)))
                      (index (* 2 i))
                      (elt1 (svref vector index))
                      (elt2 (svref vector (1+ index))))
                 (declare (type index i)
                          (fixnum elt1 elt2))
                 (cond ((< item elt1)
                        (recurse start i))
                       ((> item elt2)
                        (recurse (+ 1 i) end))
                       (t
                        item))))))
    (recurse 0 (truncate (length vector) 2))))
;; Returns which range `item` was found in or NIL
;; First range = 0, second range = 1 ...
;; VECTOR holds inclusive ranges laid out flat as low0 high0 low1 high1 ...
;; and is binary-searched, so the ranges are assumed to be sorted.
(defun ordered-ranges-position (item vector)
  (declare (type (simple-array (unsigned-byte 32) (*)) vector)
           (type fixnum item))
  (labels ((recurse (start end)
             ;; START and END count ranges (pairs), not vector elements.
             (declare (type index start end))
             (when (< start end)
               (let* ((i (+ start (truncate (- end start) 2)))
                      (index (* 2 i))
                      (elt1 (aref vector index))
                      (elt2 (aref vector (1+ index))))
                 (declare (type index i))
                 (cond ((< item elt1)
                        (recurse start i))
                       ((> item elt2)
                        (recurse (+ 1 i) end))
                       (t
                        i))))))
    (recurse 0 (truncate (length vector) 2))))
(defun proplist-p (character property)
  #!+sb-doc
  "Returns true if CHARACTER has the specified PROPERTY and NIL otherwise.
PROPERTY is a keyword naming one of the properties from PropList.txt,
with underscores replaced by dashes."
  (let ((code (char-code character))
        (ranges (gethash property **proplist-properties**)))
    (ordered-ranges-member code ranges)))
;; WARNING: These have to be manually kept in sync with the values in ucd.lisp
;; Each of these vectors maps a small integer index to a keyword; the
;; accessor functions in this file look entries up by position, so the
;; order of the elements is significant.
(declaim (type simple-vector *general-categories* *bidi-classes* *east-asian-widths*
               *scripts* *line-break-classes* *blocks*))
;; General category keywords.  GENERAL-CATEGORY indexes this vector with
;; the value returned by SB!IMPL::UCD-GENERAL-CATEGORY.
(sb!impl::defglobal *general-categories*
  #(:Lu :Ll :Lt :Lm :Lo :Cc :Cf :Co :Cs :Cn :Mc :Me :Mn :Nd
    :Nl :No :Pc :Pd :Pe :Pf :Pi :Po :Ps :Sc :Sk :Sm :So :Zl
    :Zp :Zs))
;; Bidirectional class keywords.  BIDI-CLASS indexes this vector with a
;; value read from **CHARACTER-MISC-DATABASE**.
(sb!impl::defglobal *bidi-classes*
  #(:BN :AL :AN :B :CS :EN :ES :ET :L :LRE :LRO :NSM :ON
    :PDF :R :RLE :RLO :S :WS :LRI :RLI :FSI :PDI))
;; East Asian Width keywords.  EAST-ASIAN-WIDTH indexes this vector with
;; the low three bits of a **CHARACTER-MISC-DATABASE** entry.
(sb!impl::defglobal *east-asian-widths*
  #(:N :A :H :W :F :Na))
;; Script keywords.  SCRIPT indexes this vector with a value read from
;; **CHARACTER-MISC-DATABASE**; index 0 is :Unknown.
(sb!impl::defglobal *scripts*
  #(:Unknown :Common :Latin :Greek :Cyrillic :Armenian :Hebrew :Arabic :Syriac
    :Thaana :Devanagari :Bengali :Gurmukhi :Gujarati :Oriya :Tamil :Telugu
    :Kannada :Malayalam :Sinhala :Thai :Lao :Tibetan :Myanmar :Georgian :Hangul
    :Ethiopic :Cherokee :Canadian-Aboriginal :Ogham :Runic :Khmer :Mongolian
    :Hiragana :Katakana :Bopomofo :Han :Yi :Old-Italic :Gothic :Deseret
    :Inherited :Tagalog :Hanunoo :Buhid :Tagbanwa :Limbu :Tai-Le :Linear-B
    :Ugaritic :Shavian :Osmanya :Cypriot :Braille :Buginese :Coptic :New-Tai-Lue
    :Glagolitic :Tifinagh :Syloti-Nagri :Old-Persian :Kharoshthi :Balinese
    :Cuneiform :Phoenician :Phags-Pa :Nko :Sundanese :Lepcha :Ol-Chiki :Vai
    :Saurashtra :Kayah-Li :Rejang :Lycian :Carian :Lydian :Cham :Tai-Tham
    :Tai-Viet :Avestan :Egyptian-Hieroglyphs :Samaritan :Lisu :Bamum :Javanese
    :Meetei-Mayek :Imperial-Aramaic :Old-South-Arabian :Inscriptional-Parthian
    :Inscriptional-Pahlavi :Old-Turkic :Kaithi :Batak :Brahmi :Mandaic :Chakma
    :Meroitic-Cursive :Meroitic-Hieroglyphs :Miao :Sharada :Sora-Sompeng
    :Takri :Bassa-Vah :Mahajani :Pahawh-Hmong :Caucasian-Albanian :Manichaean
    :Palmyrene :Duployan :Mende-Kikakui :Pau-Cin-Hau :Elbasan :Modi
    :Psalter-Pahlavi :Grantha :Mro :Siddham :Khojki :Nabataean :Tirhuta
    :Khudawadi :Old-North-Arabian :Warang-Citi :Linear-A :Old-Permic))
;; Line break class keywords.  LINE-BREAK-CLASS indexes this vector with
;; a value read from **CHARACTER-MISC-DATABASE**.
(sb!impl::defglobal *line-break-classes*
  #(:XX :AI :AL :B2 :BA :BB :BK :CB :CJ :CL :CM :CP :CR :EX :GL
    :HL :HY :ID :IN :IS :LF :NL :NS :NU :OP :PO :PR :QU :RI :SA
    :SG :SP :SY :WJ :ZW))
;; Block name keywords.  CHAR-BLOCK indexes this vector with the range
;; number that ORDERED-RANGES-POSITION finds in **BLOCK-RANGES**, so the
;; Nth keyword names the Nth range of that table.
(sb!impl::defglobal *blocks*
  #(:Basic-Latin :Latin-1-Supplement :Latin-Extended-A :Latin-Extended-B
    :IPA-Extensions :Spacing-Modifier-Letters :Combining-Diacritical-Marks
    :Greek-and-Coptic :Cyrillic :Cyrillic-Supplement :Armenian :Hebrew :Arabic
    :Syriac :Arabic-Supplement :Thaana :NKo :Samaritan :Mandaic
    :Arabic-Extended-A :Devanagari :Bengali :Gurmukhi :Gujarati :Oriya :Tamil
    :Telugu :Kannada :Malayalam :Sinhala :Thai :Lao :Tibetan :Myanmar :Georgian
    :Hangul-Jamo :Ethiopic :Ethiopic-Supplement :Cherokee
    :Unified-Canadian-Aboriginal-Syllabics :Ogham :Runic :Tagalog :Hanunoo
    :Buhid :Tagbanwa :Khmer :Mongolian
    :Unified-Canadian-Aboriginal-Syllabics-Extended :Limbu :Tai-Le :New-Tai-Lue
    :Khmer-Symbols :Buginese :Tai-Tham :Combining-Diacritical-Marks-Extended
    :Balinese :Sundanese :Batak :Lepcha :Ol-Chiki :Sundanese-Supplement
    :Vedic-Extensions :Phonetic-Extensions :Phonetic-Extensions-Supplement
    :Combining-Diacritical-Marks-Supplement :Latin-Extended-Additional
    :Greek-Extended :General-Punctuation :Superscripts-and-Subscripts
    :Currency-Symbols :Combining-Diacritical-Marks-for-Symbols
    :Letterlike-Symbols :Number-Forms :Arrows :Mathematical-Operators
    :Miscellaneous-Technical :Control-Pictures :Optical-Character-Recognition
    :Enclosed-Alphanumerics :Box-Drawing :Block-Elements :Geometric-Shapes
    :Miscellaneous-Symbols :Dingbats :Miscellaneous-Mathematical-Symbols-A
    :Supplemental-Arrows-A :Braille-Patterns :Supplemental-Arrows-B
    :Miscellaneous-Mathematical-Symbols-B :Supplemental-Mathematical-Operators
    :Miscellaneous-Symbols-and-Arrows :Glagolitic :Latin-Extended-C :Coptic
    :Georgian-Supplement :Tifinagh :Ethiopic-Extended :Cyrillic-Extended-A
    :Supplemental-Punctuation :CJK-Radicals-Supplement :Kangxi-Radicals
    :Ideographic-Description-Characters :CJK-Symbols-and-Punctuation :Hiragana
    :Katakana :Bopomofo :Hangul-Compatibility-Jamo :Kanbun :Bopomofo-Extended
    :CJK-Strokes :Katakana-Phonetic-Extensions :Enclosed-CJK-Letters-and-Months
    :CJK-Compatibility :CJK-Unified-Ideographs-Extension-A
    :Yijing-Hexagram-Symbols :CJK-Unified-Ideographs :Yi-Syllables :Yi-Radicals
    :Lisu :Vai :Cyrillic-Extended-B :Bamum :Modifier-Tone-Letters
    :Latin-Extended-D :Syloti-Nagri :Common-Indic-Number-Forms :Phags-pa
    :Saurashtra :Devanagari-Extended :Kayah-Li :Rejang :Hangul-Jamo-Extended-A
    :Javanese :Myanmar-Extended-B :Cham :Myanmar-Extended-A :Tai-Viet
    :Meetei-Mayek-Extensions :Ethiopic-Extended-A :Latin-Extended-E
    :Meetei-Mayek :Hangul-Syllables :Hangul-Jamo-Extended-B :High-Surrogates
    :High-Private-Use-Surrogates :Low-Surrogates :Private-Use-Area
    :CJK-Compatibility-Ideographs :Alphabetic-Presentation-Forms
    :Arabic-Presentation-Forms-A :Variation-Selectors :Vertical-Forms
    :Combining-Half-Marks :CJK-Compatibility-Forms :Small-Form-Variants
    :Arabic-Presentation-Forms-B :Halfwidth-and-Fullwidth-Forms :Specials
    :Linear-B-Syllabary :Linear-B-Ideograms :Aegean-Numbers
    :Ancient-Greek-Numbers :Ancient-Symbols :Phaistos-Disc :Lycian :Carian
    :Coptic-Epact-Numbers :Old-Italic :Gothic :Old-Permic :Ugaritic :Old-Persian
    :Deseret :Shavian :Osmanya :Elbasan :Caucasian-Albanian :Linear-A
    :Cypriot-Syllabary :Imperial-Aramaic :Palmyrene :Nabataean :Phoenician
    :Lydian :Meroitic-Hieroglyphs :Meroitic-Cursive :Kharoshthi
    :Old-South-Arabian :Old-North-Arabian :Manichaean :Avestan
    :Inscriptional-Parthian :Inscriptional-Pahlavi :Psalter-Pahlavi :Old-Turkic
    :Rumi-Numeral-Symbols :Brahmi :Kaithi :Sora-Sompeng :Chakma :Mahajani
    :Sharada :Sinhala-Archaic-Numbers :Khojki :Khudawadi :Grantha :Tirhuta
    :Siddham :Modi :Takri :Warang-Citi :Pau-Cin-Hau :Cuneiform
    :Cuneiform-Numbers-and-Punctuation :Egyptian-Hieroglyphs :Bamum-Supplement
    :Mro :Bassa-Vah :Pahawh-Hmong :Miao :Kana-Supplement :Duployan
    :Shorthand-Format-Controls :Byzantine-Musical-Symbols :Musical-Symbols
    :Ancient-Greek-Musical-Notation :Tai-Xuan-Jing-Symbols
    :Counting-Rod-Numerals :Mathematical-Alphanumeric-Symbols :Mende-Kikakui
    :Arabic-Mathematical-Alphabetic-Symbols :Mahjong-Tiles :Domino-Tiles
    :Playing-Cards :Enclosed-Alphanumeric-Supplement
    :Enclosed-Ideographic-Supplement :Miscellaneous-Symbols-and-Pictographs
    :Emoticons :Ornamental-Dingbats :Transport-and-Map-Symbols
    :Alchemical-Symbols :Geometric-Shapes-Extended :Supplemental-Arrows-C
    :CJK-Unified-Ideographs-Extension-B :CJK-Unified-Ideographs-Extension-C
    :CJK-Unified-Ideographs-Extension-D :CJK-Compatibility-Ideographs-Supplement
    :Tags :Variation-Selectors-Supplement :Supplementary-Private-Use-Area-A
    :Supplementary-Private-Use-Area-B))
;;; SVREF that yields NIL, rather than signalling, when INDEX is at or
;;; beyond the end of VECTOR.
(declaim (inline svref-or-null))
(defun svref-or-null (vector index)
  (when (< index (length vector))
    (svref vector index)))
(defun general-category (character)
  #!+sb-doc
  "Returns the general category of CHARACTER, as listed in UnicodeData.txt,
as a keyword."
  (let ((category-index (sb!impl::ucd-general-category character)))
    (svref-or-null *general-categories* category-index)))
(defun bidi-class (character)
  #!+sb-doc
  "Returns the bidirectional class of CHARACTER as a keyword.
Unassigned (general category :Cn) characters that are default-ignorable
are reported as :BN."
  (if (and (eql (general-category character) :Cn)
           (default-ignorable-p character))
      :Bn
      (svref-or-null
       *bidi-classes*
       (aref **character-misc-database** (1+ (misc-index character))))))
(defun combining-class (character)
  #!+sb-doc
  "Returns the canonical combining class (CCC) of CHARACTER"
  (let ((ccc-slot (+ 2 (misc-index character))))
    (aref **character-misc-database** ccc-slot)))
(defun decimal-value (character)
  #!+sb-doc
  "Returns the decimal digit value associated with CHARACTER or NIL if
there is no such value.

The only characters in Unicode with a decimal digit value are those
that are part of a range of characters that encode the digits 0-9.
Because of this, `(decimal-value c) <=> (digit-char-p c 10)` in
#+sb-unicode builds"
  (let ((value (sb!impl::ucd-decimal-digit character)))
    value))
(defun digit-value (character)
  #!+sb-doc
  "Returns the Unicode digit value of CHARACTER or NIL if it doesn't exist.

Digit values are guaranteed to be integers between 0 and 9 inclusive.
All characters with decimal digit values have the same digit value,
but there are characters (such as digits of number systems without a 0 value)
that have a digit value but no decimal digit value"
  (let* ((entry (aref **character-misc-database**
                      (+ 3 (misc-index character))))
         ;; Flag bit 6 is masked off before the value is inspected.
         (digit (clear-flag 6 entry)))
    ;; A stored value of 10 or more means there is no digit value.
    (when (< digit 10)
      digit)))
(defun numeric-value (character)
  #!+sb-doc
  "Returns the numeric value of CHARACTER or NIL if there is no such value.
Numeric value is the most general of the Unicode numeric properties.
The only constraint on the numeric value is that it be a rational number."
  ;; The special-numerics table takes precedence; the digit value is
  ;; only the fallback.
  (let ((special (double-vector-binary-search (char-code character)
                                              **special-numerics**)))
    (if special
        special
        (digit-value character))))
(defun mirrored-p (character)
  #!+sb-doc
  "Returns T if CHARACTER needs to be mirrored in bidirectional text.
Otherwise, returns NIL."
  (let ((flags (aref **character-misc-database**
                     (+ 5 (misc-index character)))))
    ;; Bit 5 of this entry is the mirrored flag.
    (logbitp 5 flags)))
(defun bidi-mirroring-glyph (character)
  #!+sb-doc
  "Returns the mirror image of CHARACTER if it exists.
Otherwise, returns NIL."
  ;; The table is only consulted for characters flagged as mirrored.
  (let ((mirror-code (and (mirrored-p character)
                          (gethash (char-code character)
                                   **bidi-mirroring-glyphs**))))
    (and mirror-code
         (code-char mirror-code))))
(defun east-asian-width (character)
  #!+sb-doc
  "Returns the East Asian Width property of CHARACTER as
one of the keywords :N (Neutral), :A (Ambiguous), :H (Halfwidth),
:W (Wide), :F (Fullwidth), or :NA (Narrow)"
  (let* ((entry (aref **character-misc-database**
                      (+ 5 (misc-index character))))
         ;; The width index occupies the low three bits of the entry.
         (width-index (ldb (byte 3 0) entry)))
    (svref-or-null *east-asian-widths* width-index)))
(defun script (character)
  #!+sb-doc
  "Returns the Script property of CHARACTER as a keyword.
If CHARACTER does not have a known script, returns :UNKNOWN"
  (let ((script-index (aref **character-misc-database**
                            (+ 6 (misc-index character)))))
    (svref-or-null *scripts* script-index)))
(defun char-block (character)
  #!+sb-doc
  "Returns the Unicode block in which CHARACTER resides as a keyword.
If CHARACTER does not have a known block, returns :NO-BLOCK"
  (let ((block-index (ordered-ranges-position (char-code character)
                                              **block-ranges**)))
    (if (null block-index)
        :no-block
        (aref *blocks* block-index))))
(defun unicode-1-name (character)
  #!+sb-doc
  "Returns the name assigned to CHARACTER in Unicode 1.0 if it is distinct
from the name currently assigned to CHARACTER. Otherwise, returns NIL.
This property has been officially obsoleted by the Unicode standard, and
is only included for backwards compatibility."
  ;; The database stores Huffman-encoded names keyed by character code.
  (let ((encoded-name (double-vector-binary-search
                       (char-code character)
                       **unicode-1-char-name-database**)))
    (and encoded-name
         (huffman-decode encoded-name
                         **unicode-character-name-huffman-tree**))))
(defun age (character)
  #!+sb-doc
  "Returns the version of Unicode in which CHARACTER was assigned as a pair
of values, both integers, representing the major and minor version respectively.
If CHARACTER is not assigned in Unicode, returns NIL for both values."
  (let ((packed (aref **character-misc-database**
                      (+ 8 (misc-index character)))))
    ;; The minor version is the low three bits and the major version the
    ;; remaining high bits; zero means no age is recorded.
    (if (zerop packed)
        (values nil nil)
        (values (ash packed -3)
                (ldb (byte 3 0) packed)))))
(defun hangul-syllable-type (character)
  #!+sb-doc
  "Returns the Hangul syllable type of CHARACTER.
The syllable type can be one of :L, :V, :T, :LV, or :LVT.
If the character is not a Hangul syllable or Jamo, returns NIL"
  (let ((cp (char-code character)))
    (cond
      ;; Leading consonants: Hangul Jamo and Hangul Jamo Extended-A.
      ((or (<= #x1100 cp #x115f)
           (<= #xa960 cp #xa97c))
       :L)
      ;; Vowels: Hangul Jamo and Hangul Jamo Extended-B.
      ((or (<= #x1160 cp #x11a7)
           (<= #xd7b0 cp #xd7c6))
       :V)
      ;; Trailing consonants.  The Extended-B run starts at U+D7CB;
      ;; U+D7C7..U+D7CA are unassigned and have no syllable type.
      ((or (<= #x11a8 cp #x11ff)
           (<= #xd7cb cp #xd7fb))
       :T)
      ;; Precomposed syllables: every 28th one has no trailing consonant.
      ((<= #xac00 cp #xd7a3)
       (if (= 0 (rem (- cp #xac00) 28)) :LV :LVT)))))
(defun line-break-class (character &key resolve)
  #!+sb-doc
  "Returns the line breaking class of CHARACTER, as specified in UAX #14.
If :RESOLVE is NIL, returns the character class found in the property file.
If :RESOLVE is non-NIL, certain line-breaking classes will be mapped to other
classes as specified in the applicable standards. Additionally, if :RESOLVE
is :EAST-ASIAN, Ambiguous (class :AI) characters will be mapped to the
Ideographic (:ID) class instead of Alphabetic (:AL)."
  ;; When resolving, CHARACTER may also be a list, whose first element is
  ;; then used, or NIL, for which the keyword :NIL is returned.
  (when (and resolve (listp character)) (setf character (car character)))
  (when (and resolve (not character)) (return-from line-break-class :nil))
  (let ((raw-class
          (svref-or-null *line-break-classes*
                         (aref **character-misc-database** (+ 7 (misc-index character)))))
        (syllable-type (hangul-syllable-type character)))
    ;; Hangul characters get their class from the syllable type rather
    ;; than from the database.
    (when syllable-type
      (setf raw-class
            (cdr (assoc syllable-type
                        '((:l . :JL) (:v . :JV) (:t . :JT)
                          (:lv . :H2) (:lvt . :H3))))))
    (when resolve
      (setf raw-class
            (case raw-class
              (:ai (if (eql resolve :east-asian) :ID :AL))
              ; If we see :CM when resolving, we have a CM that isn't subject
              ; to LB9, so we do LB10
              ((:xx :cm) :al)
              (:sa (if (member (general-category character) '(:Mn :Mc))
                       :CM :AL))
              (:cj :ns)
              (:sg (error "The character ~S is a surrogate, which should not
appear in an SBCL string. The line-breaking behavior of surrogates is undefined."
                          character))
              (t raw-class))))
    raw-class))
(defun uppercase-p (character)
  #!+sb-doc
  "Returns true if CHARACTER has the Unicode property Uppercase and NIL
otherwise"
  ;; Uppercase = general category Lu, or listed as Other_Uppercase.
  (if (eql (general-category character) :Lu)
      t
      (proplist-p character :other-uppercase)))
(defun lowercase-p (character)
  #!+sb-doc
  "Returns true if CHARACTER has the Unicode property Lowercase and NIL
otherwise"
  ;; Lowercase = general category Ll, or listed as Other_Lowercase.
  (if (eql (general-category character) :Ll)
      t
      (proplist-p character :other-lowercase)))
(defun cased-p (character)
  #!+sb-doc
  "Returns true if CHARACTER has a (Unicode) case, and NIL otherwise"
  ;; Cased = uppercase, lowercase, or titlecase (general category Lt).
  (cond ((uppercase-p character))
        ((lowercase-p character))
        (t (eql (general-category character) :Lt))))
(defun case-ignorable-p (character)
  #!+sb-doc
  "Returns true if CHARACTER is Case Ignorable as defined in Unicode 6.3,
Chapter 3, and NIL otherwise"
  ;; Case ignorable = general category Mn, Me, Cf, Lm or Sk, or word
  ;; break class MidLetter, MidNumLet or Single_Quote.
  (or (member (general-category character)
              '(:Mn :Me :Cf :Lm :Sk))
      (member (word-break-class character)
              '(:midletter :midnumlet :single-quote))))
(defun alphabetic-p (character)
  #!+sb-doc
  "Returns true if CHARACTER is Alphabetic according to the Unicode standard
and NIL otherwise"
  ;; Alphabetic = any letter category or Nl, or listed as Other_Alphabetic.
  (let ((category (general-category character)))
    (or (member category '(:Lu :Ll :Lt :Lm :Lo :Nl))
        (proplist-p character :other-alphabetic))))
(defun ideographic-p (character)
  #!+sb-doc
  "Returns true if CHARACTER has the Unicode property Ideographic,
which loosely corresponds to the set of \"Chinese characters\""
  (let ((property :ideographic))
    (proplist-p character property)))
(defun math-p (character)
  #!+sb-doc
  "Returns true if CHARACTER is a mathematical symbol according to Unicode and
NIL otherwise"
  ;; Math = general category Sm, or listed as Other_Math.
  (if (eql (general-category character) :sm)
      t
      (proplist-p character :other-math)))
(defun whitespace-p (character)
  #!+sb-doc
  "Returns true if CHARACTER is whitespace according to Unicode
and NIL otherwise"
  (let ((property :white-space))
    (proplist-p character property)))
(defun hex-digit-p (character &key ascii)
  #!+sb-doc
  "Returns true if CHARACTER is a hexadecimal digit and NIL otherwise.
If :ASCII is non-NIL, fullwidth equivalents of the Latin letters A through F
are excluded."
  (let ((property (if ascii
                      :ascii-hex-digit
                      :hex-digit)))
    (proplist-p character property)))
(defun soft-dotted-p (character)
  #!+sb-doc
  "Returns true if CHARACTER has a soft dot (such as the dots on i and j)
which disappears when accents are placed on top of it, and NIL otherwise"
  (let ((property :soft-dotted))
    (proplist-p character property)))
(defun default-ignorable-p (character)
  #!+sb-doc
  "Returns T if CHARACTER is a Default_Ignorable_Code_Point"
  (and
   ;; Candidates: Other_Default_Ignorable_Code_Point, general category
   ;; Cf, or Variation_Selector ...
   (or (proplist-p character :other-default-ignorable-code-point)
       (eql (general-category character) :cf)
       (proplist-p character :variation-selector))
   ;; ... excluding whitespace ...
   (not (whitespace-p character))
   ;; ... and excluding the code points in these inclusive ranges.
   (not (ordered-ranges-member
         (char-code character)
         #(#x0600 #x0604 #x06DD #x06DD #x070F #x070F #xFFF9 #xFFFB
           #x110BD #x110BD)))))
522 ;;; Implements UAX#15: Normalization Forms
;;; Returns two values taken from CHAR's decomposition entry in the misc
;;; database: the entry with bit 7 cleared, and whether bit 7 was set.
;;; DECOMPOSE-CHAR uses the first as the decomposition length and the
;;; second as the compatibility flag.
(defun char-decomposition-info (char)
  (let* ((slot (+ 4 (misc-index char)))
         (entry (aref **character-misc-database** slot)))
    (values (clear-flag 7 entry)
            (logbitp 7 entry))))
;;; Calls CALLBACK once for each character of the one-step decomposition
;;; of CHAR, in order.  LENGTH is the number of characters in that
;;; decomposition.
(defun char-decomposition (char length callback)
  (declare (function callback))
  ;; Caller should have gotten length from char-decomposition-info
  (let* ((cp (char-code char))
         (cp-high (ash cp -8))
         (decompositions **character-decompositions**)
         (high-page (aref **character-high-pages** cp-high))
         (index (unless (logbitp 15 high-page) ;; Hangul syllable
                  (aref **character-low-pages**
                        (+ 1 (* 2 (+ (ldb (byte 8 0) cp) (ash high-page 8))))))))
    (cond ((= length 1)
           (funcall callback (code-char (aref decompositions index))))
          ((<= #xac00 cp #xd7a3)
           ;; see Unicode 6.2, section 3-12
           ;; Hangul syllables are decomposed arithmetically rather than
           ;; through the decomposition table.
           (let* ((sbase #xac00)
                  (lbase #x1100)
                  (vbase #x1161)
                  (tbase #x11a7)
                  (vcount 21)
                  (tcount 28)
                  (ncount (* vcount tcount))
                  (sindex (- cp sbase))
                  (lindex (floor sindex ncount))
                  (vindex (floor (mod sindex ncount) tcount))
                  (tindex (mod sindex tcount)))
             (funcall callback (code-char (+ lbase lindex)))
             (funcall callback (code-char (+ vbase vindex)))
             ;; A zero TINDEX means there is no trailing consonant.
             (when (> tindex 0)
               (funcall callback (code-char (+ tbase tindex))))))

          (t
           (loop for i below length
                 do
                 (funcall callback (code-char (aref decompositions (+ index i)))))))))
;;; Feeds the decomposition of CHAR to CALLBACK, one character at a time.
;;; With COMPATIBILITY true, every decomposition is applied and the
;;; pieces are themselves decomposed recursively; otherwise only
;;; decompositions not flagged as compatibility ones are applied, one
;;; level deep.  A character with no applicable decomposition is passed
;;; to CALLBACK unchanged.
(defun decompose-char (char compatibility callback)
  (declare (function callback))
  (multiple-value-bind (info compat) (char-decomposition-info char)
    (cond ((not (and (plusp info)
                     (or compatibility
                         (not compat))))
           (funcall callback char))
          (compatibility
           (dx-flet ((expand-piece (piece)
                       (decompose-char piece t callback)))
             (char-decomposition char info #'expand-piece)))
          (t
           (char-decomposition char info callback)))))
;;; Returns a fresh string holding the decomposition of STRING.  KIND is
;;; :CANONICAL (the default) or :COMPATIBILITY.  Decomposed characters
;;; are collected most-recent-first in CHARS; a combining character whose
;;; class is lower than that of the character pushed before it is
;;; inserted further back so that combining classes end up sorted.
(defun decompose-string (string &optional (kind :canonical))
  (let ((compatibility (ecase kind
                         (:compatibility t)
                         (:canonical nil))))
    (let (chars
          (length 0)
          (previous-combining-class 0))
      (dx-flet ((callback (char)
                  (let ((combining-class (combining-class char)))
                    (incf length)
                    (cond ((< 0 combining-class previous-combining-class)
                           ;; Ensure it's sorted
                           ;; CHARS is reversed, so walking it moves
                           ;; backwards through the output; CHAR goes
                           ;; right after the first cons whose successor
                           ;; is missing or has a class no greater than
                           ;; CHAR's.
                           (loop for cons on chars
                                 for next-char = (cadr cons)
                                 when (or (not next-char)
                                          (<= 0 (combining-class next-char) combining-class))
                                 do (setf (cdr cons)
                                          (cons char (cdr cons)))
                                    (return)))
                          (t
                           (push char chars)
                           (setf previous-combining-class combining-class))))))
        (loop for char across string
              do
              (decompose-char char compatibility #'callback))
        (let ((result (make-string length)))
          (loop for char in (nreverse chars)
                for i from 0
                do (setf (schar result i) char))
          result)))))
;;; Classifies the code point CP for Hangul composition: :L, :V or :T for
;;; the jamo ranges tested below, :LV or :LVT for a precomposed syllable,
;;; and NIL for anything else.
(defun composition-hangul-syllable-type (cp)
  (let ((syllable-base #xac00)
        (syllable-last #.(+ #xac00 11171)))
    (cond
      ((<= #x1100 cp #x1112) :L)
      ((<= #x1161 cp #x1175) :V)
      ((<= #x11a8 cp #x11c2) :T)
      ((<= syllable-base cp syllable-last)
       ;; Every 28th syllable has no trailing consonant.
       (if (zerop (rem (- cp syllable-base) 28))
           :LV
           :LVT)))))
;;; Returns the character that CHAR1 followed by CHAR2 composes to, or
;;; NIL if the pair does not compose.  The composition table is tried
;;; first, then the arithmetic Hangul L+V and LV+T compositions.
(defun primary-composition (char1 char2)
  (let* ((c1 (char-code char1))
         (c2 (char-code char2))
         (composed-code
           (cond
             ;; Table key: C1 in bits 21-41, C2 in the low 21 bits.
             ((gethash (dpb c1 (byte 21 21) c2)
                       **character-primary-compositions**))
             ((and (eql (composition-hangul-syllable-type c1) :L)
                   (eql (composition-hangul-syllable-type c2) :V))
              (let ((lindex (- c1 #x1100))
                    (vindex (- c2 #x1161)))
                (+ #xac00 (* lindex 588) (* vindex 28))))
             ((and (eql (composition-hangul-syllable-type c1) :LV)
                   (eql (composition-hangul-syllable-type c2) :T))
              (+ c1 (- c2 #x11a7))))))
    (and composed-code
         (code-char composed-code))))
635 ;;; This implements a sequence data structure, specialized for
636 ;;; efficient deletion of characters at an index, along with tolerable
637 ;;; random access. The purpose is to support the canonical
638 ;;; composition algorithm from Unicode, which involves replacing (not
639 ;;; necessarily consecutive) pairs of code points with a single code
640 ;;; point (e.g. [#\e #\combining_acute_accent] with
641 ;;; #\latin_small_letter_e_with_acute). The data structure is a list
642 ;;; of three-element lists, each denoting a chunk of string data
643 ;;; starting at the first index and ending at the second.
645 ;;; Actually, the implementation isn't particularly efficient, and
646 ;;; would probably benefit from being rewritten in terms of displaced
647 ;;; arrays, which would substantially reduce copying.
649 ;;; (also, generic sequences. *sigh*.)
;;; Returns the element at logical position INDEX of LSTRING, or NIL if
;;; no chunk covers INDEX.  Each chunk is a list (START END DATA) and
;;; covers the positions from START up to, but not including, END.
(defun lref (lstring index)
  (let ((chunk (find-if (lambda (candidate)
                          (and (<= (first candidate) index)
                               (< index (second candidate))))
                        lstring)))
    (when chunk
      (aref (third chunk) (- index (first chunk))))))
;;; Stores NEWCHAR at logical position INDEX of LSTRING by writing into
;;; the data of the chunk that covers INDEX.  Returns NEWCHAR, or NIL
;;; (storing nothing) if no chunk covers INDEX.
(defun (setf lref) (newchar lstring index)
  (let ((chunk (find-if (lambda (candidate)
                          (and (<= (first candidate) index)
                               (< index (second candidate))))
                        lstring)))
    (when chunk
      (setf (aref (third chunk) (- index (first chunk))) newchar))))
;;; The logical length of LSTRING: the END index of its last chunk
;;; (NIL when LSTRING is empty).
(defun llength (lstring)
  (let ((final-chunk (car (last lstring))))
    (second final-chunk)))
;;; Flattens LSTRING into a fresh string by copying each chunk's data
;;; into the positions the chunk covers.
(defun lstring (lstring)
  (let ((flat (make-string (llength lstring))))
    (loop for (start end data) in lstring
          do (replace flat data :start1 start :end1 end))
    flat))
;;; Returns a new chunk list equal to LSTRING with the element at logical
;;; position INDEX removed.  Chunks before the affected one are reused,
;;; the affected chunk is replaced by one or two chunks holding copies of
;;; its remaining data, and later chunks are rebuilt with both indices
;;; shifted down by one (their data is shared, not copied).
(defun ldelete (lstring index)
  (do* ((ls lstring (cdr ls))
        (l (car ls) (car ls))
        so-fars)
       ;; Stop at the chunk that covers INDEX.
       ((and (<= (first l) index)
             (< index (second l)))
        (append
         (nreverse so-fars)
         (cond
           ;; Deleting the chunk's first element.
           ((= (first l) index)
            (list (list (first l) (1- (second l)) (subseq (third l) 1))))
           ;; Deleting the chunk's last element.
           ((= index (1- (second l)))
            (list (list (first l) (1- (second l)) (subseq (third l) 0 (1- (length (third l)))))))
           ;; Deleting from the middle splits the chunk in two.
           (t
            (list
             (list (first l) index
                   (subseq (third l) 0 (- index (first l))))
             (list index (1- (second l))
                   (subseq (third l) (1+ (- index (first l))))))))
         (mapcar (lambda (x) (list (1- (first x)) (1- (second x)) (third x)))
                 (cdr ls))))
    (push l so-fars)))
;;; Composes STRING by repeatedly merging the most recent starter (a
;;; character of combining class 0) with a following character whenever
;;; PRIMARY-COMPOSITION yields a result.  Returns STRING itself when
;;; nothing was composed, otherwise a fresh string.
;;; NOTE: the initial chunk shares STRING's storage, so the first
;;; composition written through (SETF LREF) modifies STRING in place.
(defun canonically-compose (string)
  (let* ((result (list (list 0 (length string) string)))
         (previous-starter-index (position 0 string :key #'combining-class))
         (i (and previous-starter-index (1+ previous-starter-index))))
    ;; Nothing to do without a starter, or with nothing after it.
    (when (or (not i) (= i (length string)))
      (return-from canonically-compose string))
    (tagbody
     again
       (when (and (>= (- i previous-starter-index) 2)
                  ;; test for Blocked (Unicode 3.11 para. D115)
                  ;;
                  ;; (assumes here that string has sorted combiners,
                  ;; so can look back just one step)
                  (>= (combining-class (lref result (1- i)))
                      (combining-class (lref result i))))
         (when (= (combining-class (lref result i)) 0)
           (setf previous-starter-index i))
         (incf i)
         (go next))

       (let ((comp (primary-composition (lref result previous-starter-index)
                                        (lref result i))))
         (cond
           (comp
            ;; Replace the starter by the composite and drop the
            ;; character at I; I then designates the next character.
            (setf (lref result previous-starter-index) comp)
            (setf result (ldelete result i)))
           (t
            (when (= (combining-class (lref result i)) 0)
              (setf previous-starter-index i))
            (incf i))))
     next
       (unless (= i (llength result))
         (go again)))
    ;; I only reaches the original length if nothing was ever deleted.
    (if (= i (length string))
        string
        (lstring result))))
(defun normalize-string (string &optional (form :nfd))
  #!+sb-doc
  "Normalize STRING to the Unicode normalization form FORM.
Acceptable values for FORM are :NFD, :NFC, :NFKD, and :NFKC"
  (declare (type (member :nfd :nfkd :nfc :nfkc) form))
  ;; Without sb-unicode, the composed forms return STRING unchanged and
  ;; the decomposed forms signal an error.
  #!-sb-unicode
  (etypecase string
    ((array nil (*)) string)
    (string
     (ecase form
       ((:nfc :nfkc) string)
       ((:nfd :nfkd) (error "Cannot normalize to ~A form in #-SB-UNICODE builds" form)))))
  ;; With sb-unicode, base strings and (array nil (*)) strings are
  ;; returned unchanged.
  #!+sb-unicode
  (etypecase string
    (base-string string)
    ((array character (*))
     (ecase form
       (:nfd
        (decompose-string string :canonical))
       (:nfkd
        (decompose-string string :compatibility))
       (:nfc
        (canonically-compose (decompose-string string :canonical)))
       (:nfkc
        (canonically-compose (decompose-string string :compatibility)))))
    ((array nil (*)) string)))
(defun normalized-p (string &optional (form :nfd))
  #!+sb-doc
  "Tests if STRING is normalized to FORM"
  ;; FIXME: can be optimized
  ;; STRING is normalized and the result compared with the original.
  (let ((normal-form (normalize-string string form)))
    (string= string normal-form)))
764 ;;; Unicode case algorithms
765 ;; FIXME: Make these parts less redundant (macro?)
;;; Association list searched by CHAR-TITLECASE: the key is a character
;;; code and the value either a single code or a list of codes.  Read at
;;; read time from output/titlecases.lisp-expr.
(defparameter **special-titlecases**
  '#.(let ((table-path (merge-pathnames
                        (make-pathname :directory '(:relative :up :up "output")
                                       :name "titlecases"
                                       :type "lisp-expr")
                        sb!xc:*compile-file-truename*)))
       (with-open-file (in table-path
                           :direction :input
                           :element-type 'character)
         (read in))))
;;; Association list searched by CHAR-FOLDCASE: the key is a character
;;; code and the value either a single code or a list of codes.  Read at
;;; read time from output/foldcases.lisp-expr.
(defparameter **special-casefolds**
  '#.(let ((table-path (merge-pathnames
                        (make-pathname :directory '(:relative :up :up "output")
                                       :name "foldcases"
                                       :type "lisp-expr")
                        sb!xc:*compile-file-truename*)))
       (with-open-file (in table-path
                           :direction :input
                           :element-type 'character)
         (read in))))
(defun has-case-p (char)
  ;; Bit 6 is the Unicode case flag, as opposed to the Common Lisp one
  (let ((flags (aref **character-misc-database**
                     (+ 5 (misc-index char)))))
    (logbitp 6 flags)))
;;; Returns the uppercase mapping of CHAR as a list of characters.
;;; Characters without the Unicode case flag map to themselves.
(defun char-uppercase (char)
  (cond ((not (has-case-p char))
         (list char))
        (t
         ;; The CAR of the case info is one code or a list of codes.
         (let ((upper (car (char-case-info char))))
           (if (consp upper)
               (mapcar #'code-char upper)
               (list (code-char upper)))))))
;;; Returns the lowercase mapping of CHAR as a list of characters.
;;; Characters without the Unicode case flag map to themselves.
(defun char-lowercase (char)
  (cond ((not (has-case-p char))
         (list char))
        (t
         ;; The CDR of the case info is one code or a list of codes.
         (let ((lower (cdr (char-case-info char))))
           (if (consp lower)
               (mapcar #'code-char lower)
               (list (code-char lower)))))))
;;; Returns the titlecase mapping of CHAR as a list of characters: the
;;; entry in **SPECIAL-TITLECASES** if there is one, otherwise the
;;; uppercase mapping.  Characters without the Unicode case flag map to
;;; themselves.
(defun char-titlecase (char)
  (if (not (has-case-p char))
      (list char)
      (let ((entry (assoc (char-code char) **special-titlecases**)))
        (cond ((null entry)
               (char-uppercase char))
              ((atom (cdr entry))
               (list (code-char (cdr entry))))
              (t
               (mapcar #'code-char (cdr entry)))))))
;;; Returns the case-folded mapping of CHAR as a list of characters: the
;;; entry in **SPECIAL-CASEFOLDS** if there is one, otherwise the
;;; lowercase mapping.  Characters without the Unicode case flag map to
;;; themselves.
(defun char-foldcase (char)
  (if (not (has-case-p char))
      (list char)
      (let ((entry (assoc (char-code char) **special-casefolds**)))
        (cond ((null entry)
               (char-lowercase char))
              ((atom (cdr entry))
               (list (code-char (cdr entry))))
              (t
               (mapcar #'code-char (cdr entry)))))))
;;; Builds a string by mapping every character of STRING to a list of
;;; replacement characters.  SPECIAL-FN is called with the character, its
;;; index and the string length; if it returns NIL, FN is called with the
;;; character instead.  Occurrences of :NONE in the returned list are
;;; discarded, so a result of (:NONE) deletes the character.
(defun string-somethingcase (fn string special-fn)
  (let ((len (length string)))
    (coerce
     (loop for index from 0 below len
           for char = (char string index)
           for cased = (or (funcall special-fn char index len)
                           (funcall fn char))
           append (remove :none cased))
     'string)))
(declaim (type function sb!unix::posix-getenv))
(defun get-user-locale ()
  ;; Returns the language part of the user's locale as an upper-case keyword
  ;; (e.g. :TR for "tr_TR.UTF-8"), or NIL when neither LC_ALL nor LANG is set
  ;; (or on platforms that are neither win32 nor unix).
  (let ((raw-locale
          #!+(or win32 unix) (or (sb!unix::posix-getenv "LC_ALL")
                                 (sb!unix::posix-getenv "LANG"))
          #!-(or win32 unix) nil))
    (when raw-locale
      ;; POSIX locale names have the shape
      ;;   language[_territory][.codeset][@modifier]
      ;; so the language ends at the first of "_", "." or "@", not only at
      ;; "_".  Otherwise "tr.UTF-8" would yield :|TR.UTF-8| instead of :TR.
      (let* ((lang-end (position-if #'(lambda (c) (find c "_.@")) raw-locale))
             (lang-code (string-upcase (subseq raw-locale 0 lang-end))))
        ;; An empty language part carries no information; report it as NIL
        ;; rather than interning the empty keyword.
        (when (plusp (length lang-code))
          (intern lang-code "KEYWORD"))))))
(defun uppercase (string &key locale)
  #!+sb-doc
  "Returns the full uppercase of STRING according to the Unicode standard.
The result is not guaranteed to have the same length as the input. If :LOCALE
is NIL, no language-specific case transformations are applied. If :LOCALE is a
keyword naming a language (such as :TR, :AZ or :LT), the case transforms of
that locale are used. If :LOCALE is T, the user's current locale is used (Unix
and Win32 only)."
  (when (eq locale t) (setf locale (get-user-locale)))
  (string-somethingcase
   #'char-uppercase string
   #!-sb-unicode (constantly nil)
   #!+sb-unicode ;; code-char with a constant > 255 breaks the build
   #'(lambda (char index len)
       (declare (ignore len))
       (cond
         ;; :LT -- drop U+0307 when, scanning backwards over characters whose
         ;; combining class is neither 0 nor 230, the first class-0 character
         ;; found is soft-dotted.
         ((and (eql locale :lt)
               (char= char (code-char #x0307))
               (loop for pos downfrom (1- index) to 0
                     for prev = (char string pos)
                     for ccc = (combining-class prev)
                     when (eql ccc 0) return (soft-dotted-p prev)
                     when (eql ccc 230) return nil
                     finally (return nil)))
          '(:none))
         ;; :TR / :AZ -- "i" uppercases to U+0130.
         ((and (member locale '(:tr :az))
               (char= char #\i))
          (list (code-char #x0130)))))))
(defun lowercase (string &key locale)
  #!+sb-doc
  "Returns the full lowercase of STRING according to the Unicode standard.
The result is not guaranteed to have the same length as the input.
:LOCALE has the same semantics as the :LOCALE argument to UPPERCASE."
  (when (eq locale t) (setf locale (get-user-locale)))
  (string-somethingcase
   #'char-lowercase string
   #!-sb-unicode (constantly nil)
   #!+sb-unicode
   #'(lambda (char index len)
       (let ((code (char-code char)))
         (cond
           ;; U+03A3 becomes U+03C2 when a cased character precedes it and
           ;; none follows it, skipping case-ignorable characters both ways.
           ((and (eql code #x03A3)
                 (loop for pos downfrom (1- index) to 0
                       for prev = (char string pos)
                       do (cond ((cased-p prev) (return t))
                                ((not (case-ignorable-p prev)) (return nil)))
                       finally (return nil))
                 (loop for pos from (1+ index) below len
                       for next = (char string pos)
                       do (cond ((cased-p next) (return nil))
                                ((not (case-ignorable-p next)) (return t)))
                       finally (return t)))
            (list (code-char #x03C2)))
           ;; :LT -- fixed mappings first; the second table only applies
           ;; when a class-230 mark follows before any class-0 character.
           ((eql locale :lt)
            (let ((mapping
                    (or (assoc code
                               '((#x00CC . (#x0069 #x0307 #x0300))
                                 (#x00CD . (#x0069 #x0307 #x0301))
                                 (#x0128 . (#x0069 #x0307 #x0303))))
                        (and (loop for pos from (1+ index) below len
                                   for ccc = (combining-class (char string pos))
                                   when (eql ccc 230) return t
                                   when (eql ccc 0) return nil
                                   finally (return nil))
                             (assoc code
                                    '((#x0049 . (#x0069 #x0307))
                                      (#x004A . (#x006A #x0307))
                                      (#x012E . (#x012F #x0307))))))))
              ;; No mapping => NIL, so the generic lowercase is used.
              (mapcar #'code-char (cdr mapping))))
           ((member locale '(:tr :az))
            (cond
              ;; U+0130 lowercases to plain "i".
              ((eql code #x0130) (list #\i))
              ;; U+0307 is dropped when the nearest preceding class-0
              ;; character is "I" and no class-230 mark intervenes.
              ((and (eql code #x0307)
                    (loop for pos downfrom (1- index) to 0
                          for prev = (char string pos)
                          for ccc = (combining-class prev)
                          when (eql ccc 0) return (char= prev #\I)
                          when (eql ccc 230) return nil
                          finally (return nil)))
               '(:none))
              ;; "I" becomes U+0131 unless the next class-230 mark (before
              ;; any class-0 character) is U+0307.
              ((and (char= char #\I)
                    (loop for pos from (1+ index) below len
                          for next = (char string pos)
                          for ccc = (combining-class next)
                          when (eql ccc 0) return t
                          when (eql ccc 230)
                            return (char/= next (code-char #x0307))
                          finally (return t)))
               (list (code-char #x0131))))))))))
(defun titlecase (string &key locale)
  #!+sb-doc
  "Returns the titlecase of STRING. The resulting string can
be longer than the input.
:LOCALE has the same semantics as the :LOCALE argument to UPPERCASE."
  (when (eq locale t) (setf locale (get-user-locale)))
  (let ((words (words string))
        (cased nil))
    ;; Each word is split into PRE (everything before the first cased
    ;; character), INITIAL (that character, or the first character when the
    ;; word has no cased character) and REST.  INITIAL is titlecased and REST
    ;; is lowercased.
    ;; NOTE(review): LOCALE is not forwarded to LOWERCASE here -- confirm
    ;; whether that is intended.
    (loop for word in words
          for first-cased = (or (position-if #'cased-p word) 0)
          for pre = (subseq word 0 first-cased)
          for initial = (char word first-cased)
          for rest = (subseq word (1+ first-cased))
          do (let ((up (char-titlecase initial)) (down (lowercase rest)))
               #!+sb-unicode
               (when (and (or (eql locale :tr) (eql locale :az))
                          (eql initial #\i))
                 (setf up (list (code-char #x0130))))
               #!+sb-unicode
               (when (and (eql locale :lt)
                          (soft-dotted-p initial)
                          ;; First character of DOWN whose combining class is
                          ;; 0 or 230.  There may be none (e.g. DOWN is
                          ;; empty), so guard against a NIL position before
                          ;; calling CHAR.
                          (let ((mark (position-if
                                       #'(lambda (c)
                                           (or (eql (combining-class c) 0)
                                               (eql (combining-class c) 230)))
                                       down)))
                            (and mark
                                 (eql (char down mark) (code-char #x0307)))))
                 (setf down (delete (code-char #x0307) down :count 1)))
               (push (concatenate 'string pre up down) cased)))
    (apply #'concatenate 'string (nreverse cased))))
(defun casefold (string)
  #!+sb-doc
  "Returns the full casefolding of STRING according to the Unicode standard.
Casefolding removes case information in a way that allows the results to be
used for case-insensitive comparisons.
The result is not guaranteed to have the same length as the input."
  ;; Casefolding has no context- or locale-dependent special cases, so the
  ;; special-case hook always declines.
  (flet ((no-special-case (char index len)
           (declare (ignore char index len))
           nil))
    (string-somethingcase #'char-foldcase string #'no-special-case)))
;;; Unicode break algorithms
;;; In all the breaking methods:
;;; (brk) establishes a break between `first` and `second`
;;; (nobrk) prevents a break between `first` and `second`
;;; Setting flag=T/state=:nobrk-next prevents a break between `second` and `third`
;; WORDS binds this to T around its call to GRAPHEMES; while it is true,
;; GRAPHEME-BREAK-CLASS also classifies :Mc and :Cf characters (other than
;; U+200B..U+200D) as :EXTEND, which makes the word algorithm less tricky.
(defvar *other-break-special-graphemes* nil)
(defun grapheme-break-class (char)
  #!+sb-doc
  "Returns the grapheme breaking class of CHAR, as specified in UAX #29.
Returns NIL when CHAR is NIL."
  (when char
    (let ((cp (char-code char))
          (gc (general-category char))
          ;; Sorted code points that must not be classified :SPACING-MARK
          ;; (looked up with BINARY-SEARCH below).
          (not-spacing-mark
            #(#x102B #x102C #x1038 #x1062 #x1063 #x1064 #x1067 #x1068 #x1069
              #x106A #x106B #x106C #x106D #x1083 #x1087 #x1088 #x1089 #x108A
              #x108B #x108C #x108F #x109A #x109B #x109C #x19B0 #x19B1 #x19B2
              #x19B3 #x19B4 #x19B8 #x19B9 #x19BB #x19BC #x19BD #x19BE #x19BF
              #x19C0 #x19C8 #x19C9 #x1A61 #x1A63 #x1A64 #xAA7B #xAA7D)))
      (cond
        ((= cp 10) :LF)
        ((= cp 13) :CR)
        ((or (member gc '(:Mn :Me))
             (proplist-p char :other-grapheme-extend)
             ;; Extra :EXTEND members requested by the word breaker
             (and *other-break-special-graphemes*
                  (member gc '(:Mc :Cf))
                  (not (<= #x200B cp #x200D))))
         :extend)
        ((or (member gc '(:Zl :Zp :Cc :Cs :Cf))
             ;; From Cn and Default_Ignorable_Code_Point
             (eql cp #x2065)
             (eql cp #xE0000)
             (<= #xFFF0 cp #xFFF8)
             (<= #xE0002 cp #xE001F)
             (<= #xE0080 cp #xE00FF)
             (<= #xE01F0 cp #xE0FFF))
         :control)
        ((<= #x1F1E6 cp #x1F1FF)
         :regional-indicator)
        ((and (or (eql gc :Mc)
                  (eql cp #x0E33)
                  (eql cp #x0EB3))
              (not (binary-search cp not-spacing-mark)))
         :spacing-mark)
        ;; Anything else is classified by its Hangul syllable type
        (t (hangul-syllable-type char))))))
(defun graphemes (string)
  #!+sb-doc
  "Breaks STRING into graphemes according to the default
grapheme breaking rules specified in UAX #29, returning a list of strings."
  (let* ((chars (coerce string 'list))
         (clusters '())
         ;; Characters of the cluster being built, most recent first
         (cluster (list (car chars))))
    (do ((first (car chars) second)
         (tail (cdr chars) (when tail (cdr tail)))
         (second (cadr chars) (when tail (cadr tail))))
        ((not first)
         (nreverse (mapcar #'(lambda (l) (coerce l 'string)) clusters)))
      (flet ((brk ()
               ;; Close the current cluster and start a new one at SECOND
               (push (nreverse cluster) clusters)
               (setf cluster (list second)))
             (nobrk ()
               ;; SECOND joins the current cluster
               (push second cluster)))
        (let ((c1 (grapheme-break-class first))
              (c2 (grapheme-break-class second)))
          (cond
            ;; CR LF stays together
            ((and (eql c1 :cr) (eql c2 :lf))
             (nobrk))
            ;; Otherwise always break around controls and line ends
            ((or (member c1 '(:control :cr :lf))
                 (member c2 '(:control :cr :lf)))
             (brk))
            ;; Hangul syllable sequences
            ((or (and (eql c1 :l) (member c2 '(:l :v :lv :lvt)))
                 (and (member c1 '(:v :lv)) (member c2 '(:v :t)))
                 (and (eql c2 :t) (member c1 '(:lvt :t))))
             (nobrk))
            ((and (eql c1 :regional-indicator) (eql c2 :regional-indicator))
             (nobrk))
            ((or (eql c2 :extend) (eql c2 :spacing-mark) (eql c1 :prepend))
             (nobrk))
            (t (brk))))))))
(defun word-break-class (char)
  #!+sb-doc
  "Returns the word breaking class of CHAR, as specified in UAX #29.
CHAR may also be a list of characters, in which case its first element
is classified. Returns NIL when there is no character to classify."
  ;; Words use graphemes as characters to deal with the ignore rule
  (let ((char (if (listp char) (car char) char)))
    (when char
      (let ((cp (char-code char))
            (gc (general-category char))
            ;; NEWLINES and ALSO-KATAKANA are consulted with
            ;; ORDERED-RANGES-MEMBER, the remaining tables with BINARY-SEARCH.
            (newlines #(#xB #xC #x0085 #x0085 #x2028 #x2029))
            (also-katakana
              #(#x3031 #x3035 #x309B #x309C
                #x30A0 #x30A0 #x30FC #x30FC
                #xFF70 #xFF70))
            (midnumlet #(#x002E #x2018 #x2019 #x2024 #xFE52 #xFF07 #xFF0E))
            (midletter
              #(#x003A #x00B7 #x002D7 #x0387 #x05F4 #x2027 #xFE13 #xFE55 #xFF1A))
            (midnum
              ;; Grepping of Line_Break = IS adjusted per UAX #29
              #(#x002C #x003B #x037E #x0589 #x060C #x060D #x066C #x07F8 #x2044
                #xFE10 #xFE14 #xFE50 #xFE54 #xFF0C #xFF1B)))
        (cond
          ((= cp 10) :LF)
          ((= cp 13) :CR)
          ((= cp #x27) :single-quote)
          ((= cp #x22) :double-quote)
          ((ordered-ranges-member cp newlines) :newline)
          ((or (eql (grapheme-break-class char) :extend)
               (eql gc :mc))
           :extend)
          ((<= #x1F1E6 cp #x1F1FF) :regional-indicator)
          ((and (eql gc :Cf) (not (<= #x200B cp #x200D))) :format)
          ((or (eql (script char) :katakana)
               (ordered-ranges-member cp also-katakana))
           :katakana)
          ((and (eql (script char) :Hebrew) (eql gc :lo)) :hebrew-letter)
          ((and (or (alphabetic-p char) (= cp #x05F3))
                (not (or (ideographic-p char)
                         (eql (line-break-class char) :sa)
                         (eql (script char) :hiragana))))
           :aletter)
          ((binary-search cp midnumlet) :midnumlet)
          ((binary-search cp midletter) :midletter)
          ((binary-search cp midnum) :midnum)
          ((or (and (eql gc :Nd) (not (<= #xFF10 cp #xFF19))) ;Fullwidth digits
               (eql cp #x066B))
           :numeric)
          ((eql gc :Pc) :extendnumlet)
          (t nil))))))
(defmacro flatpush (thing list)
  ;; Pushes THING onto the place LIST.  When THING evaluates to a list, its
  ;; elements are pushed one by one in order (so NIL pushes nothing);
  ;; otherwise the value itself is pushed.  THING is evaluated once.
  (let ((value (gensym))
        (element (gensym)))
    `(let ((,value ,thing))
       (cond ((listp ,value)
              (dolist (,element ,value)
                (push ,element ,list)))
             (t
              (push ,value ,list))))))
(defun words (string)
  #!+sb-doc
  "Breaks STRING into words according to the default
word breaking rules specified in UAX #29. Returns a list of strings"
  ;; Each element of UNITS is one grapheme: a bare character when the
  ;; grapheme has a single character, otherwise the list of its characters.
  (let ((units (mapcar
                #'(lambda (grapheme)
                    (let ((members (coerce grapheme 'list)))
                      (if (cdr members) members (car members))))
                (let ((*other-break-special-graphemes* t))
                  (graphemes string))))
        words word flag)
    (flatpush (car units) word)
    (do ((first (car units) second)
         (tail (cdr units) (cdr tail))
         (second (cadr units) (cadr tail)))
        ((not first)
         (nreverse (mapcar #'(lambda (l) (coerce l 'string)) words)))
      (flet ((brk ()
               (push (nreverse word) words)
               (setf word nil)
               (flatpush second word))
             (nobrk ()
               (flatpush second word))
             (letter-p (class)
               (member class '(:aletter :hebrew-letter)))
             (line-end-p (class)
               (member class '(:newline :cr :lf)))
             (word-part-p (class)
               (member class '(:aletter :hebrew-letter :katakana
                               :numeric :extendnumlet))))
        (let ((c1 (word-break-class first))
              (c2 (word-break-class second))
              (c3 (when (and tail (cdr tail))
                    (word-break-class (cadr tail)))))
          (cond
            ;; A previous three-unit rule already decided this boundary
            (flag (nobrk) (setf flag nil))
            ;; CR+LF are bound together by the grapheme clustering
            ((or (line-end-p c1) (line-end-p c2))
             (brk))
            ((member c2 '(:format :extend))
             (nobrk))
            ((and (letter-p c1) (letter-p c2))
             (nobrk))
            ((and (letter-p c1)
                  (member c2 '(:midletter :midnumlet :single-quote))
                  (letter-p c3))
             (nobrk)
             (setf flag t))       ; Handle the multiple breaks from this rule
            ((and (eql c1 :hebrew-letter)
                  (eql c2 :double-quote)
                  (eql c3 :hebrew-letter))
             (nobrk)
             (setf flag t))
            ((and (eql c1 :hebrew-letter) (eql c2 :single-quote))
             (nobrk))
            ((or (and (eql c1 :numeric)
                      (member c2 '(:numeric :aletter :hebrew-letter)))
                 (and (eql c2 :numeric)
                      (member c1 '(:numeric :aletter :hebrew-letter))))
             (nobrk))
            ((and (eql c1 :numeric)
                  (member c2 '(:midnum :midnumlet :single-quote))
                  (eql c3 :numeric))
             (nobrk)
             (setf flag t))
            ((and (eql c1 :katakana) (eql c2 :katakana))
             (nobrk))
            ((or (and (word-part-p c1) (eql c2 :extendnumlet))
                 (and (word-part-p c2) (eql c1 :extendnumlet)))
             (nobrk))
            ((and (eql c1 :regional-indicator) (eql c2 :regional-indicator))
             (nobrk))
            (t (brk))))))))
(defun sentence-break-class (char)
  #!+sb-doc
  "Returns the sentence breaking class of CHAR, as specified in UAX #29.
CHAR may also be a list of characters, in which case its first element
is classified. Returns NIL when there is no character to classify."
  (let ((char (if (listp char) (car char) char)))
    (when char
      (let ((cp (char-code char))
            (gc (general-category char))
            ;; Both tables are looked up with BINARY-SEARCH
            (aterms #(#x002E #x2024 #xFE52 #xFF0E))
            (scontinues
              #(#x002C #x002D #x003A #x055D #x060C #x060D #x07F8 #x1802 #x1808
                #x2013 #x2014 #x3001 #xFE10 #xFE11 #xFE13 #xFE31 #xFE32 #xFE50
                #xFE51 #xFE55 #xFE58 #xFE63 #xFF0C #xFF0D #xFF1A #xFF64)))
        (cond
          ((= cp 10) :LF)
          ((= cp 13) :CR)
          ((or (eql (grapheme-break-class char) :extend)
               (eql gc :mc))
           :extend)
          ((or (eql cp #x0085) (<= #x2028 cp #x2029)) :sep)
          ((and (eql gc :Cf) (not (<= #x200C cp #x200D))) :format)
          ((whitespace-p char) :sp)
          ((lowercase-p char) :lower)
          ((or (uppercase-p char) (eql gc :Lt)) :upper)
          ((or (alphabetic-p char) (eql cp #x00A0) (eql cp #x05F3)) :oletter)
          ((or (and (eql gc :Nd) (not (<= #xFF10 cp #xFF19))) ;Fullwidth digits
               (<= #x066B cp #x066C))
           :numeric)
          ((binary-search cp aterms) :aterm)
          ((binary-search cp scontinues) :scontinue)
          ((proplist-p char :sterm) :sterm)
          ((and (or (member gc '(:Po :Ps :Pe :Pf :Pi))
                    (eql (line-break-class char) :qu))
                (not (eql cp #x05F3)))
           :close)
          (t nil))))))
(defun sentence-prebreak (string)
  #!+sb-doc
  "Pre-combines some sequences of characters to make the sentence-break
algorithm simpler.
Specifically,
- Combines any character with the following extend or format characters
- Combines CR + LF into '(CR LF)
- Combines a run of :close characters and the :sp characters after it
  into one cluster
Returns a list whose elements are characters or lists of characters."
  (let (cluster clusters last-seen sp-run)
    (labels ((flush ()
               ;; Emit the pending cluster: as a list when it has several
               ;; characters, as a bare character when it has one.
               (cond ((cdr cluster) (push (nreverse cluster) clusters))
                     (cluster (push (car cluster) clusters)))
               (setf cluster nil))
             (brk (x)
               (flush)
               (push x clusters))
             (nobrk (x)
               (push x cluster)))
      (dolist (ch (coerce string 'list))
        (let ((type (sentence-break-class ch)))
          (cond
            ((and (eql last-seen :cr) (eql type :lf))
             (nobrk ch)
             (flush)
             (setf last-seen nil))
            ((eql last-seen :cr)
             (brk ch)
             (setf last-seen nil))
            ((eql type :cr)
             (nobrk ch)
             (setf last-seen :cr))
            ((eql type :lf)
             (brk ch)
             (setf last-seen nil))
            ((eql type :sep)
             (brk ch)
             (setf last-seen nil))
            ((and last-seen (member type '(:extend :format)))
             (nobrk ch))
            ((eql type :close)
             (unless (eql last-seen :close) (flush))
             (nobrk ch)
             (setf last-seen :close
                   sp-run nil))
            ((eql type :sp)
             (unless (or (and (not sp-run) (eql last-seen :close))
                         (eql last-seen :sp))
               (flush)
               (setf sp-run t))
             (nobrk ch)
             (setf last-seen :sp))
            (t
             (flush)
             (nobrk ch)
             (setf last-seen type
                   sp-run nil)))))
      (flush)
      (nreverse clusters))))
(defun sentences (string)
  #!+sb-doc
  "Breaks STRING into sentences according to the default
sentence breaking rules specified in UAX #29"
  (let ((special-handling '(:close :sp :sep :cr :lf :scontinue :sterm :aterm))
        (chars (sentence-prebreak string))
        sentence sentences state)
    (flatpush (car chars) sentence)
    (do ((first (car chars) second)
         (tail (cdr chars) (cdr tail))
         (second (cadr chars) (cadr tail))
         (third (caddr chars) (caddr tail)))
        ((not first)
         ;; Shake off last sentence
         (when sentence (push (nreverse sentence) sentences))
         (nreverse (mapcar #'(lambda (l) (coerce l 'string)) sentences)))
      (flet ((brk ()
               (push (nreverse sentence) sentences)
               (setf sentence nil)
               (flatpush second sentence))
             (nobrk ()
               (flatpush second sentence))
             (terminator-p (class) (member class '(:sterm :aterm)))
             (trailer-p (class) (member class '(:close :sp)))
             (hard-break-p (class) (member class '(:sep :cr :lf)))
             (continuation-p (class)
               (member class '(:scontinue :sterm :aterm))))
        (let ((c1 (sentence-break-class first))
              (c2 (sentence-break-class second))
              (c3 (sentence-break-class third)))
          (cond
            ;; Decisions deferred from the previous step
            ((eql state :brk-next) (brk) (setf state nil))
            ((eql state :nobrk-next) (nobrk) (setf state nil))
            ((hard-break-p c1) (brk))
            ((and (eql c1 :aterm) (eql c2 :numeric)) (nobrk))
            ((and (eql c1 :upper) (eql c2 :aterm) (eql c3 :upper))
             (nobrk)
             (setf state :nobrk-next))
            ((or (and (terminator-p c1) (trailer-p c2) (continuation-p c3))
                 (and (terminator-p c1) (continuation-p c2)))
             (nobrk)
             (when (trailer-p c2) (setf state :nobrk-next)))
            ((and (terminator-p c1) (trailer-p c2) (hard-break-p c3))
             (nobrk)
             (setf state :nobrk-next)) ;; Let the linebreak call (brk)
            ((and (terminator-p c1) (hard-break-p c2))
             (nobrk))                  ; Doesn't trigger rule 8
            ((eql c1 :sterm) ; Not ambiguous anymore, rule 8a already handled
             (cond ((trailer-p c2)
                    (nobrk)
                    (setf state :brk-next))
                   (t (brk))))
            ((and (eql c2 :sterm)
                  third
                  (not (member c3 special-handling)))
             (nobrk)
             (setf state :brk-next)) ; STerm followed by nothing important
            ((or (eql c1 :aterm)
                 (and (eql c2 :aterm)
                      third
                      (not (member c3 special-handling))
                      (not (eql c3 :numeric))))
             ;; Finally handle rule 8: look ahead for a lowercase letter that
             ;; comes before any other letter, terminator or line end.
             (let ((lookahead (if (and third
                                       (not (or (member c3 special-handling)
                                                (eql c3 :numeric))))
                                  (cdr tail)
                                  tail)))
               (cond
                 ((loop for c in lookahead
                        for type = (sentence-break-class c)
                        do (when (member type '(:oletter :upper :sep :cr :lf
                                                :sterm :aterm))
                             (return nil))
                           (when (eql type :lower) (return t))
                        finally (return nil))
                  ;; Ambiguous case
                  (nobrk)
                  (setf state :nobrk-next))
                 ;; Otherwise
                 ((member c2 '(:close :sp :aterm))
                  (nobrk)
                  (setf state :brk-next))
                 (t (brk)))))
            (t (nobrk))))))))
(defun line-prebreak (string)
  ;; Groups the characters of STRING into clusters for line breaking: a
  ;; character of line-break class :CM is attached to the cluster before it,
  ;; unless the last non-:CM class seen is NIL, :BK, :CR, :LF, :NL, :SP or
  ;; :ZW, in which case it starts a cluster of its own.  Returns a list whose
  ;; elements are bare characters (single-character clusters) or lists of
  ;; characters.
  (let ((chars (coerce string 'list))
        cluster clusters last-seen)
    (loop for char in chars
          for type = (line-break-class char)
          do
          ;; Close the pending cluster when CHAR must start a new one
          (when
              (and cluster
                   (or
                    (not (eql type :cm))
                    (and (eql type :cm)
                         (member last-seen '(nil :BK :CR :LF :NL :SP :ZW)))))
            (if (cdr cluster)
                (push (nreverse cluster) clusters)
                (push (car cluster) clusters))
            (setf cluster nil))
          (unless (eql type :cm) (setf last-seen type))
          (push char cluster))
    ;; Emit whatever is still pending
    (if (cdr cluster)
        (push (nreverse cluster) clusters)
        (push (car cluster) clusters))
    (nreverse clusters)))
(defun line-break-annotate (string)
  ;; Returns a list that alternates break annotations and characters,
  ;; starting with :CANT: (:cant c1 <a> c2 <a> c3 ...), where each <a> is
  ;; :CANT, :CAN or :MUST.  Characters grouped into one cluster by
  ;; LINE-PREBREAK are emitted with :CANT between them.
  ;;
  ;; T1 and T2 are the resolved line-break classes of FIRST and SECOND.  The
  ;; rules in the TAGBODY are tried in order; the first one that matches
  ;; pushes its annotation plus SECOND and jumps to TAIL to advance.
  (let ((chars (line-prebreak string))
        first second t1 t2 tail (ret (list :cant))
        state after-spaces)
    (macrolet ((cmpush (thing)
                 ;; Push THING onto RET; a list is pushed element by element
                 ;; with :CANT between consecutive elements.  THING is
                 ;; evaluated exactly once.
                 (let ((gthing (gensym)))
                   `(let ((,gthing ,thing))
                      (if (listp ,gthing)
                          (loop for (c next) on ,gthing do
                            (push c ret)
                            (when next (push :cant ret)))
                          (push ,gthing ret)))))
               (between (a b action)
                 ;; Rule "A x B => ACTION".  A and B are a class keyword, a
                 ;; quoted list of classes, or :ANY.
                 (let ((atest (if (eql a :any) t
                                  (if (listp a)
                                      `(member t1 ,a)
                                      `(eql t1 ,a))))
                       (btest (if (eql b :any) t
                                  (if (listp b)
                                      `(member t2 ,b)
                                      `(eql t2 ,b)))))
                   `(when (and ,atest ,btest)
                      (cmpush ,action)
                      (cmpush second)
                      (go tail))))
               (after-spaces (a b action)
                 ;; Rule "A SP* x B => ACTION": B is tested against the first
                 ;; element of TAIL whose class is not :SP.  When spaces do
                 ;; intervene, they are consumed with :CANT and ACTION is
                 ;; remembered in the AFTER-SPACES variable for later.
                 (let ((atest (if (eql a :any) t
                                  (if (listp a)
                                      `(member t1 ,a)
                                      `(eql t1 ,a))))
                       (btest (if (eql b :any) t
                                  (if (listp b)
                                      `(member type ,b)
                                      `(eql type ,b)))))
                   `(when
                        (and ,atest
                             (loop for c in tail
                                   for type = (line-break-class c :resolve t)
                                   do
                                   (when (not (eql type :sp))
                                     (return ,btest))))
                      (if (eql t2 :sp)
                          (progn (cmpush :cant)
                                 (cmpush second)
                                 (setf state :eat-spaces)
                                 (setf after-spaces ,action)
                                 (go tail))
                          (progn (cmpush ,action)
                                 (cmpush second)
                                 (go tail)))))))

      (cmpush (car chars))
      (setf first (car chars))
      (setf tail (cdr chars))
      (setf second (car tail))
      (tagbody
       top
         (when (not first) (go end))
         (setf t1 (line-break-class first :resolve t))
         (setf t2 (line-break-class second :resolve t))
         (between :any :nil :must)
         (when (and (eql state :eat-spaces) (eql t2 :sp))
           (cmpush :cant) (cmpush second) (go tail))
         (between :bk :any :must)
         (between :cr :lf :cant)
         (between '(:cr :lf :nl) :any :must)
         (between :any '(:zw :bk :cr :lf :nl) :cant)
         ;; Spaces have been eaten: apply the action remembered for them
         (when after-spaces (cmpush after-spaces) (cmpush second)
               (setf state nil after-spaces nil) (go tail))
         (after-spaces :zw :any :can)
         (between :any :wj :cant)
         (between :wj :any :cant)
         (between :gl :any :cant)
         (between '(:ZW :WJ :SY :SG :SA :RI :QU :PR :PO :OP :NU :NS :NL
                    :LF :IS :IN :ID :HL :GL :EX :CR :CP :CM :CL :CJ :CB
                    :BK :BB :B2 :AL :AI :JL :JV :JT :H2 :H3 :XX)
                  :gl :cant)
         (between :any '(:cl :cp :ex :is :sy) :cant)
         (after-spaces :op :any :cant)
         (after-spaces :qu :op :cant)
         (after-spaces '(:cl :cp) :ns :cant)
         (after-spaces :b2 :b2 :cant)
         (between :any :sp :cant) ;; Goes here to deal with after-spaces
         (between :sp :any :can)
         (between :any :qu :cant)
         (between :qu :any :cant)
         (between :any :cb :can)
         (between :cb :any :can)
         (between :any '(:ba :hy :ns) :cant)
         (between :bb :any :cant)
         (when (and (eql t1 :hl) (eql t2 :hy))
           (cmpush :cant) (cmpush second)
           (setf after-spaces :can) (go tail))
         (between '(:al :hl :id :in :nu) :in :cant)
         (between :id :po :cant)
         (between '(:al :hl) :nu :cant)
         (between '(:nu :po) '(:al :hl) :cant)
         (between :pr '(:id :al :hl) :cant)
         (between '(:cl :cp :nu) '(:po :pr) :cant)
         (between :nu '(:po :pr :nu) :cant)
         (between '(:po :pr) :op :cant)
         (between '(:po :pr :hy :is :sy) :nu :cant)
         (between :jl '(:jl :jv :h2 :h3) :cant)
         (between '(:jv :h2) '(:jv :jt) :cant)
         (between '(:jt :h3) :jt :cant)
         (between '(:jl :jv :jt :h2 :h3) '(:in :po) :cant)
         (between :pr '(:jl :jv :jt :h2 :h3) :cant)
         (between '(:al :hl :is) '(:al :hl) :cant)
         (between '(:al :hl :nu) :op :cant)
         (between :cp '(:al :hl :nu) :cant)
         (between :ri :ri :cant)
         (between :any :any :can)
       tail
         (setf first second)
         (setf tail (cdr tail))
         (setf second (car tail))
         (go top)
       end)
      ;; LB3 satisfied by (:any :nil) -> :must
      (setf ret (nreverse ret))
      ret)))
(defun break-list-at (list n)
  ;; Destructively splits LIST after its first N conses.  Returns two values:
  ;; LIST itself, now truncated to N elements, and the remaining tail.
  (let ((last-kept nil)
        (remainder list))
    (dotimes (step n)
      (declare (ignorable step))
      (setf last-kept remainder
            remainder (cdr remainder)))
    (setf (cdr last-kept) nil)
    (values list remainder)))
(defun lines (string &key (margin *print-right-margin*))
  #!+sb-doc
  "Breaks STRING into lines that are no wider than :MARGIN according to the
line breaking rules outlined in UAX #14. Combining marks will always be kept
together with their base characters, and spaces (but not other types of
whitespace) will be removed from the end of lines. If :MARGIN is unspecified,
it defaults to 80 characters"
  (when (string= string "") (return-from lines (list "")))
  (unless margin (setf margin 80))
  ;; CHARS alternates break annotations and characters; each step consumes
  ;; one (annotation character) pair.  LINE holds the pending characters,
  ;; most recent first; LAST-BREAK-DISTANCE counts how many of them were
  ;; added since the last break opportunity (inclusive).
  (do* ((chars (line-break-annotate string))
        line lines (filled 0) last-break-distance
        (break-type (car chars) (car tail))
        (char (cadr chars) (cadr tail))
        (tail (cddr chars) (cddr tail)))
       ((not break-type)
        (mapcar #'(lambda (s) (coerce s 'string)) (nreverse lines)))
    (ecase break-type
      (:cant
       (push char line)
       ;; Combining marks take up no width
       (unless (eql (line-break-class char) :CM)
         (incf filled))
       (when last-break-distance (incf last-break-distance)))
      (:can
       (push char line)
       (setf last-break-distance 1)
       (incf filled))
      (:must
       (push char line)
       (setf last-break-distance 1)
       (incf filled)
       (go break)))
    ;; Still within the margin: nothing to emit yet
    (unless (> filled margin)
      (go next))
   break
    (unless last-break-distance
      ;; If we don't have any line breaks, remove the last thing we added that
      ;; takes up space, and all its combining marks
      (setf last-break-distance
            (1+ (loop for c in line
                      while (eql (line-break-class c) :cm)
                      summing 1))))
    (multiple-value-bind (next-line this-line)
        (break-list-at line last-break-distance)
      ;; Drop spaces from the end of the finished line
      (loop while (eql (line-break-class (car this-line)) :sp)
            do (setf this-line (cdr this-line)))
      (push (nreverse this-line) lines)
      (setf line next-line
            filled (length line)
            last-break-distance nil))
   next))
;;; Collation

;; Upper bound used by VARIABLE-P, read at read time from
;; "other-collation-info.lisp-expr" in the "output" directory two levels
;; above the file being compiled.
(defconstant +maximum-variable-primary-element+
  #.(let ((dump-file (merge-pathnames
                      (make-pathname :directory '(:relative :up :up "output")
                                     :name "other-collation-info"
                                     :type "lisp-expr")
                      sb!xc:*compile-file-truename*)))
      (with-open-file (in dump-file
                          :direction :input
                          :element-type 'character)
        (read in))))
(defun unpack-collation-key (key)
  ;; Expands each 32-bit word of KEY into a list of three weights:
  ;; bits 16-31, bits 5-15 and bits 0-4, in that order.
  (declare (type (simple-array (unsigned-byte 32) (*)) key))
  (map 'list
       #'(lambda (packed)
           (list (ldb (byte 16 16) packed)
                 (ldb (byte 11 5) packed)
                 (ldb (byte 5 0) packed)))
       key))
(declaim (inline variable-p))
(defun variable-p (x)
  ;; True when the primary weight X lies in the range
  ;; 1 .. +MAXIMUM-VARIABLE-PRIMARY-ELEMENT+ (inclusive).
  (and (>= x 1)
       (<= x +maximum-variable-primary-element+)))
(defun collation-key (string start end)
  ;; Returns the collation elements (a list of three-weight lists) for the
  ;; one to three characters STRING[START, END), or NIL when the range has
  ;; another length or is a multi-character sequence without a table entry.
  ;; A single character without a table entry gets implicit weights.
  (let (char1
        (char2 (code-char 0))
        (char3 (code-char 0)))
    (case (- end start)
      (1 (setf char1 (char string start)))
      (2 (setf char1 (char string start)
               char2 (char string (+ start 1))))
      (3 (setf char1 (char string start)
               char2 (char string (+ start 1))
               char3 (char string (+ start 2))))
      (t
       ;; There are never more than three characters in a contraction, right?
       (return-from collation-key nil)))
    (let ((packed-key (gethash (pack-3-codepoints
                                (char-code char1)
                                (char-code char2)
                                (char-code char3))
                               **character-collations**)))
      (if packed-key
          (unpack-collation-key packed-key)
          ;; Implicit weights only make sense for a single character
          (when (char= (code-char 0) char2 char3)
            (let* ((cp (char-code char1))
                   (base
                     (cond ((not (proplist-p char1 :unified-ideograph))
                            #xFBC0)
                           ((or (<= #x4E00 cp #x9FFF)
                                (<= #xF900 cp #xFAFF))
                            #xFB40)
                           (t
                            #xFB80)))
                   ;; UTS #10 implicit weights:
                   ;;   AAAA = base + (CP >> 15)
                   ;;   BBBB = (CP & #x7FFF) | #x8000
                   ;; The mask must be 15 bits wide so that BBBB stays a
                   ;; 16-bit weight; the bits above it are already in AAAA.
                   (a (+ base (ash cp -15)))
                   (b (logior #.(ash 1 15) (logand cp #x7FFF))))
              (list (list a #x20 #x2) (list b 0 0))))))))
(defun sort-key (string)
  ;; Builds a sort key vector for STRING: the NFD form is mapped to collation
  ;; elements, whose weights are then laid out as
  ;;   primary... 0 secondary... 0 tertiary... 0 quaternary...
  (let* ((str (normalize-string string :nfd))
         (i 0) (len (length str)) max-match new-i
         sort-key
         after-variable)
    (loop while (< i len)
          do
          ;; Longest contiguous match of up to three characters at I
          (loop for offset from 1 to 3
                for index = (+ i offset)
                while (<= index len)
                do
                (let ((key (collation-key str i index)))
                  (when key
                    (setf max-match key
                          new-i index))))
          ;; Try to extend the match with following combining characters
          ;; that are not blocked by an earlier one of the same or higher
          ;; combining class.  The candidate is swapped next to the match for
          ;; the lookup and swapped back when the lookup fails.
          (loop for index from new-i below len
                for char = (char str index)
                for combining-class = (combining-class char)
                until (eql combining-class 0)
                unless (and (>= (- index new-i) 1)
                            ;; Combiners are sorted, we only have to look back
                            ;; one step (see canonically-compose)
                            (>= (combining-class (char str (1- index)))
                                combining-class))
                do
                (rotatef (char str new-i) (char str index))
                (let ((key (collation-key str i (1+ new-i))))
                  (if key
                      (setf max-match key
                            new-i (1+ new-i))
                      (rotatef (char str new-i) (char str index)))))
          (loop for key in max-match do (push key sort-key))
          (setf i new-i))
    (macrolet ((push-non-zero (obj place)
                 `(when (/= ,obj 0)
                    (push ,obj ,place))))
      (let (primary secondary tertiary quatenary)
        (loop for (k1 k2 k3) in (nreverse sort-key)
              do
              (cond
                ;; Completely ignorable element
                ((= k1 k2 k3 0))
                ;; Variable element: only contributes at the fourth level
                ((variable-p k1)
                 (setf after-variable t)
                 (push k1 quatenary))
                ((/= k1 0)
                 (setf after-variable nil)
                 (push k1 primary)
                 (push-non-zero k2 secondary)
                 (push-non-zero k3 tertiary)
                 (push #xFFFF quatenary))
                ;; Primary-ignorable element: dropped after a variable one
                ((/= k3 0)
                 (unless after-variable
                   (push-non-zero k2 secondary)
                   (push k3 tertiary)
                   (push #xFFFF quatenary)))))
        (concatenate 'vector
                     (nreverse primary) #(0) (nreverse secondary) #(0)
                     (nreverse tertiary) #(0) (nreverse quatenary))))))
(defun vector< (vector1 vector2)
  ;; Lexicographic comparison of two vectors of numbers: true when VECTOR1
  ;; sorts strictly before VECTOR2.
  (loop for i across vector1
        for j across vector2
        do
        (cond ((< i j) (return-from vector< t))
              ((> i j) (return-from vector< nil))))
  ;; If there's no differences, shortest vector wins
  (< (length vector1) (length vector2)))
(defun unicode= (string1 string2 &key (start1 0) end1 (start2 0) end2 (strict t))
  #!+sb-doc
  "Determines whether STRING1 and STRING2 are canonically equivalent according
to Unicode. The START and END arguments behave like the arguments to STRING=.
If :STRICT is NIL, UNICODE= tests compatibility equivalence instead."
  ;; Canonical equivalence compares NFD forms, compatibility equivalence
  ;; compares NFKD forms.
  (let ((form (if strict :nfd :nfkd)))
    (flet ((normalized (string start end)
             (normalize-string (subseq string start end) form)))
      (let ((str1 (normalized string1 start1 end1))
            (str2 (normalized string2 start2 end2)))
        (string= str1 str2)))))
(defun unicode-equal (string1 string2 &key (start1 0) end1 (start2 0) end2 (strict t))
  #!+sb-doc
  "Determines whether STRING1 and STRING2 are canonically equivalent after
casefolding (that is, ignoring case differences) according to Unicode. The
START and END arguments behave like the arguments to STRING=. If :STRICT is
NIL, UNICODE-EQUAL tests compatibility equivalence instead."
  (let ((form (if strict :nfd :nfkd)))
    (flet ((normalized (string start end)
             (normalize-string (subseq string start end) form))
           (folded (string)
             ;; The casefolded text is normalized again before comparing
             (normalize-string (casefold string) form)))
      (let ((str1 (normalized string1 start1 end1))
            (str2 (normalized string2 start2 end2)))
        (string= (folded str1) (folded str2))))))
(defun unicode< (string1 string2 &key (start1 0) end1 (start2 0) end2)
  #!+sb-doc
  "Determines whether STRING1 sorts before STRING2 using the Unicode Collation
Algorithm. The function uses an untailored Default Unicode Collation Element
Table to produce the sort keys. The function uses the Shifted method for
dealing with variable-weight characters, as described in UTS #10"
  (let* ((s1 (subseq string1 start1 end1))
         (s2 (subseq string2 start2 end2))
         (k1 (sort-key s1))
         (k2 (sort-key s2)))
    (cond
      ;; Identical sort keys: fall back to comparing the NFD forms
      ((equalp k1 k2)
       (string< (normalize-string s1 :nfd) (normalize-string s2 :nfd)))
      (t
       (vector< k1 k2)))))
(defun unicode<= (string1 string2 &key (start1 0) end1 (start2 0) end2)
  #!+sb-doc
  "Tests if STRING1 and STRING2 are either UNICODE< or UNICODE="
  ;; Equality is tested first; the collation comparison only runs when the
  ;; strings are not canonically equivalent.
  (or
   (unicode= string1 string2 :start1 start1 :end1 end1
                             :start2 start2 :end2 end2)
   (unicode< string1 string2 :start1 start1 :end1 end1
                             :start2 start2 :end2 end2)))
(defun unicode> (string1 string2 &key (start1 0) end1 (start2 0) end2)
  #!+sb-doc
  "Tests if STRING2 is UNICODE< STRING1."
  ;; Same comparison as UNICODE< with the two strings, and their bounds,
  ;; exchanged.
  (unicode< string2 string1
            :start1 start2 :end1 end2
            :start2 start1 :end2 end1))
(defun unicode>= (string1 string2 &key (start1 0) end1 (start2 0) end2)
  #!+sb-doc
  "Tests if STRING1 and STRING2 are either UNICODE= or UNICODE>"
  ;; Equality is tested first; the collation comparison only runs when the
  ;; strings are not canonically equivalent.
  (or
   (unicode= string1 string2 :start1 start1 :end1 end1
                             :start2 start2 :end2 end2)
   (unicode> string1 string2 :start1 start1 :end1 end1
                             :start2 start2 :end2 end2)))
;;; Confusable detection

(defun canonically-deconfuse (string)
  ;; Rewrites STRING by replacing, at each position, the longest substring of
  ;; one to five characters that has an entry in **CONFUSABLES** with that
  ;; entry; characters without an entry are copied unchanged.
  (let (ret (i 0) new-i (len (length string))
        best-node)
    (loop while (< i len) do
      ;; Later (longer) matches overwrite earlier ones
      (loop for offset from 1 to 5
            while (<= (+ i offset) len)
            do
            (let ((node (gethash (subseq string i (+ i offset))
                                 **confusables**)))
              (when node (setf best-node node new-i (+ i offset)))))
      (cond
        (best-node (push best-node ret) (setf i new-i))
        (t (push (subseq string i (1+ i)) ret) (incf i)))
      (setf best-node nil new-i nil))
    (apply #'concatenate 'string (nreverse ret))))
(defun confusable-p (string1 string2 &key (start1 0) end1 (start2 0) end2)
  #!+sb-doc
  "Determines whether STRING1 and STRING2 could be visually confusable
according to the IDNA confusableSummary.txt table"
  (let ((form #!+sb-unicode :nfd #!-sb-unicode :nfc))
    (flet ((skeleton (string start end)
             ;; Normalize, replace confusable sequences, normalize again
             (normalize-string
              (canonically-deconfuse
               (normalize-string (subseq string start end) form))
              form)))
      (let ((skeleton1 (skeleton string1 start1 end1))
            (skeleton2 (skeleton string2 start2 end2)))
        (string= skeleton1 skeleton2)))))