src/code/target-unicode.lisp

   1 ;;;; Unicode functions
   2
   3 ;;;; This software is part of the SBCL system. See the README file for
   4 ;;;; more information.
   5 ;;;;
   6 ;;;; This software is derived from the CMU CL system, which was
   7 ;;;; written at Carnegie Mellon University and released into the
   8 ;;;; public domain. The software is in the public domain and is
   9 ;;;; provided with absolutely no warranty. See the COPYING and CREDITS
  10 ;;;; files for more information.
  11
  12 (in-package "SB!UNICODE")
  13
  14 (declaim (type simple-vector **special-numerics**))
  15 (sb!impl::defglobal **special-numerics**
  16   #.(sb-cold:read-from-file "output/numerics.lisp-expr"))
  17
  18 (declaim (type (simple-array (unsigned-byte 32) (*)) **block-ranges**))
  19 (sb!impl::defglobal **block-ranges**
  20   #.(!coerce-to-specialized
  21      (sb-cold:read-from-file "output/blocks.lisp-expr")
  22      '(unsigned-byte 32)))
  23
  24 (macrolet ((unicode-property-init ()
  25              (let ((proplist-dump
  26                     (sb-cold:read-from-file "output/misc-properties.lisp-expr"))
  27                    (confusable-sets
  28                     (sb-cold:read-from-file "output/confusables.lisp-expr"))
  29                    (bidi-mirroring-list
  30                     (sb-cold:read-from-file "output/bidi-mirrors.lisp-expr")))
  31                `(progn
  32                   (sb!impl::defglobal **proplist-properties** ',proplist-dump)
  33                   (sb!impl::defglobal **confusables** ',confusable-sets)
  34                   (sb!impl::defglobal **bidi-mirroring-glyphs** ',bidi-mirroring-list)
  35                   (defun !unicode-properties-cold-init ()
  36                     (let ((hash (make-hash-table)) (list ',proplist-dump))
  37                       (do ((k (car list) (car list)) (v (cadr list) (cadr list)))
  38                           ((not list) hash)
  39                         (setf (gethash k hash) v)
  40                         (setf list (cddr list)))
  41                       (setf **proplist-properties** hash))
  42                     (let ((hash (make-hash-table :test #'equal)))
  43                       (loop for set in ',confusable-sets
  44                          for items = (mapcar #'(lambda (item)
  45                                                  (map 'simple-string
  46                                                       #'code-char item))
  47                                              #!+sb-unicode set
  48                                              #!-sb-unicode
  49                                              (remove-if-not
  50                                               #'(lambda (item)
  51                                                   (every
  52                                                    #'(lambda (x)
  53                                                        (< x sb!xc:char-code-limit))
  54                                                    item)) set))
  55                          do (dolist (i items)
  56                               (setf (gethash (logically-readonlyize (possibly-base-stringize i))
  57                                              hash)
  58                                     (logically-readonlyize
  59                                      (possibly-base-stringize (first items))))))
  60                       (setf **confusables** hash))
  61                     (let ((hash (make-hash-table)) (list ',bidi-mirroring-list))
  62                       (loop for (k v) in list do
  63                            (setf (gethash k hash) v))
  64                       (setf **bidi-mirroring-glyphs** hash)))))))
  65   (unicode-property-init))
  66
  67 ;;; Unicode property access
  68 (defun ordered-ranges-member (item vector)
  69   (declare (type simple-vector vector)
  70            (type fixnum item)
  71            (optimize speed))
  72   (labels ((recurse (start end)
  73              (declare (type index start end)
  74                       (optimize (safety 0)))
  75              (when (< start end)
  76                (let* ((i (+ start (truncate (the index (- end start)) 2)))
  77                       (index (* 2 i))
  78                       (elt1 (svref vector index))
  79                       (elt2 (svref vector (1+ index))))
  80                  (declare (type index i)
  81                           (fixnum elt1 elt2))
  82                  (cond ((< item elt1)
  83                         (recurse start i))
  84                        ((> item elt2)
  85                         (recurse (+ 1 i) end))
  86                        (t
  87                         item))))))
  88     (recurse 0 (truncate (length vector) 2))))
  89
  90 ;; Returns which range `item` was found in or NIL
  91 ;; First range = 0, second range = 1 ...
  92 (defun ordered-ranges-position (item vector)
  93   (declare (type (simple-array (unsigned-byte 32) (*)) vector)
  94            (type fixnum item))
  95   (labels ((recurse (start end)
  96              (declare (type index start end))
  97              (when (< start end)
  98                (let* ((i (+ start (truncate (- end start) 2)))
  99                       (index (* 2 i))
 100                       (elt1 (aref vector index))
 101                       (elt2 (aref vector (1+ index))))
 102                  (declare (type index i))
 103                  (cond ((< item elt1)
 104                         (recurse start i))
 105                        ((> item elt2)
 106                         (recurse (+ 1 i) end))
 107                        (t
 108                         i))))))
 109     (recurse 0 (truncate (length vector) 2))))
 110
 111 (defun proplist-p (character property)
 112   "Returns T if CHARACTER has the specified PROPERTY.
 113 PROPERTY is a keyword representing one of the properties from PropList.txt,
 114 with underscores replaced by dashes."
 115   (ordered-ranges-member (char-code character)
 116                          (gethash property **proplist-properties**)))
 117
 118 ;; WARNING: These have to be manually kept in sync with the values in ucd.lisp
 119 (declaim (type simple-vector *general-categories* *bidi-classes* *east-asian-widths*
 120                *scripts* *line-break-classes* *blocks*))
 121 (sb!impl::defglobal *general-categories*
 122   #(:Lu :Ll :Lt :Lm :Lo :Cc :Cf :Co :Cs :Cn :Mc :Me :Mn :Nd
 123     :Nl :No :Pc :Pd :Pe :Pf :Pi :Po :Ps :Sc :Sk :Sm :So :Zl
 124     :Zp :Zs))
 125
 126 (sb!impl::defglobal *bidi-classes*
 127   #(:BN :AL :AN :B :CS :EN :ES :ET :L :LRE :LRO :NSM :ON
 128     :PDF :R :RLE :RLO :S :WS :LRI :RLI :FSI :PDI))
 129
 130 (sb!impl::defglobal *east-asian-widths*
 131   #(:N :A :H :W :F :Na))
 132
 133 (sb!impl::defglobal *scripts*
 134   #(:Unknown :Common :Latin :Greek :Cyrillic :Armenian :Hebrew :Arabic :Syriac
 135     :Thaana :Devanagari :Bengali :Gurmukhi :Gujarati :Oriya :Tamil :Telugu
 136     :Kannada :Malayalam :Sinhala :Thai :Lao :Tibetan :Myanmar :Georgian :Hangul
 137     :Ethiopic :Cherokee :Canadian-Aboriginal :Ogham :Runic :Khmer :Mongolian
 138     :Hiragana :Katakana :Bopomofo :Han :Yi :Old-Italic :Gothic :Deseret
 139     :Inherited :Tagalog :Hanunoo :Buhid :Tagbanwa :Limbu :Tai-Le :Linear-B
 140     :Ugaritic :Shavian :Osmanya :Cypriot :Braille :Buginese :Coptic :New-Tai-Lue
 141     :Glagolitic :Tifinagh :Syloti-Nagri :Old-Persian :Kharoshthi :Balinese
 142     :Cuneiform :Phoenician :Phags-Pa :Nko :Sundanese :Lepcha :Ol-Chiki :Vai
 143     :Saurashtra :Kayah-Li :Rejang :Lycian :Carian :Lydian :Cham :Tai-Tham
 144     :Tai-Viet :Avestan :Egyptian-Hieroglyphs :Samaritan :Lisu :Bamum :Javanese
 145     :Meetei-Mayek :Imperial-Aramaic :Old-South-Arabian :Inscriptional-Parthian
 146     :Inscriptional-Pahlavi :Old-Turkic :Kaithi :Batak :Brahmi :Mandaic :Chakma
 147     :Meroitic-Cursive :Meroitic-Hieroglyphs :Miao :Sharada :Sora-Sompeng
 148     :Takri :Bassa-Vah :Mahajani :Pahawh-Hmong :Caucasian-Albanian :Manichaean
 149     :Palmyrene :Duployan :Mende-Kikakui :Pau-Cin-Hau :Elbasan :Modi
 150     :Psalter-Pahlavi :Grantha :Mro :Siddham :Khojki :Nabataean :Tirhuta
 151     :Khudawadi :Old-North-Arabian :Warang-Citi :Linear-A :Old-Permic))
 152
 153 (sb!impl::defglobal *line-break-classes*
 154     #(:XX :AI :AL :B2 :BA :BB :BK :CB :CJ :CL :CM :CP :CR :EX :GL
 155       :HL :HY :ID :IN :IS :LF :NL :NS :NU :OP :PO :PR :QU :RI :SA
 156       :SG :SP :SY :WJ :ZW))
 157
 158 (sb!impl::defglobal *blocks*
 159   #(:Basic-Latin :Latin-1-Supplement :Latin-Extended-A :Latin-Extended-B
 160     :IPA-Extensions :Spacing-Modifier-Letters :Combining-Diacritical-Marks
 161     :Greek-and-Coptic :Cyrillic :Cyrillic-Supplement :Armenian :Hebrew :Arabic
 162     :Syriac :Arabic-Supplement :Thaana :NKo :Samaritan :Mandaic
 163     :Arabic-Extended-A :Devanagari :Bengali :Gurmukhi :Gujarati :Oriya :Tamil
 164     :Telugu :Kannada :Malayalam :Sinhala :Thai :Lao :Tibetan :Myanmar :Georgian
 165     :Hangul-Jamo :Ethiopic :Ethiopic-Supplement :Cherokee
 166     :Unified-Canadian-Aboriginal-Syllabics :Ogham :Runic :Tagalog :Hanunoo
 167     :Buhid :Tagbanwa :Khmer :Mongolian
 168     :Unified-Canadian-Aboriginal-Syllabics-Extended :Limbu :Tai-Le :New-Tai-Lue
 169     :Khmer-Symbols :Buginese :Tai-Tham :Combining-Diacritical-Marks-Extended
 170     :Balinese :Sundanese :Batak :Lepcha :Ol-Chiki :Sundanese-Supplement
 171     :Vedic-Extensions :Phonetic-Extensions :Phonetic-Extensions-Supplement
 172     :Combining-Diacritical-Marks-Supplement :Latin-Extended-Additional
 173     :Greek-Extended :General-Punctuation :Superscripts-and-Subscripts
 174     :Currency-Symbols :Combining-Diacritical-Marks-for-Symbols
 175     :Letterlike-Symbols :Number-Forms :Arrows :Mathematical-Operators
 176     :Miscellaneous-Technical :Control-Pictures :Optical-Character-Recognition
 177     :Enclosed-Alphanumerics :Box-Drawing :Block-Elements :Geometric-Shapes
 178     :Miscellaneous-Symbols :Dingbats :Miscellaneous-Mathematical-Symbols-A
 179     :Supplemental-Arrows-A :Braille-Patterns :Supplemental-Arrows-B
 180     :Miscellaneous-Mathematical-Symbols-B :Supplemental-Mathematical-Operators
 181     :Miscellaneous-Symbols-and-Arrows :Glagolitic :Latin-Extended-C :Coptic
 182     :Georgian-Supplement :Tifinagh :Ethiopic-Extended :Cyrillic-Extended-A
 183     :Supplemental-Punctuation :CJK-Radicals-Supplement :Kangxi-Radicals
 184     :Ideographic-Description-Characters :CJK-Symbols-and-Punctuation :Hiragana
 185     :Katakana :Bopomofo :Hangul-Compatibility-Jamo :Kanbun :Bopomofo-Extended
 186     :CJK-Strokes :Katakana-Phonetic-Extensions :Enclosed-CJK-Letters-and-Months
 187     :CJK-Compatibility :CJK-Unified-Ideographs-Extension-A
 188     :Yijing-Hexagram-Symbols :CJK-Unified-Ideographs :Yi-Syllables :Yi-Radicals
 189     :Lisu :Vai :Cyrillic-Extended-B :Bamum :Modifier-Tone-Letters
 190     :Latin-Extended-D :Syloti-Nagri :Common-Indic-Number-Forms :Phags-pa
 191     :Saurashtra :Devanagari-Extended :Kayah-Li :Rejang :Hangul-Jamo-Extended-A
 192     :Javanese :Myanmar-Extended-B :Cham :Myanmar-Extended-A :Tai-Viet
 193     :Meetei-Mayek-Extensions :Ethiopic-Extended-A :Latin-Extended-E
 194     :Meetei-Mayek :Hangul-Syllables :Hangul-Jamo-Extended-B :High-Surrogates
 195     :High-Private-Use-Surrogates :Low-Surrogates :Private-Use-Area
 196     :CJK-Compatibility-Ideographs :Alphabetic-Presentation-Forms
 197     :Arabic-Presentation-Forms-A :Variation-Selectors :Vertical-Forms
 198     :Combining-Half-Marks :CJK-Compatibility-Forms :Small-Form-Variants
 199     :Arabic-Presentation-Forms-B :Halfwidth-and-Fullwidth-Forms :Specials
 200     :Linear-B-Syllabary :Linear-B-Ideograms :Aegean-Numbers
 201     :Ancient-Greek-Numbers :Ancient-Symbols :Phaistos-Disc :Lycian :Carian
 202     :Coptic-Epact-Numbers :Old-Italic :Gothic :Old-Permic :Ugaritic :Old-Persian
 203     :Deseret :Shavian :Osmanya :Elbasan :Caucasian-Albanian :Linear-A
 204     :Cypriot-Syllabary :Imperial-Aramaic :Palmyrene :Nabataean :Phoenician
 205     :Lydian :Meroitic-Hieroglyphs :Meroitic-Cursive :Kharoshthi
 206     :Old-South-Arabian :Old-North-Arabian :Manichaean :Avestan
 207     :Inscriptional-Parthian :Inscriptional-Pahlavi :Psalter-Pahlavi :Old-Turkic
 208     :Rumi-Numeral-Symbols :Brahmi :Kaithi :Sora-Sompeng :Chakma :Mahajani
 209     :Sharada :Sinhala-Archaic-Numbers :Khojki :Khudawadi :Grantha :Tirhuta
 210     :Siddham :Modi :Takri :Warang-Citi :Pau-Cin-Hau :Cuneiform
 211     :Cuneiform-Numbers-and-Punctuation :Egyptian-Hieroglyphs :Bamum-Supplement
 212     :Mro :Bassa-Vah :Pahawh-Hmong :Miao :Kana-Supplement :Duployan
 213     :Shorthand-Format-Controls :Byzantine-Musical-Symbols :Musical-Symbols
 214     :Ancient-Greek-Musical-Notation :Tai-Xuan-Jing-Symbols
 215     :Counting-Rod-Numerals :Mathematical-Alphanumeric-Symbols :Mende-Kikakui
 216     :Arabic-Mathematical-Alphabetic-Symbols :Mahjong-Tiles :Domino-Tiles
 217     :Playing-Cards :Enclosed-Alphanumeric-Supplement
 218     :Enclosed-Ideographic-Supplement :Miscellaneous-Symbols-and-Pictographs
 219     :Emoticons :Ornamental-Dingbats :Transport-and-Map-Symbols
 220     :Alchemical-Symbols :Geometric-Shapes-Extended :Supplemental-Arrows-C
 221     :CJK-Unified-Ideographs-Extension-B :CJK-Unified-Ideographs-Extension-C
 222     :CJK-Unified-Ideographs-Extension-D :CJK-Compatibility-Ideographs-Supplement
 223     :Tags :Variation-Selectors-Supplement :Supplementary-Private-Use-Area-A
 224     :Supplementary-Private-Use-Area-B))
 225
 226 (declaim (inline svref-or-null))
 227 (defun svref-or-null (vector index)
 228   (and (< index (length vector))
 229        (svref vector index)))
 230
 231 (defun general-category (character)
 232   "Returns the general category of CHARACTER as it appears in UnicodeData.txt"
 233   (svref-or-null *general-categories* (sb!impl::ucd-general-category character)))
 234
 235 (defun bidi-class (character)
 236   "Returns the bidirectional class of CHARACTER"
 237   (if (and (eql (general-category character) :Cn)
 238            (default-ignorable-p character))
 239       :Bn
 240       (svref-or-null
 241        *bidi-classes*
 242        (aref **character-misc-database** (1+ (misc-index character))))))
 243
 244 (declaim (inline combining-class))
 245 (defun combining-class (character)
 246   "Returns the canonical combining class (CCC) of CHARACTER"
 247   (aref **character-misc-database** (+ 2 (misc-index character))))
 248
 249 (defun decimal-value (character)
 250   "Returns the decimal digit value associated with CHARACTER or NIL if
 251 there is no such value.
 252
 253 The only characters in Unicode with a decimal digit value are those
 254 that are part of a range of characters that encode the digits 0-9.
 255 Because of this, `(decimal-digit c) <=> (digit-char-p c 10)` in
 256 #+sb-unicode builds"
 257   (sb!impl::ucd-decimal-digit character))
 258
 259 (defun digit-value (character)
 260   "Returns the Unicode digit value of CHARACTER or NIL if it doesn't exist.
 261
 262 Digit values are guaranteed to be integers between 0 and 9 inclusive.
 263 All characters with decimal digit values have the same digit value,
 264 but there are characters (such as digits of number systems without a 0 value)
 265 that have a digit value but no decimal digit value"
 266   (let ((%digit (clear-flag 6
 267                             (aref **character-misc-database**
 268                                   (+ 3 (misc-index character))))))
 269     (if (< %digit 10) %digit nil)))
 270
 271 (defun numeric-value (character)
 272   "Returns the numeric value of CHARACTER or NIL if there is no such value.
 273 Numeric value is the most general of the Unicode numeric properties.
 274 The only constraint on the numeric value is that it be a rational number."
 275   (or (double-vector-binary-search (char-code character)
 276                                    **special-numerics**)
 277       (digit-value character)))
 278
 279 (defun mirrored-p (character)
 280   "Returns T if CHARACTER needs to be mirrored in bidirectional text.
 281 Otherwise, returns NIL."
 282   (logbitp 5 (aref **character-misc-database**
 283                     (+ 5 (misc-index character)))))
 284
 285 (defun bidi-mirroring-glyph (character)
 286   "Returns the mirror image of CHARACTER if it exists.
 287 Otherwise, returns NIL."
 288   (when (mirrored-p character)
 289     (let ((ret (gethash (char-code character) **bidi-mirroring-glyphs**)))
 290       (when ret (code-char ret)))))
 291
 292 (defun east-asian-width (character)
 293   "Returns the East Asian Width property of CHARACTER as
 294 one of the keywords :N (Narrow), :A (Ambiguous), :H (Halfwidth),
 295 :W (Wide), :F (Fullwidth), or :NA (Not applicable)"
 296   (svref-or-null *east-asian-widths*
 297                  (ldb (byte 3 0)
 298                       (aref **character-misc-database**
 299                             (+ 5 (misc-index character))))))
 300
 301 (defun script (character)
 302   "Returns the Script property of CHARACTER as a keyword.
 303 If CHARACTER does not have a known script, returns :UNKNOWN"
 304   (svref-or-null *scripts*
 305                  (aref **character-misc-database** (+ 6 (misc-index character)))))
 306
 307 (defun char-block (character)
 308   "Returns the Unicode block in which CHARACTER resides as a keyword.
 309 If CHARACTER does not have a known block, returns :NO-BLOCK"
 310   (let* ((code (char-code character))
 311          (block-index (ordered-ranges-position code **block-ranges**)))
 312     (if block-index
 313         (aref *blocks* block-index) :no-block)))
 314
 315 (defun unicode-1-name (character)
 316   "Returns the name assigned to CHARACTER in Unicode 1.0 if it is distinct
 317 from the name currently assigned to CHARACTER. Otherwise, returns NIL.
 318 This property has been officially obsoleted by the Unicode standard, and
 319 is only included for backwards compatibility."
 320   (let* ((char-code (char-code character))
 321          (h-code (double-vector-binary-search char-code
 322                                               **unicode-1-char-name-database**)))
 323     (when h-code
 324       (huffman-decode h-code **unicode-character-name-huffman-tree**))))
 325
 326 (defun age (character)
 327   "Returns the version of Unicode in which CHARACTER was assigned as a pair
 328 of values, both integers, representing the major and minor version respectively.
 329 If CHARACTER is not assigned in Unicode, returns NIL for both values."
 330   (let* ((value (aref **character-misc-database** (+ 8 (misc-index character))))
 331          (major (ash value -3))
 332          (minor (ldb (byte 3 0) value)))
 333     (if (zerop value) (values nil nil) (values major minor))))
 334
 335 (defun hangul-syllable-type (character)
 336   "Returns the Hangul syllable type of CHARACTER.
 337 The syllable type can be one of :L, :V, :T, :LV, or :LVT.
 338 If the character is not a Hangul syllable or Jamo, returns NIL"
 339   (let ((cp (char-code character)))
 340     (cond
 341       ((or
 342         (and (<= #x1100 cp) (<= cp #x115f))
 343         (and (<= #xa960 cp) (<= cp #xa97c))) :L)
 344       ((or
 345         (and (<= #x1160 cp) (<= cp #x11a7))
 346         (and (<= #xd7B0 cp) (<= cp #xd7C6))) :V)
 347       ((or
 348         (and (<= #x11a8 cp) (<= cp #x11ff))
 349         (and (<= #xd7c8 cp) (<= cp #xd7fb))) :T)
 350       ((and (<= #xac00 cp) (<= cp #xd7a3))
 351        (if (= 0 (rem (- cp #xac00) 28)) :LV :LVT)))))
 352
 353 (defun line-break-class (character &key resolve)
 354   "Returns the line breaking class of CHARACTER, as specified in UAX #14.
 355 If :RESOLVE is NIL, returns the character class found in the property file.
 356 If :RESOLVE is non-NIL, centain line-breaking classes will be mapped to othec
 357 classes as specified in the applicable standards. Addinionally, if :RESOLVE
 358 is :EAST-ASIAN, Ambigious (class :AI) characters will be mapped to the
 359 Ideographic (:ID) class instead of Alphabetic (:AL)."
 360   (when (and resolve (listp character)) (setf character (car character)))
 361   (when (and resolve (not character)) (return-from line-break-class :nil))
 362   (let ((raw-class
 363          (svref-or-null *line-break-classes*
 364                         (aref **character-misc-database** (+ 7 (misc-index character)))))
 365         (syllable-type (hangul-syllable-type character)))
 366     (when syllable-type
 367       (setf raw-class
 368             (cdr (assoc syllable-type
 369                         '((:l . :JL) (:v . :JV) (:t . :JT)
 370                           (:lv . :H2) (:lvt . :H3))))))
 371     (when resolve
 372       (setf raw-class
 373             (case raw-class
 374               (:ai (if (eql resolve :east-asion) :ID :AL))
 375               ; If we see :CM when resolving, we have a CM that isn't subject
 376               ; to LB9, so we do LB10
 377               ((:xx :cm) :al)
 378               (:sa (if (member (general-category character) '(:Mn :Mc))
 379                        :CM :AL))
 380               (:cj :ns)
 381               (:sg (error "The character ~S is a surrogate, which should not
 382 appear in an SBCL string. The line-breaking behavior of surrogates is undefined."
 383                           character))
 384               (t raw-class))))
 385     raw-class))
 386
 387 (defun uppercase-p (character)
 388   "Returns T if CHARACTER has the Unicode property Uppercase and NIL otherwise"
 389   (or (eql (general-category character) :Lu) (proplist-p character :other-uppercase)))
 390
 391 (defun lowercase-p (character)
 392   "Returns T if CHARACTER has the Unicode property Lowercase and NIL otherwise"
 393   (or (eql (general-category character) :Ll) (proplist-p character :other-lowercase)))
 394
 395 (defun cased-p (character)
 396   "Returns T if CHARACTER has a (Unicode) case, and NIL otherwise"
 397   (or (uppercase-p character) (lowercase-p character)
 398       (eql (general-category character) :Lt)))
 399
 400 (defun case-ignorable-p (character)
 401   "Returns T if CHARACTER is Case Ignorable as defined in Unicode 6.3, Chapter
 402 3"
 403   (or (member (general-category character)
 404               '(:Mn :Me :Cf :Lm :Sk))
 405       (member (word-break-class character)
 406               '(:midletter :midnumlet :single-quote))))
 407
 408 (defun alphabetic-p (character)
 409   "Returns T if CHARACTER is Alphabetic according to the Unicode standard
 410 and NIL otherwise"
 411   (or (member (general-category character) '(:Lu :Ll :Lt :Lm :Lo :Nl))
 412       (proplist-p character :other-alphabetic)))
 413
 414 (defun ideographic-p (character)
 415   "Returns T if CHARACTER has the Unicode property Ideographic,
 416 which loosely corresponds to the set of \"Chinese characters\""
 417   (proplist-p character :ideographic))
 418
 419 (defun math-p (character)
 420   "Returns T if CHARACTER is a mathematical symbol according to Unicode and
 421 NIL otherwise"
 422   (or (eql (general-category character) :sm) (proplist-p character :other-math)))
 423
 424 (defun whitespace-p (character)
 425   "Returns T if CHARACTER is whitespace according to Unicode
 426 and NIL otherwise"
 427   (proplist-p character :white-space))
 428
 429 (defun hex-digit-p (character &key ascii)
 430   "Returns T if CHARACTER is a hexadecimal digit and NIL otherwise.
 431 If :ASCII is non-NIL, fullwidth equivalents of the Latin letters A through F
 432 are excluded."
 433   (proplist-p character (if ascii :ascii-hex-digit :hex-digit)))
 434
 435 (defun soft-dotted-p (character)
 436   "Returns T if CHARACTER has a soft dot (such as the dots on i and j) which
 437 disappears when accents are placed on top of it. and NIL otherwise"
 438   (proplist-p character :soft-dotted))
 439
 440 (defun default-ignorable-p (character)
 441   "Returns T if CHARACTER is a Default_Ignorable_Code_Point"
 442   (and
 443    (or (proplist-p character :other-default-ignorable-code-point)
 444        (eql (general-category character) :cf)
 445        (proplist-p character :variation-selector))
 446    (not
 447     (or (whitespace-p character)
 448         (ordered-ranges-member
 449          (char-code character)
 450          #(#x0600 #x0604 #x06DD #x06DD #x070F #x070F #xFFF9 #xFFFB
 451            #x110BD #x110BD))))))
 452
 453 \f
 454 ;;; Implements UAX#15: Normalization Forms
 455 (declaim (inline char-decomposition-info))
 456 (defun char-decomposition-info (char)
 457   (let ((value (aref **character-misc-database**
 458                      (+ 4 (misc-index char)))))
 459     (values (clear-flag 7 value) (logbitp 7 value))))
 460
 461 (defun char-decomposition (char length callback)
 462   (declare (function callback))
 463   ;; Caller should have gotten length from char-decomposition-info
 464   (let* ((cp (char-code char))
 465          (cp-high (ash cp -8))
 466          (decompositions **character-decompositions**)
 467          (high-page (aref **character-high-pages** cp-high))
 468          (index (unless (logbitp 15 high-page) ;; Hangul syllable
 469                   (aref **character-low-pages**
 470                         (+ 1 (* 2 (+ (ldb (byte 8 0) cp) (ash high-page 8))))))))
 471     (cond ((= length 1)
 472            (funcall callback (code-char (aref decompositions index))))
 473           ((<= #xac00 cp #xd7a3)
 474            ;; see Unicode 6.2, section 3-12
 475            (let* ((sbase #xac00)
 476                   (lbase #x1100)
 477                   (vbase #x1161)
 478                   (tbase #x11a7)
 479                   (vcount 21)
 480                   (tcount 28)
 481                   (ncount (* vcount tcount))
 482                   (sindex (- cp sbase))
 483                   (lindex (floor sindex ncount))
 484                   (vindex (floor (mod sindex ncount) tcount))
 485                   (tindex (mod sindex tcount)))
 486              (funcall callback (code-char (+ lbase lindex)))
 487              (funcall callback (code-char (+ vbase vindex)))
 488              (when (> tindex 0)
 489                (funcall callback  (code-char (+ tbase tindex))))))
 490
 491           (t
 492            (loop for i below length
 493                  do
 494                  (funcall callback (code-char (aref decompositions (+ index i)))))))))
 495
 496 (defun decompose-char (char compatibility callback)
 497   (declare (function callback))
 498   (multiple-value-bind (info compat) (char-decomposition-info char)
 499     (if (and (plusp info)
 500              (or compatibility
 501                  (not compat)))
 502         (if compatibility
 503             (dx-flet ((callback (char)
 504                         (decompose-char char t callback)))
 505               (char-decomposition char info #'callback))
 506             (char-decomposition char info callback))
 507         (funcall callback char))))
 508
 509 (defun decompose-string (string compatibility filter)
 510   (let (chars
 511         (length 0)
 512         (previous-combining-class 0))
 513     (declare (type index length))
 514     (dx-flet ((callback (char)
 515                         (let ((combining-class (combining-class char)))
 516                           (incf length)
 517                           (cond ((< 0 combining-class previous-combining-class)
 518                                  ;; Ensure it's sorted
 519                                  (loop for cons on chars
 520                                        for next-char = (cadr cons)
 521                                        when (or (not next-char)
 522                                                 (<= 0 (combining-class next-char) combining-class))
 523                                        do (setf (cdr cons)
 524                                                 (cons char (cdr cons)))
 525                                           (return)))
 526                                 (t
 527                                  (push char chars)
 528                                  (setf previous-combining-class combining-class))))))
 529       (sb!kernel:with-array-data ((string string) (start) (end)
 530                                   :check-fill-pointer t)
 531         (let ((calback (if filter
 532                            (let ((filter (sb!kernel:%coerce-callable-to-fun filter)))
 533                              (lambda (char)
 534                                (when (funcall filter char)
 535                                  (callback char))))
 536                            #'callback)))
 537           (loop for i from start below end
 538                 for char = (schar string i)
 539                 do
 540                 (decompose-char char compatibility calback))))
 541       (let ((result (make-string length)))
 542         (loop for char in chars
 543               for i from (1- length) downto 0
 544               do (setf (schar result i) char))
 545         result))))
 546
 547 (defun composition-hangul-syllable-type (cp)
 548   (cond
 549     ((and (<= #x1100 cp) (<= cp #x1112)) :L)
 550     ((and (<= #x1161 cp) (<= cp #x1175)) :V)
 551     ((and (<= #x11a8 cp) (<= cp #x11c2)) :T)
 552     ((and (<= #xac00 cp) (<= cp #.(+ #xac00 11171)))
 553      (if (= 0 (rem (- cp #xac00) 28)) :LV :LVT))))
 554
 555 (defun primary-composition (char1 char2)
 556   (flet ((maybe (fn x) (when x (funcall fn x))))
 557     (let ((c1 (char-code char1))
 558           (c2 (char-code char2)))
 559       (maybe
 560        #'code-char
 561        (cond
 562          ((gethash (dpb c1 (byte 21 21) c2)
 563                    **character-primary-compositions**))
 564          ((and (eql (composition-hangul-syllable-type c1) :L)
 565                (eql (composition-hangul-syllable-type c2) :V))
 566           (let ((lindex (- c1 #x1100))
 567                 (vindex (- c2 #x1161)))
 568             (+ #xac00 (* lindex 588) (* vindex 28))))
 569          ((and (eql (composition-hangul-syllable-type c1) :LV)
 570                (eql (composition-hangul-syllable-type c2) :T))
 571           (+ c1 (- c2 #x11a7))))))))
 572
 573 ;;; This implements a sequence data structure, specialized for
 574 ;;; efficient deletion of characters at an index, along with tolerable
 575 ;;; random access.  The purpose is to support the canonical
 576 ;;; composition algorithm from Unicode, which involves replacing (not
 577 ;;; necessarily consecutive) pairs of code points with a single code
 578 ;;; point (e.g. [#\e #\combining_acute_accent] with
 579 ;;; #\latin_small_letter_e_with_acute).  The data structure is a list
 580 ;;; of three-element lists, each denoting a chunk of string data
 581 ;;; starting at the first index and ending at the second.
 582 ;;;
 583 ;;; Actually, the implementation isn't particularly efficient, and
 584 ;;; would probably benefit from being rewritten in terms of displaced
 585 ;;; arrays, which would substantially reduce copying.
 586 ;;;
 587 ;;; (also, generic sequences.  *sigh*.)
 588 (defun lref (lstring index)
 589   (dolist (l lstring)
 590     (when (and (<= (first l) index)
 591                (< index (second l)))
 592       (return (aref (third l) (- index (first l)))))))
 593
 594 (defun (setf lref) (newchar lstring index)
 595   (dolist (l lstring)
 596     (when (and (<= (first l) index)
 597                (< index (second l)))
 598       (return (setf (aref (third l) (- index (first l))) newchar)))))
 599
 600 (defun llength (lstring)
 601   (second (first (last lstring))))
 602
 603 (defun lstring (lstring)
 604   (let ((result (make-string (llength lstring))))
 605     (dolist (l lstring result)
 606       (replace result (third l) :start1 (first l) :end1 (second l)))))
 607
 608 (defun ldelete (lstring index)
 609   (do* ((ls lstring (cdr ls))
 610         (l (car ls) (car ls))
 611         so-fars)
 612        ((and (<= (first l) index)
 613              (< index (second l)))
 614         (append
 615          (nreverse so-fars)
 616          (cond
 617            ((= (first l) index)
 618             (list (list (first l) (1- (second l)) (subseq (third l) 1))))
 619            ((= index (1- (second l)))
 620             (list (list (first l) (1- (second l)) (subseq (third l) 0 (1- (length (third l)))))))
 621            (t
 622             (list
 623              (list (first l) index
 624                    (subseq (third l) 0 (- index (first l))))
 625              (list index (1- (second l))
 626                    (subseq (third l) (1+ (- index (first l))))))))
 627          (mapcar (lambda (x) (list (1- (first x)) (1- (second x)) (third x)))
 628                  (cdr ls))))
 629     (push l so-fars)))
 630
 631 (defun canonically-compose (string)
 632   (let* ((result (list (list 0 (length string) string)))
 633          (previous-starter-index (position 0 string :key #'combining-class))
 634          (i (and previous-starter-index (1+ previous-starter-index))))
 635     (when (or (not i) (= i (length string)))
 636       (return-from canonically-compose string))
 637     (tagbody
 638      again
 639        (when (and (>= (- i previous-starter-index) 2)
 640                   ;; test for Blocked (Unicode 3.11 para. D115)
 641                   ;;
 642                   ;; (assumes here that string has sorted combiners,
 643                   ;; so can look back just one step)
 644                   (>= (combining-class (lref result (1- i)))
 645                       (combining-class (lref result i))))
 646          (when (= (combining-class (lref result i)) 0)
 647            (setf previous-starter-index i))
 648          (incf i)
 649          (go next))
 650
 651        (let ((comp (primary-composition (lref result previous-starter-index)
 652                                         (lref result i))))
 653          (cond
 654            (comp
 655             (setf (lref result previous-starter-index) comp)
 656             (setf result (ldelete result i)))
 657            (t
 658             (when (= (combining-class (lref result i)) 0)
 659               (setf previous-starter-index i))
 660             (incf i))))
 661      next
 662        (unless (= i (llength result))
 663          (go again)))
 664     (if (= i (length string))
 665         string
 666         (lstring result))))
 667
 668 (defun normalize-string (string &optional (form :nfd)
 669                                           filter)
 670   "Normalize STRING to the Unicode normalization form form.
 671 Acceptable values for form are :NFD, :NFC, :NFKD, and :NFKC.
 672 If FILTER is a function it is called on each decomposed character and
 673 only characters for which it returns T are collected."
 674   (declare (type (member :nfd :nfkd :nfc :nfkc) form))
 675   #!-sb-unicode
 676   (declare (ignore filter))
 677   #!-sb-unicode
 678   (etypecase string
 679     ((array nil (*)) string)
 680     (string
 681      (ecase form
 682        ((:nfc :nfkc) string)
 683        ((:nfd :nfkd) (error "Cannot normalize to ~A form in #-SB-UNICODE builds" form)))))
 684   #!+sb-unicode
 685   (etypecase string
 686     (base-string string)
 687     ((array character (*))
 688      (ecase form
 689        ((:nfc)
 690         (canonically-compose (decompose-string string nil filter)))
 691        ((:nfd)
 692         (decompose-string string nil filter))
 693        ((:nfkc)
 694         (canonically-compose (decompose-string string t filter)))
 695        ((:nfkd)
 696         (decompose-string string t filter))))
 697     ((array nil (*)) string)))
 698
 699 (defun normalized-p (string &optional (form :nfd))
 700   "Tests if STRING is normalized to FORM"
 701   ;; FIXME: can be optimized
 702   (string= string (normalize-string string form)))
 703
 704 \f
 705 ;;; Unicode case algorithms
 706 ;; FIXME: Make these parts less redundant (macro?)
 707 (sb!ext:defglobal **special-titlecases**
 708   '#.(sb-cold:read-from-file "output/titlecases.lisp-expr"))
 709
 710 (sb!ext:defglobal **special-casefolds**
 711   '#.(sb-cold:read-from-file "output/foldcases.lisp-expr"))
 712
 713 (defun has-case-p (char)
 714   ;; Bit 6 is the Unicode case flag, as opposed to the Common Lisp one
 715   (logbitp 6 (aref **character-misc-database** (+ 5 (misc-index char)))))
 716
 717 (defun char-uppercase (char)
 718   (if (has-case-p char)
 719       (let ((cp (car (char-case-info char))))
 720         (if (atom cp) (list (code-char cp)) (mapcar #'code-char cp)))
 721       (list char)))
 722
 723 (defun char-lowercase (char)
 724   (if (has-case-p char)
 725       (let ((cp (cdr (char-case-info char))))
 726         (if (atom cp) (list (code-char cp)) (mapcar #'code-char cp)))
 727       (list char)))
 728
 729 (defun char-titlecase (char)
 730   (unless (has-case-p char) (return-from char-titlecase (list char)))
 731   (let* ((cp (char-code char))
 732          (value (assoc cp **special-titlecases**)))
 733     (if value
 734         (if (atom (cdr value))
 735             (list (code-char (cdr value)))
 736             (mapcar #'code-char (cdr value)))
 737         (char-uppercase char))))
 738
 739 (defun char-foldcase (char)
 740   (unless (has-case-p char) (return-from char-foldcase (list char)))
 741   (let* ((cp (char-code char))
 742          (value (assoc cp **special-casefolds**)))
 743     (if value
 744         (if (atom (cdr value))
 745             (list (code-char (cdr value)))
 746             (mapcar #'code-char (cdr value)))
 747         (char-lowercase char))))
 748
 749 (defun string-somethingcase (fn string special-fn)
 750   (let (result (len (length string)))
 751     (loop for index from 0 below len
 752        for char = (char string index)
 753        for cased = (or (funcall special-fn char index len)
 754                        (funcall fn char))
 755        do (loop for c in (remove :none cased) do (push c result)))
 756     (setf result (nreverse result))
 757     (coerce result 'string)))
 758
 759 (declaim (type function sb!unix::posix-getenv))
 760 (defun get-user-locale ()
 761   (let ((raw-locale
 762          #!+(or win32 unix) (or (sb!unix::posix-getenv "LC_ALL")
 763                                 (sb!unix::posix-getenv "LANG"))
 764          #!-(or win32 unix) nil))
 765     (when raw-locale
 766       (let ((lang-code (string-upcase
 767                         (subseq raw-locale 0 (position #\_ raw-locale)))))
 768         (when lang-code
 769           (intern lang-code "KEYWORD"))))))
 770
 771
 772 (defun uppercase (string &key locale)
 773   "Returns the full uppercase of STRING according to the Unicode standard.
 774 The result is not guaranteed to have the same length as the input. If :LOCALE
 775 is NIL, no language-specific case transformations are applied. If :LOCALE is a
 776 keyword representing a two-letter ISO country code, the case transforms of that
 777 locale are used. If :LOCALE is T, the user's current locale is used (Unix and
 778 Win32 only)."
 779   (when (eq locale t) (setf locale (get-user-locale)))
 780   (string-somethingcase
 781    #'char-uppercase string
 782    #!-sb-unicode (constantly nil)
 783    #!+sb-unicode ;; code-char with a constant > 255 breaks the build
 784    #'(lambda (char index len)
 785        (declare (ignore len))
 786        (cond
 787          ((and (eql locale :lt) (char= char (code-char #x0307))
 788                   (loop for i from (1- index) downto 0
 789                      for c = (char string i)
 790                      do (case (combining-class c)
 791                           (0 (return (soft-dotted-p c)))
 792                           (230 (return nil))
 793                           (t t))
 794                      finally (return nil)))
 795           '(:none))
 796          ((and (or (eql locale :tr) (eql locale :az))
 797                (char= char #\i))
 798           (list (code-char #x0130)))
 799          (t nil)))))
 800
 801 (defun lowercase (string &key locale)
 802   "Returns the full lowercase of STRING according to the Unicode standard.
 803 The result is not guaranteed to have the same length as the input.
 804 :LOCALE has the same semantics as the :LOCALE argument to UPPERCASE."
 805   (when (eq locale t) (setf locale (get-user-locale)))
 806   (string-somethingcase
 807    #'char-lowercase string
 808    #!-sb-unicode (constantly nil)
 809    #!+sb-unicode
 810    #'(lambda (char index len)
 811        (cond
 812          ((and (char= char (code-char #x03A3))
 813                (loop for i from (1- index) downto 0
 814                   for c = (char string i)
 815                   do (cond ((cased-p c) (return t))
 816                            ((case-ignorable-p c))
 817                            (t (return nil)))
 818                   finally (return nil))
 819                (loop for i from (1+ index) below len
 820                   for c = (char string i)
 821                   do (cond ((cased-p c) (return nil))
 822                            ((case-ignorable-p c))
 823                            (t (return t)))
 824                   finally (return t)))
 825           (list (code-char #x03C2)))
 826        ((eql locale :lt)
 827         (mapcar
 828          #'code-char
 829          (cdr (or
 830                (assoc (char-code char)
 831                       '((#x00CC . (#x0069 #x0307 #x0300))
 832                         (#x00CD . (#x0069 #x0307 #x0301))
 833                         (#x0128 . (#x0069 #x0307 #x0303))))
 834                (and (loop for i from (1+ index) below len
 835                        for c = (char string i)
 836                        do (case (combining-class c)
 837                             (230 (return t))
 838                             (0 (return nil))
 839                             (t t))
 840                        finally (return nil))
 841                     (assoc (char-code char)
 842                            '((#x0049 . (#x0069 #x0307))
 843                              (#x004A . (#x006A #x0307))
 844                              (#x012E . (#x012F #x0307)))))))))
 845        ((or (eql locale :tr) (eql locale :az))
 846         (cond
 847           ((char= char (code-char #x0130)) (list #\i))
 848           ((and (char= char (code-char #x0307))
 849                 (loop for i from (1- index) downto 0
 850                    for c = (char string i)
 851                    do (case (combining-class c)
 852                         (0 (return (char= c #\I)))
 853                         (230 (return nil))
 854                         (t t))
 855                    finally (return nil)))
 856            '(:none))
 857           ((and (char= char #\I)
 858                 (loop for i from (1+ index) below len
 859                    for c = (char string i)
 860                    do (case (combining-class c)
 861                         (0 (return t))
 862                         (230 (return (char/= c (code-char #x0307))))
 863                         (t t))
 864                    finally (return t)))
 865            (list (code-char #x0131)))
 866           (t nil)))
 867        (t nil)))))
 868
 869 (defun titlecase (string &key locale)
 870   "Returns the titlecase of STRING. The resulting string can
 871 be longer than the input.
 872 :LOCALE has the same semantics as the :LOCALE argument to UPPERCASE."
 873   (when (eq locale t) (setf locale (get-user-locale)))
 874   (let ((words (words string))
 875         (cased nil))
 876    (loop for word in words
 877       for first-cased = (or (position-if #'cased-p word) 0)
 878       for pre = (subseq word 0 first-cased)
 879       for initial = (char word first-cased)
 880       for rest = (subseq word (1+ first-cased))
 881       do (let ((up (char-titlecase initial)) (down (lowercase rest)))
 882            #!+sb-unicode
 883            (when (and (or (eql locale :tr) (eql locale :az))
 884                       (eql initial #\i))
 885              (setf up (list (code-char #x0130))))
 886            #!+sb-unicode
 887            (when (and (eql locale :lt)
 888                       (soft-dotted-p initial)
 889                       (eql (char down
 890                                  (position-if
 891                                   #'(lambda (c)
 892                                       (or (eql (combining-class c) 0)
 893                                           (eql (combining-class c) 230))) down))
 894                            (code-char #x0307)))
 895              (setf down (delete (code-char #x0307) down :count 1)))
 896            (push (concatenate 'string pre up down) cased)))
 897    (apply #'concatenate 'string (nreverse cased))))
 898
 899 (defun casefold (string)
 900   "Returns the full casefolding of STRING according to the Unicode standard.
 901 Casefolding removes case information in a way that allows the results to be used
 902 for case-insensitive comparisons.
 903 The result is not guaranteed to have the same length as the input."
 904   (string-somethingcase #'char-foldcase string (constantly nil)))
 905
 906 \f
 907 ;;; Unicode break algorithms
 908 ;;; In all the breaking methods:
 909 ;;; (brk) establishes a break between `first` and `second`
 910 ;;; (nobrk) prevents a break between `first` and `second`
 911 ;;; Setting flag=T/state=:nobrk-next prevents a break between `second` and `htird`
 912
 913 ;; Word breaking sets this to make their algorithms less tricky
 914 (defvar *other-break-special-graphemes* nil)
 915 (defun grapheme-break-class (char)
 916   "Returns the grapheme breaking class of CHARACTER, as specified in UAX #29."
 917   (let ((cp (when char (char-code char)))
 918         (gc (when char (general-category char)))
 919         (not-spacing-mark
 920          #(#x102B #x102C #x1038 #x1062 #x1063 #x1064 #x1067 #x1068 #x1069
 921            #x106A #x106B #x106C #x106D #x1083 #x1087 #x1088 #x1089 #x108A
 922            #x108B #x108C #x108F #x109A #x109B #x109C #x19B0 #x19B1 #x19B2
 923            #x19B3 #x19B4 #x19B8 #x19B9 #x19BB #x19BC #x19BD #x19BE #x19BF
 924            #x19C0 #x19C8 #x19C9 #x1A61 #x1A63 #x1A64 #xAA7B #xAA7D)))
 925     (cond
 926       ((not char) nil)
 927       ((= cp 10) :LF)
 928       ((= cp 13) :CR)
 929       ((or (member gc '(:Mn :Me))
 930            (proplist-p char :other-grapheme-extend)
 931            (and *other-break-special-graphemes*
 932                 (member gc '(:Mc :Cf)) (not (<= #x200B cp #x200D))))
 933        :extend)
 934       ((or (member gc '(:Zl :Zp :Cc :Cs :Cf))
 935            ;; From Cn and Default_Ignorable_Code_Point
 936            (eql cp #x2065) (eql cp #xE0000)
 937            (<= #xFFF0 cp #xFFF8)
 938            (<= #xE0002 cp #xE001F)
 939            (<= #xE0080 cp #xE00FF)
 940            (<= #xE01F0 cp #xE0FFF)) :control)
 941       ((<= #x1F1E6 cp #x1F1FF) :regional-indicator)
 942       ((and (or (eql gc :Mc)
 943                 (eql cp #x0E33) (eql cp #x0EB3))
 944             (not (binary-search cp not-spacing-mark))) :spacing-mark)
 945       (t (hangul-syllable-type char)))))
 946
 947 (macrolet ((def (name extendedp)
 948              `(defun ,name (function string)
 949                 (do ((length (length string))
 950                      (start 0)
 951                      (end 1 (1+ end))
 952                      (c1 nil)
 953                      (c2 (and (> (length string) 0) (grapheme-break-class (char string 0)))))
 954                     ((>= end length)
 955                      (if (= end length) (progn (funcall function string start end) nil)))
 956                   (flet ((brk () (funcall function string start end) (setf start end)))
 957                     (declare (truly-dynamic-extent #'brk))
 958                     (shiftf c1 c2 (grapheme-break-class (char string end)))
 959                     (cond
 960                       ((and (eql c1 :cr) (eql c2 :lf)))
 961                       ((or (member c1 '(:control :cr :lf))
 962                            (member c2 '(:control :cr :lf)))
 963                        (brk))
 964                       ((or (and (eql c1 :l) (member c2 '(:l :v :lv :lvt)))
 965                            (and (or (eql c1 :v) (eql c1 :lv))
 966                                 (or (eql c2 :v) (eql c2 :t)))
 967                            (and (eql c2 :t) (or (eql c1 :lvt) (eql c1 :t)))))
 968                       ((and (eql c1 :regional-indicator) (eql c2 :regional-indicator)))
 969                       ((eql c2 :extend))
 970                       ,@(when extendedp
 971                               `(((or (eql c2 :spacing-mark) (eql c1 :prepend)))))
 972                       (t (brk))))))))
 973   (def map-legacy-grapheme-boundaries nil)
 974   (def map-grapheme-boundaries t))
 975
 976 (macrolet ((def (name mapper)
 977              `(defun ,name (function string)
 978                 (let ((array (make-array 0 :element-type (array-element-type string) :adjustable t :displaced-to string)))
 979                   (flet ((fun (string start end)
 980                            (declare (type string string))
 981                            (funcall function (adjust-array array (- end start) :displaced-to string :displaced-index-offset start))))
 982                     (declare (truly-dynamic-extent #'fun))
 983                     (,mapper #'fun string))))))
 984   (def map-legacy-graphemes map-legacy-grapheme-boundaries)
 985   (def map-graphemes map-grapheme-boundaries))
 986
 987 (defun graphemes (string)
 988   "Breaks STRING into graphemes acording to the default
 989 grapheme breaking rules specified in UAX #29, returning a list of strings."
 990   (let (result)
 991     (map-graphemes (lambda (a) (push (subseq a 0) result)) string)
 992     (nreverse result)))
 993
 994 (defun word-break-class (char)
 995   "Returns the word breaking class of CHARACTER, as specified in UAX #29."
 996   ;; Words use graphemes as characters to deal with the ignore rule
 997   (when (listp char) (setf char (car char)))
 998   (let ((cp (when char (char-code char)))
 999         (gc (when char (general-category char)))
1000         (newlines #(#xB #xC #x0085 #x0085 #x2028 #x2029))
1001         (also-katakana
1002          #(#x3031 #x3035 #x309B #x309C
1003            #x30A0 #x30A0 #x30FC #x30FC
1004            #xFF70 #xFF70))
1005         (midnumlet #(#x002E #x2018 #x2019 #x2024 #xFE52 #xFF07 #xFF0E))
1006         (midletter
1007          #(#x003A #x00B7 #x002D7 #x0387 #x05F4 #x2027 #xFE13 #xFE55 #xFF1A))
1008         (midnum
1009          ;; Grepping of Line_Break = IS adjusted per UAX #29
1010          #(#x002C #x003B #x037E #x0589 #x060C #x060D #x066C #x07F8 #x2044
1011            #xFE10 #xFE14 #xFE50 #xFE54 #xFF0C #xFF1B)))
1012     (cond
1013       ((not char) nil)
1014       ((= cp 10) :LF)
1015       ((= cp 13) :CR)
1016       ((= cp #x27) :single-quote)
1017       ((= cp #x22) :double-quote)
1018       ((ordered-ranges-member cp newlines) :newline)
1019       ((or (eql (grapheme-break-class char) :extend)
1020            (eql gc :mc)) :extend)
1021       ((<= #x1F1E6 cp #x1F1FF) :regional-indicator)
1022       ((and (eql gc :Cf) (not (<= #x200B cp #x200D))) :format)
1023       ((or (eql (script char) :katakana)
1024            (ordered-ranges-member cp also-katakana)) :katakana)
1025       ((and (eql (script char) :Hebrew) (eql gc :lo)) :hebrew-letter)
1026       ((and (or (alphabetic-p char) (= cp #x05F3))
1027             (not (or (ideographic-p char)
1028                      (eql (line-break-class char) :sa)
1029                      (eql (script char) :hiragana)))) :aletter)
1030       ((binary-search cp midnumlet) :midnumlet)
1031       ((binary-search cp midletter) :midletter)
1032       ((binary-search cp midnum) :midnum)
1033       ((or (and (eql gc :Nd) (not (<= #xFF10 cp #xFF19))) ;Fullwidth digits
1034            (eql cp #x066B)) :numeric)
1035       ((eql gc :Pc) :extendnumlet)
1036       (t nil))))
1037
1038 (defmacro flatpush (thing list)
1039   (let ((%thing (gensym)) (%i (gensym)))
1040     `(let ((,%thing ,thing))
1041        (if (listp ,%thing)
1042            (dolist (,%i ,%thing)
1043              (push ,%i ,list))
1044            (push ,%thing ,list)))))
1045
1046 (defun words (string)
1047   "Breaks STRING into words acording to the default
1048 word breaking rules specified in UAX #29. Returns a list of strings"
1049   (let ((chars (mapcar
1050                  #'(lambda (s)
1051                      (let ((l (coerce s 'list)))
1052                        (if (cdr l) l (car l))))
1053                  (let ((*other-break-special-graphemes* t)) (graphemes string))))
1054          words word flag)
1055     (flatpush (car chars) word)
1056     (do ((first (car chars) second)
1057          (tail (cdr chars) (cdr tail))
1058          (second (cadr chars) (cadr tail)))
1059         ((not first) (nreverse (mapcar #'(lambda (l) (coerce l 'string)) words)))
1060       (flet ((brk () (push (nreverse word) words) (setf word nil) (flatpush second word))
1061              (nobrk () (flatpush second word)))
1062         (let ((c1 (word-break-class first))
1063               (c2 (word-break-class second))
1064               (c3 (when (and tail (cdr tail)) (word-break-class (cadr tail)))))
1065           (cond
1066             (flag (nobrk) (setf flag nil))
1067             ;; CR+LF are bound together by the grapheme clustering
1068             ((or (eql c1 :newline) (eql c1 :cr) (eql c1 :lf)
1069                  (eql c2 :newline) (eql c2 :cr) (eql c2 :lf)) (brk))
1070             ((or (eql c2 :format) (eql c2 :extend)) (nobrk))
1071             ((and (or (eql c1 :aletter) (eql c1 :hebrew-letter))
1072                   (or (eql c2 :aletter) (eql c2 :hebrew-letter))) (nobrk))
1073             ((and (or (eql c1 :aletter) (eql c1 :hebrew-letter))
1074                   (member c2 '(:midletter :midnumlet :single-quote))
1075                   (or (eql c3 :aletter) (eql c3 :hebrew-letter)))
1076              (nobrk) (setf flag t)) ; Handle the multiple breaks from this rule
1077             ((and (eql c1 :hebrew-letter) (eql c2 :double-quote)
1078                   (eql c3 :hebrew-letter))
1079              (nobrk) (setf flag t))
1080             ((and (eql c1 :hebrew-letter) (eql c2 :single-quote)) (nobrk))
1081             ((or (and (eql c1 :numeric) (member c2 '(:numeric :aletter :hebrew-letter)))
1082                  (and (eql c2 :numeric) (member c1 '(:numeric :aletter :hebrew-letter))))
1083              (nobrk))
1084             ((and (eql c1 :numeric)
1085                   (member c2 '(:midnum :midnumlet :single-quote))
1086                   (eql c3 :numeric))
1087              (nobrk) (setf flag t))
1088             ((and (eql c1 :katakana) (eql c2 :katakana)) (nobrk))
1089             ((or (and (member c1
1090                               '(:aletter :hebrew-letter :katakana
1091                                 :numeric :extendnumlet)) (eql c2 :extendnumlet))
1092                  (and (member c2
1093                               '(:aletter :hebrew-letter :katakana
1094                                 :numeric :extendnumlet)) (eql c1 :extendnumlet)))
1095              (nobrk))
1096             ((and (eql c1 :regional-indicator) (eql c2 :regional-indicator)) (nobrk))
1097             (t (brk))))))))
1098
1099 (defun sentence-break-class (char)
1100   "Returns the sentence breaking class of CHARACTER, as specified in UAX #29."
1101   (when (listp char) (setf char (car char)))
1102   (let ((cp (when char (char-code char)))
1103         (gc (when char (general-category char)))
1104         (aterms #(#x002E #x2024 #xFE52 #xFF0E))
1105         (scontinues
1106          #(#x002C #x002D #x003A #x055D #x060C #x060D #x07F8 #x1802 #x1808
1107            #x2013 #x2014 #x3001 #xFE10 #xFE11 #xFE13 #xFE31 #xFE32 #xFE50
1108            #xFE51 #xFE55 #xFE58 #xFE63 #xFF0C #xFF0D #xFF1A #xFF64)))
1109     (cond
1110       ((not char) nil)
1111       ((= cp 10) :LF)
1112       ((= cp 13) :CR)
1113       ((or (eql (grapheme-break-class char) :extend)
1114            (eql gc :mc)) :extend)
1115       ((or (eql cp #x0085) (<= #x2028 cp #x2029)) :sep)
1116       ((and (eql gc :Cf) (not (<= #x200C cp #x200D))) :format)
1117       ((whitespace-p char) :sp)
1118       ((lowercase-p char) :lower)
1119       ((or (uppercase-p char) (eql gc :Lt)) :upper)
1120       ((or (alphabetic-p char) (eql cp #x00A0) (eql cp #x05F3)) :oletter)
1121       ((or (and (eql gc :Nd) (not (<= #xFF10 cp #xFF19))) ;Fullwidth digits
1122            (<= #x066B cp #x066C)) :numeric)
1123       ((binary-search cp aterms) :aterm)
1124       ((binary-search cp scontinues) :scontinue)
1125       ((proplist-p char :sterm) :sterm)
1126       ((and (or (member gc '(:Po :Ps :Pe :Pf :Pi))
1127                 (eql (line-break-class char) :qu))
1128             (not (eql cp #x05F3))) :close)
1129       (t nil))))
1130
1131 (defun sentence-prebreak (string)
1132   "Pre-combines some sequences of characters to make the sentence-break
1133 algorithm simpler..
1134 Specifically,
1135 - Combines any character with the following extend of format characters
1136 - Combines CR + LF into '(CR LF)
1137 - Combines any run of :cp*:close* into one character"
1138   (let ((chars (coerce string 'list))
1139         cluster clusters last-seen sp-run)
1140     (labels ((flush () (if (cdr cluster) (push (nreverse cluster) clusters)
1141                            (if cluster (push (car cluster) clusters)))
1142                     (setf cluster nil))
1143              (brk (x)
1144                (flush) (push x clusters))
1145              (nobrk (x) (push x cluster)))
1146     (loop for ch in chars
1147        for type = (sentence-break-class ch)
1148        do (cond
1149             ((and (eql last-seen :cr) (eql type :lf)) (nobrk ch) (flush) (setf last-seen nil))
1150             ((eql last-seen :cr) (brk ch) (setf last-seen nil))
1151             ((eql type :cr) (nobrk ch) (setf last-seen :cr))
1152             ((eql type :lf) (brk ch) (setf last-seen nil))
1153             ((eql type :sep) (brk ch) (setf last-seen nil))
1154             ((and last-seen (or (eql type :extend) (eql type :format)))
1155              (nobrk ch))
1156             ((eql type :close)
1157              (unless (eql last-seen :close) (flush))
1158              (nobrk ch) (setf last-seen :close sp-run nil))
1159             ((eql type :sp)
1160              (unless (or (and (not sp-run) (eql last-seen :close)) (eql last-seen :sp))
1161                (flush) (setf sp-run t))
1162              (nobrk ch) (setf last-seen :sp))
1163             (t (flush) (nobrk ch) (setf last-seen type sp-run nil))))
1164     (flush) (nreverse clusters))))
1165
1166 (defun sentences (string)
1167   "Breaks STRING into sentences acording to the default
1168 sentence breaking rules specified in UAX #29"
1169   (let ((special-handling '(:close :sp :sep :cr :lf :scontinue :sterm :aterm))
1170         (chars (sentence-prebreak string))
1171         sentence sentences state)
1172     (flatpush (car chars) sentence)
1173     (do ((first (car chars) second)
1174          (tail (cdr chars) (cdr tail))
1175          (second (cadr chars) (cadr tail))
1176          (third (caddr chars) (caddr tail)))
1177         ((not first)
1178          (progn
1179            ; Shake off last sentence
1180            (when sentence (push (nreverse sentence) sentences))
1181            (nreverse (mapcar #'(lambda (l) (coerce l 'string)) sentences))))
1182       (flet ((brk () (push (nreverse sentence) sentences)
1183                   (setf sentence nil) (flatpush second sentence))
1184              (nobrk () (flatpush second sentence)))
1185       (let ((c1 (sentence-break-class first))
1186             (c2 (sentence-break-class second))
1187             (c3 (sentence-break-class third)))
1188         (cond
1189           ((eql state :brk-next) (brk) (setf state nil))
1190           ((eql state :nobrk-next) (nobrk) (setf state nil))
1191           ((member c1 '(:sep :cr :lf)) (brk))
1192           ((and (eql c1 :aterm) (eql c2 :numeric)) (nobrk))
1193           ((and (eql c1 :upper) (eql c2 :aterm)
1194                 (eql c3 :upper)) (nobrk) (setf state :nobrk-next))
1195           ((or (and (member c1 '(:sterm :aterm)) (member c2 '(:close :sp))
1196                     (member c3 '(:scontinue :sterm :aterm)))
1197                (and (member c1 '(:sterm :aterm))
1198                     (member c2 '(:scontinue :sterm :aterm))))
1199            (nobrk) (when (member c2 '(:close :sp)) (setf state :nobrk-next)))
1200           ((and (member c1 '(:sterm :aterm)) (member c2 '(:close :sp))
1201                 (member c3 '(:sep :cr :lf)))
1202            (nobrk) (setf state :nobrk-next)) ;; Let the linebreak call (brk)
1203           ((and (member c1 '(:sterm :aterm)) (member c2 '(:sep :cr :lf)))
1204            (nobrk)) ; Doesn't trigger rule 8
1205           ((eql c1 :sterm) ; Not ambiguous anymore, rule 8a already handled
1206            (if (member c2 '(:close :sp))
1207                (progn (nobrk) (setf state :brk-next))
1208                (brk)))
1209           ((and (eql c2 :sterm) third (not (member c3 special-handling)))
1210            (nobrk) (setf state :brk-next)) ; STerm followed by nothing important
1211           ((or (eql c1 :aterm)
1212                (and (eql c2 :aterm) third
1213                     (not (member c3 special-handling)) (not (eql c3 :numeric))))
1214            ; Finally handle rule 8
1215            (if (loop for c in
1216                     (if (and third (not (or (member c3 special-handling)
1217                                             (eql c3 :numeric))))
1218                         (cdr tail) tail)
1219                   for type = (sentence-break-class c) do
1220                     (when (member type '(:oletter :upper :sep :cr :lf
1221                                          :sterm :aterm))
1222                       (return nil))
1223                     (when (eql type :lower) (return t)) finally (return nil))
1224                ; Ambiguous case
1225                (progn (nobrk) (setf state :nobrk-next))
1226                ; Otherwise
1227                (if (member c2 '(:close :sp :aterm))
1228                    (progn (nobrk) (setf state :brk-next))
1229                    (brk))))
1230           (t (nobrk))))))))
1231
1232 (defun line-prebreak (string)
1233   (let ((chars (coerce string 'list))
1234         cluster clusters last-seen)
1235     (loop for char in chars
1236        for type = (line-break-class char)
1237        do
1238          (when
1239              (and cluster
1240                   (or
1241                    (not (eql type :cm))
1242                    (and (eql type :cm)
1243                         (member last-seen '(nil :BK :CR :LF :NL :SP :ZW)))))
1244            (if (cdr cluster)
1245                (push (nreverse cluster) clusters)
1246                (push (car cluster) clusters))
1247            (setf cluster nil))
1248          (unless (eql type :cm) (setf last-seen type))
1249          (push char cluster))
1250     (if (cdr cluster)
1251         (push (nreverse cluster) clusters)
1252         (push (car cluster) clusters))
1253     (nreverse clusters)))
1254
1255 (defun line-break-annotate (string)
1256   (let ((chars (line-prebreak string))
1257         first second t1 t2 tail (ret (list :cant))
1258         state after-spaces)
1259     (macrolet ((cmpush (thing)
1260                  (let ((gthing (gensym)))
1261                    `(let ((,gthing ,thing))
1262                       (if (listp ,gthing)
1263                           (loop for (c next) on ,gthing do
1264                                (push c ret)
1265                                (when next (push :cant ret)))
1266                           (push ,thing ret)))))
1267                (between (a b action)
1268                  (let ((atest (if (eql a :any) t
1269                                   (if (listp a)
1270                                       `(member t1 ,a)
1271                                       `(eql t1 ,a))))
1272                        (btest (if (eql b :any) t
1273                                   (if (listp b)
1274                                       `(member t2 ,b)
1275                                       `(eql t2 ,b)))))
1276                  `(when (and ,atest ,btest)
1277                     (cmpush ,action)
1278                     (cmpush second)
1279                     (go tail))))
1280                (after-spaces (a b action)
1281                  (let ((atest (if (eql a :any) t
1282                                   (if (listp a)
1283                                       `(member t1 ,a)
1284                                       `(eql t1 ,a))))
1285                        (btest (if (eql b :any) t
1286                                   (if (listp b)
1287                                       `(member type ,b)
1288                                       `(eql type ,b)))))
1289                    `(when
1290                         (and ,atest
1291                              (loop for c in tail
1292                                 for type = (line-break-class c :resolve t)
1293                                 do
1294                                   (when (not (eql type :sp))
1295                                     (return ,btest))))
1296                       (if (eql t2 :sp)
1297                          (progn (cmpush :cant)
1298                                 (cmpush second)
1299                                 (setf state :eat-spaces)
1300                                 (setf after-spaces ,action)
1301                                 (go tail))
1302                          (progn (cmpush ,action)
1303                                 (cmpush second)
1304                                 (go tail)))))))
1305
1306       (cmpush (car chars))
1307       (setf first (car chars))
1308       (setf tail (cdr chars))
1309       (setf second (car tail))
1310       (tagbody
1311          top
1312          (when (not first) (go end))
1313          (setf t1 (line-break-class first :resolve t))
1314          (setf t2 (line-break-class second :resolve t))
1315          (between :any :nil :must)
1316          (when (and (eql state :eat-spaces) (eql t2 :sp))
1317             (cmpush :cant) (cmpush second) (go tail))
1318          (between :bk :any :must)
1319          (between :cr :lf :cant)
1320          (between '(:cr :lf :nl) :any :must)
1321          (between :any '(:zw :bk :cr :lf :nl) :cant)
1322          (when after-spaces (cmpush after-spaces) (cmpush second)
1323                (setf state nil after-spaces nil) (go tail))
1324          (after-spaces :zw :any :can)
1325          (between :any :wj :cant)
1326          (between :wj :any :cant)
1327          (between :gl :any :cant)
1328          (between '(:ZW :WJ :SY :SG :SA :RI :QU :PR :PO :OP :NU :NS :NL
1329                     :LF :IS :IN :ID :HL :GL :EX :CR :CP :CM :CL :CJ :CB
1330                     :BK :BB :B2 :AL :AI :JL :JV :JT :H2 :H3 :XX)
1331                   :gl :cant)
1332          (between :any '(:cl :cp :ex :is :sy) :cant)
1333          (after-spaces :op :any :cant)
1334          (after-spaces :qu :op :cant)
1335          (after-spaces '(:cl :cp) :ns :cant)
1336          (after-spaces :b2 :b2 :cant)
1337          (between :any :sp :cant) ;; Goes here to deal with after-spaces
1338          (between :sp :any :can)
1339          (between :any :qu :cant)
1340          (between :qu :any :cant)
1341          (between :any :cb :can)
1342          (between :cb :any :can)
1343          (between :any '(:ba :hy :ns) :cant)
1344          (between :bb :any :cant)
1345          (when (and (eql t1 :hl) (eql t2 :hy))
1346            (cmpush :cant) (cmpush second)
1347            (setf after-spaces :can) (go tail))
1348          (between '(:al :hl :id :in :nu) :in :cant)
1349          (between :id :po :cant)
1350          (between '(:al :hl) :nu :cant)
1351          (between '(:nu :po) '(:al :hl) :cant)
1352          (between :pr '(:id :al :hl) :cant)
1353          (between '(:cl :cp :nu) '(:po :pr) :cant)
1354          (between :nu '(:po :pr :nu) :cant)
1355          (between '(:po :pr) :op :cant)
1356          (between '(:po :pr :hy :is :sy) :nu :cant)
1357          (between :jl '(:jl :jv :h2 :h3) :cant)
1358          (between '(:jv :h2) '(:jv :jt) :cant)
1359          (between '(:jt :h3) :jt :cant)
1360          (between '(:jl :jv :jt :h2 :h3) '(:in :po) :cant)
1361          (between :pr '(:jl :jv :jt :h2 :h3) :cant)
1362          (between '(:al :hl :is) '(:al :hl) :cant)
1363          (between '(:al :hl :nu) :op :cant)
1364          (between :cp '(:al :hl :nu) :cant)
1365          (between :ri :ri :cant)
1366          (between :any :any :can)
1367          tail
1368          (setf first second)
1369          (setf tail (cdr tail))
1370          (setf second (car tail))
1371          (go top)
1372          end)
1373       ;; LB3 satisfied by (:any :nil) -> :must
1374       (setf ret (nreverse ret))
1375       ret)))
1376
1377 (defun break-list-at (list n)
1378   (let ((tail list) (pre-tail nil))
1379     (loop repeat n do (setf pre-tail tail) (setf tail (cdr tail)))
1380     (setf (cdr pre-tail) nil)
1381     (values list tail)))
1382
1383 (defun lines (string &key (margin *print-right-margin*))
1384   "Breaks STRING into lines that are no wider than :MARGIN according to the
1385 line breaking rules outlined in UAX #14. Combining marks will always be kept
1386 together with their base characters, and spaces (but not other types of
1387 whitespace) will be removed from the end of lines. If :MARGIN is unspecified,
1388 it defaults to 80 characters"
1389   (when (string= string "") (return-from lines (list "")))
1390   (unless margin (setf margin 80))
1391   (do* ((chars (line-break-annotate string))
1392         line lines (filled 0) last-break-distance
1393         (break-type (car chars) (car tail))
1394         (char (cadr chars) (cadr tail))
1395         (tail (cddr chars) (cddr tail)))
1396        ((not break-type)
1397         (mapcar #'(lambda (s) (coerce s 'string)) (nreverse lines)))
1398     (ecase break-type
1399       (:cant
1400        (push char line)
1401        (unless (eql (line-break-class char) :CM)
1402          (incf filled))
1403        (when last-break-distance (incf last-break-distance)))
1404       (:can
1405        (push char line)
1406        (setf last-break-distance 1)
1407        (incf filled))
1408       (:must
1409        (push char line)
1410        (setf last-break-distance 1)
1411        (incf filled)
1412        (go break)))
1413     (if (> filled margin)
1414         (go break)
1415         (go next))
1416    break
1417     (when (not last-break-distance)
1418       ;; If we don't have any line breaks, remove the last thing we added that
1419       ;; takes up space, and all its combining marks
1420       (setf last-break-distance
1421             (1+ (loop for c in line while (eql (line-break-class c) :cm) summing 1))))
1422     (multiple-value-bind (next-line this-line) (break-list-at line last-break-distance)
1423       (loop while (eql (line-break-class (car this-line)) :sp)
1424          do (setf this-line (cdr this-line)))
1425       (push (nreverse this-line) lines)
1426       (setf line next-line)
1427       (setf filled (length line))
1428       (setf last-break-distance nil))
1429    next))
1430
1431 \f
1432 ;;; Collation
1433 (defconstant +maximum-variable-primary-element+
1434   #.(sb-cold:read-from-file "output/other-collation-info.lisp-expr"))
1435
1436 (defun unpack-collation-key (key)
1437   (declare (type (simple-array (unsigned-byte 32) (*)) key))
1438   (loop for value across key
1439         collect
1440         (list (ldb (byte 16 16) value)
1441               (ldb (byte 11 5) value)
1442               (ldb (byte 5 0) value))))
1443
1444 (declaim (inline variable-p))
1445 (defun variable-p (x)
1446   (<= 1 x +maximum-variable-primary-element+))
1447
1448 (defun collation-key (string start end)
1449   (let (char1
1450         (char2 (code-char 0))
1451         (char3 (code-char 0)))
1452     (case (- end start)
1453       (1 (setf char1 (char string start)))
1454       (2 (setf char1 (char string start)
1455                char2 (char string (+ start 1))))
1456       (3 (setf char1 (char string start)
1457                char2 (char string (+ start 1))
1458                char3 (char string (+ start 2))))
1459       (t
1460        ;; There are never more than three characters in a contraction, right?
1461        (return-from collation-key nil)))
1462     (let ((packed-key (gethash (pack-3-codepoints
1463                                 (char-code char1)
1464                                 (char-code char2)
1465                                 (char-code char3))
1466                                **character-collations**)))
1467       (if packed-key
1468           (unpack-collation-key packed-key)
1469           (when (char= (code-char 0) char2 char3)
1470             (let* ((cp (char-code char1))
1471                    (base
1472                      (cond ((not (proplist-p char1 :unified-ideograph))
1473                             #xFBC0)
1474                            ((or (<= #x4E00 cp #x9FFF)
1475                                 (<= #xF900 cp #xFAFF))
1476                             #xFB40)
1477                            (t
1478                             #xFB80)))
1479                    (a (+ base (ash cp -15)))
1480                    (b (logior #.(ash 1 15) (logand cp #x7FFFF))))
1481               (list (list a #x20 #x2) (list b 0 0))))))))
1482
1483 (defun sort-key (string)
1484   (let* ((str (normalize-string string :nfd))
1485          (i 0) (len (length str)) max-match new-i
1486          sort-key
1487          after-variable)
1488     (loop while (< i len)
1489           do
1490           (loop for offset from 1 to 3
1491                 for index = (+ i offset)
1492                 while (<= index len)
1493                 do
1494                 (let ((key (collation-key str i index)))
1495                   (when key
1496                     (setf max-match key
1497                           new-i index))))
1498           (loop for index from new-i below len
1499                 for char = (char str index)
1500                 for previous-combining-class = combining-class
1501                 for combining-class = (combining-class char)
1502                 until (eql combining-class 0)
1503                 unless (and (>= (- index new-i) 1)
1504                             ;; Combiners are sorted, we only have to look back
1505                             ;; one step (see canonically-compose)
1506                             (>= (combining-class (char str (1- index)))
1507                                 combining-class))
1508                 do
1509                 (rotatef (char str new-i) (char str index))
1510                 (let ((key (collation-key str i (1+ new-i))))
1511                   (if key
1512                       (setf max-match key
1513                             new-i (1+ new-i))
1514                       (rotatef (char str new-i) (char str index)))))
1515           (loop for key in max-match do (push key sort-key))
1516           (setf i new-i))
1517     (macrolet ((push-non-zero (obj place)
1518                  `(when (/= ,obj 0)
1519                     (push ,obj ,place))))
1520       (let (primary secondary tertiary quatenary)
1521         (loop for (k1 k2 k3) in (nreverse sort-key)
1522               do
1523               (cond
1524                 ((= k1 k2 k3 0))
1525                 ((variable-p k1)
1526                  (setf after-variable t)
1527                  (push k1 quatenary))
1528                 ((/= k1 0)
1529                  (setf after-variable nil)
1530                  (push k1 primary)
1531                  (push-non-zero k2 secondary)
1532                  (push-non-zero k3 tertiary)
1533                  (push #xFFFF quatenary))
1534                 ((/= k3 0)
1535                  (unless after-variable
1536                    (push-non-zero k2 secondary)
1537                    (push k3 tertiary)
1538                    (push #xFFFF quatenary)))))
1539         (concatenate 'vector
1540                      (nreverse primary) #(0) (nreverse secondary) #(0)
1541                      (nreverse tertiary) #(0) (nreverse quatenary))))))
1542
1543 (defun vector< (vector1 vector2)
1544   (loop for i across vector1
1545         for j across vector2
1546         do
1547         (cond ((< i j) (return-from vector< t))
1548               ((> i j) (return-from vector< nil))))
1549   ;; If there's no differences, shortest vector wins
1550   (< (length vector1) (length vector2)))
1551
1552 (defun unicode= (string1 string2 &key (start1 0) end1 (start2 0) end2 (strict t))
1553   "Determines whether STRING1 and STRING2 are canonically equivalent according
1554 to Unicode. The START and END arguments behave like the arguments to STRING=.
1555 If :STRICT is NIL, UNICODE= tests compatibility equavalence instead."
1556   (let ((str1 (normalize-string (subseq string1 start1 end1) (if strict :nfd :nfkd)))
1557         (str2 (normalize-string (subseq string2 start2 end2) (if strict :nfd :nfkd))))
1558     (string= str1 str2)))
1559
1560 (defun unicode-equal (string1 string2 &key (start1 0) end1 (start2 0) end2 (strict t))
1561   "Determines whether STRING1 and STRING2 are canonically equivalent after
1562 casefoldin8 (that is, ignoring case differences) according to Unicode. The
1563 START and END arguments behave like the arguments to STRING=. If :STRICT is
1564 NIL, UNICODE= tests compatibility equavalence instead."
1565   (let ((str1 (normalize-string (subseq string1 start1 end1) (if strict :nfd :nfkd)))
1566         (str2 (normalize-string (subseq string2 start2 end2) (if strict :nfd :nfkd))))
1567     (string=
1568      (normalize-string (casefold str1) (if strict :nfd :nfkd))
1569      (normalize-string (casefold str2) (if strict :nfd :nfkd)))))
1570
1571 (defun unicode< (string1 string2 &key (start1 0) end1 (start2 0) end2)
1572   "Determines whether STRING1 sorts before STRING2 using the Unicode Collation
1573 Algorithm, The function uses an untailored Default Unicode Collation Element Table
1574 to produce the sort keys. The function uses the Shifted method for dealing
1575 with variable-weight characters, as described in UTS #10"
1576   (let* ((s1 (subseq string1 start1 end1))
1577          (s2 (subseq string2 start2 end2))
1578          (k1 (sort-key s1)) (k2 (sort-key s2)))
1579     (if (equalp k1 k2)
1580         (string< (normalize-string s1 :nfd) (normalize-string s2 :nfd))
1581         (vector< k1 k2))))
1582
1583 (defun unicode<= (string1 string2 &key (start1 0) end1 (start2 0) end2)
1584   "Tests if STRING1 and STRING2 are either UNICODE< or UNICODE="
1585   (or
1586    (unicode= string1 string2 :start1 start1 :end1 end1
1587              :start2 start2 :end2 end2)
1588    (unicode< string1 string2 :start1 start1 :end1 end1
1589              :start2 start2 :end2 end2)))
1590
1591 (defun unicode> (string1 string2 &key (start1 0) end1 (start2 0) end2)
1592   "Tests if STRING2 is UNICODE< STRING1."
1593    (unicode< string2 string1 :start1 start2 :end1 end2
1594              :start2 start1 :end2 end1))
1595
1596 (defun unicode>= (string1 string2 &key (start1 0) end1 (start2 0) end2)
1597   "Tests if STRING1 and STRING2 are either UNICODE= or UNICODE>"
1598   (or
1599    (unicode= string1 string2 :start1 start1 :end1 end1
1600              :start2 start2 :end2 end2)
1601    (unicode> string1 string2 :start1 start1 :end1 end1
1602              :start2 start2 :end2 end2)))
1603
1604 \f
1605 ;;; Confusable detection
1606
1607 (defun canonically-deconfuse (string)
1608   (let (ret (i 0) new-i (len (length string))
1609             best-node)
1610     (loop while (< i len) do
1611          (loop for offset from 1 to 5
1612             while (<= (+ i offset) len)
1613             do
1614               (let ((node (gethash (subseq string i (+ i offset))
1615                                    **confusables**)))
1616                 (when node (setf best-node node new-i (+ i offset)))))
1617          (cond
1618            (best-node (push best-node ret) (setf i new-i))
1619            (t (push (subseq string i (1+ i)) ret) (incf i)))
1620          (setf best-node nil new-i nil))
1621     (apply #'concatenate 'string (nreverse ret))))
1622
1623 (defun confusable-p (string1 string2 &key (start1 0) end1 (start2 0) end2)
1624   "Determines whether STRING1 and STRING2 could be visually confusable
1625 according to the IDNA confusableSummary.txt table"
1626     (let* ((form #!+sb-unicode :nfd #!-sb-unicode :nfc)
1627            (str1 (normalize-string (subseq string1 start1 end1) form))
1628            (str2 (normalize-string (subseq string2 start2 end2) form))
1629            (skeleton1 (normalize-string (canonically-deconfuse str1) form))
1630            (skeleton2 (normalize-string (canonically-deconfuse str2) form)))
1631       (string= skeleton1 skeleton2)))