src/code/target-unicode.lisp

   1 ;;;; Unicode functions
   2
   3 ;;;; This software is part of the SBCL system. See the README file for
   4 ;;;; more information.
   5 ;;;;
   6 ;;;; This software is derived from the CMU CL system, which was
   7 ;;;; written at Carnegie Mellon University and released into the
   8 ;;;; public domain. The software is in the public domain and is
   9 ;;;; provided with absolutely no warranty. See the COPYING and CREDITS
  10 ;;;; files for more information.
  11
  12 (in-package "SB!UNICODE")
  13
  14 (declaim (type simple-vector **special-numerics**))
  15 (sb!impl::defglobal **special-numerics**
  16   #.(with-open-file (stream
  17                      (merge-pathnames
  18                       (make-pathname
  19                        :directory
  20                        '(:relative :up :up "output")
  21                        :name "numerics" :type "lisp-expr")
  22                       sb!xc:*compile-file-truename*)
  23                      :direction :input
  24                      :element-type 'character)
  25       (read stream)))
  26
  27
  28 (declaim (type (simple-array (unsigned-byte 32) (*)) **block-ranges**))
  29 (sb!impl::defglobal **block-ranges**
  30   #.(sb!int:!coerce-to-specialized
  31      (with-open-file (stream
  32                       (merge-pathnames
  33                        (make-pathname
  34                         :directory
  35                         '(:relative :up :up "output")
  36                         :name "blocks" :type "lisp-expr")
  37                        sb!xc:*compile-file-truename*)
  38                       :direction :input
  39                       :element-type 'character)
  40        (read stream))
  41      '(unsigned-byte 32)))
  42
  43 (macrolet ((unicode-property-init ()
  44              (let ((proplist-dump
  45                     (with-open-file (stream
  46                                      (merge-pathnames
  47                                       (make-pathname
  48                                        :directory
  49                                        '(:relative :up :up "output")
  50                                        :name "misc-properties" :type "lisp-expr")
  51                                       sb!xc:*compile-file-truename*)
  52                                      :direction :input
  53                                      :element-type 'character)
  54                       (read stream)))
  55                    (confusable-sets
  56                     (with-open-file (stream
  57                                      (merge-pathnames
  58                                       (make-pathname
  59                                        :directory
  60                                        '(:relative :up :up "output")
  61                                        :name "confusables" :type "lisp-expr")
  62                                       sb!xc:*compile-file-truename*)
  63                                      :direction :input
  64                                      :element-type 'character)
  65                       (read stream)))
  66                    (bidi-mirroring-list
  67                     (with-open-file (stream
  68                                      (merge-pathnames
  69                                       (make-pathname
  70                                        :directory
  71                                        '(:relative :up :up "output")
  72                                        :name "bidi-mirrors" :type "lisp-expr")
  73                                       sb!xc:*compile-file-truename*)
  74                                      :direction :input
  75                                      :element-type 'character)
  76                       (read stream))))
  77                `(progn
  78                   (sb!impl::defglobal **proplist-properties** ',proplist-dump)
  79                   (sb!impl::defglobal **confusables** ',confusable-sets)
  80                   (sb!impl::defglobal **bidi-mirroring-glyphs** ',bidi-mirroring-list)
  81                   (defun !unicode-properties-cold-init ()
  82                     (let ((hash (make-hash-table)) (list ',proplist-dump))
  83                       (do ((k (car list) (car list)) (v (cadr list) (cadr list)))
  84                           ((not list) hash)
  85                         (setf (gethash k hash) v)
  86                         (setf list (cddr list)))
  87                       (setf **proplist-properties** hash))
  88                     (let ((hash (make-hash-table :test #'equal)))
  89                       (loop for set in ',confusable-sets
  90                          for items = (mapcar #'(lambda (item)
  91                                                  (map 'simple-string
  92                                                       #'code-char item))
  93                                              #!+sb-unicode set
  94                                              #!-sb-unicode
  95                                              (remove-if-not
  96                                               #'(lambda (item)
  97                                                   (every
  98                                                    #'(lambda (x)
  99                                                        (< x sb!xc:char-code-limit))
 100                                                    item)) set))
 101                          do (dolist (i items)
 102                               (setf (gethash (logically-readonlyize (possibly-base-stringize i))
 103                                              hash)
 104                                     (logically-readonlyize
 105                                      (possibly-base-stringize (first items))))))
 106                       (setf **confusables** hash))
 107                     (let ((hash (make-hash-table)) (list ',bidi-mirroring-list))
 108                       (loop for (k v) in list do
 109                            (setf (gethash k hash) v))
 110                       (setf **bidi-mirroring-glyphs** hash)))))))
 111   (unicode-property-init))
 112
 113 ;;; Unicode property access
 114 (defun ordered-ranges-member (item vector)
 115   (declare (type simple-vector vector)
 116            (type fixnum item)
 117            (optimize speed))
 118   (labels ((recurse (start end)
 119              (declare (type index start end)
 120                       (optimize (safety 0)))
 121              (when (< start end)
 122                (let* ((i (+ start (truncate (the index (- end start)) 2)))
 123                       (index (* 2 i))
 124                       (elt1 (svref vector index))
 125                       (elt2 (svref vector (1+ index))))
 126                  (declare (type index i)
 127                           (fixnum elt1 elt2))
 128                  (cond ((< item elt1)
 129                         (recurse start i))
 130                        ((> item elt2)
 131                         (recurse (+ 1 i) end))
 132                        (t
 133                         item))))))
 134     (recurse 0 (truncate (length vector) 2))))
 135
 136 ;; Returns which range `item` was found in or NIL
 137 ;; First range = 0, second range = 1 ...
 138 (defun ordered-ranges-position (item vector)
 139   (declare (type (simple-array (unsigned-byte 32) (*)) vector)
 140            (type fixnum item))
 141   (labels ((recurse (start end)
 142              (declare (type index start end))
 143              (when (< start end)
 144                (let* ((i (+ start (truncate (- end start) 2)))
 145                       (index (* 2 i))
 146                       (elt1 (aref vector index))
 147                       (elt2 (aref vector (1+ index))))
 148                  (declare (type index i))
 149                  (cond ((< item elt1)
 150                         (recurse start i))
 151                        ((> item elt2)
 152                         (recurse (+ 1 i) end))
 153                        (t
 154                         i))))))
 155     (recurse 0 (truncate (length vector) 2))))
 156
 157 (defun proplist-p (character property)
 158   "Returns T if CHARACTER has the specified PROPERTY.
 159 PROPERTY is a keyword representing one of the properties from PropList.txt,
 160 with underscores replaced by dashes."
 161   (ordered-ranges-member (char-code character)
 162                          (gethash property **proplist-properties**)))
 163
 164 ;; WARNING: These have to be manually kept in sync with the values in ucd.lisp
 165 (declaim (type simple-vector *general-categories* *bidi-classes* *east-asian-widths*
 166                *scripts* *line-break-classes* *blocks*))
 167 (sb!impl::defglobal *general-categories*
 168   #(:Lu :Ll :Lt :Lm :Lo :Cc :Cf :Co :Cs :Cn :Mc :Me :Mn :Nd
 169     :Nl :No :Pc :Pd :Pe :Pf :Pi :Po :Ps :Sc :Sk :Sm :So :Zl
 170     :Zp :Zs))
 171
 172 (sb!impl::defglobal *bidi-classes*
 173   #(:BN :AL :AN :B :CS :EN :ES :ET :L :LRE :LRO :NSM :ON
 174     :PDF :R :RLE :RLO :S :WS :LRI :RLI :FSI :PDI))
 175
 176 (sb!impl::defglobal *east-asian-widths*
 177   #(:N :A :H :W :F :Na))
 178
 179 (sb!impl::defglobal *scripts*
 180   #(:Unknown :Common :Latin :Greek :Cyrillic :Armenian :Hebrew :Arabic :Syriac
 181     :Thaana :Devanagari :Bengali :Gurmukhi :Gujarati :Oriya :Tamil :Telugu
 182     :Kannada :Malayalam :Sinhala :Thai :Lao :Tibetan :Myanmar :Georgian :Hangul
 183     :Ethiopic :Cherokee :Canadian-Aboriginal :Ogham :Runic :Khmer :Mongolian
 184     :Hiragana :Katakana :Bopomofo :Han :Yi :Old-Italic :Gothic :Deseret
 185     :Inherited :Tagalog :Hanunoo :Buhid :Tagbanwa :Limbu :Tai-Le :Linear-B
 186     :Ugaritic :Shavian :Osmanya :Cypriot :Braille :Buginese :Coptic :New-Tai-Lue
 187     :Glagolitic :Tifinagh :Syloti-Nagri :Old-Persian :Kharoshthi :Balinese
 188     :Cuneiform :Phoenician :Phags-Pa :Nko :Sundanese :Lepcha :Ol-Chiki :Vai
 189     :Saurashtra :Kayah-Li :Rejang :Lycian :Carian :Lydian :Cham :Tai-Tham
 190     :Tai-Viet :Avestan :Egyptian-Hieroglyphs :Samaritan :Lisu :Bamum :Javanese
 191     :Meetei-Mayek :Imperial-Aramaic :Old-South-Arabian :Inscriptional-Parthian
 192     :Inscriptional-Pahlavi :Old-Turkic :Kaithi :Batak :Brahmi :Mandaic :Chakma
 193     :Meroitic-Cursive :Meroitic-Hieroglyphs :Miao :Sharada :Sora-Sompeng
 194     :Takri :Bassa-Vah :Mahajani :Pahawh-Hmong :Caucasian-Albanian :Manichaean
 195     :Palmyrene :Duployan :Mende-Kikakui :Pau-Cin-Hau :Elbasan :Modi
 196     :Psalter-Pahlavi :Grantha :Mro :Siddham :Khojki :Nabataean :Tirhuta
 197     :Khudawadi :Old-North-Arabian :Warang-Citi :Linear-A :Old-Permic))
 198
 199 (sb!impl::defglobal *line-break-classes*
 200     #(:XX :AI :AL :B2 :BA :BB :BK :CB :CJ :CL :CM :CP :CR :EX :GL
 201       :HL :HY :ID :IN :IS :LF :NL :NS :NU :OP :PO :PR :QU :RI :SA
 202       :SG :SP :SY :WJ :ZW))
 203
 204 (sb!impl::defglobal *blocks*
 205   #(:Basic-Latin :Latin-1-Supplement :Latin-Extended-A :Latin-Extended-B
 206     :IPA-Extensions :Spacing-Modifier-Letters :Combining-Diacritical-Marks
 207     :Greek-and-Coptic :Cyrillic :Cyrillic-Supplement :Armenian :Hebrew :Arabic
 208     :Syriac :Arabic-Supplement :Thaana :NKo :Samaritan :Mandaic
 209     :Arabic-Extended-A :Devanagari :Bengali :Gurmukhi :Gujarati :Oriya :Tamil
 210     :Telugu :Kannada :Malayalam :Sinhala :Thai :Lao :Tibetan :Myanmar :Georgian
 211     :Hangul-Jamo :Ethiopic :Ethiopic-Supplement :Cherokee
 212     :Unified-Canadian-Aboriginal-Syllabics :Ogham :Runic :Tagalog :Hanunoo
 213     :Buhid :Tagbanwa :Khmer :Mongolian
 214     :Unified-Canadian-Aboriginal-Syllabics-Extended :Limbu :Tai-Le :New-Tai-Lue
 215     :Khmer-Symbols :Buginese :Tai-Tham :Combining-Diacritical-Marks-Extended
 216     :Balinese :Sundanese :Batak :Lepcha :Ol-Chiki :Sundanese-Supplement
 217     :Vedic-Extensions :Phonetic-Extensions :Phonetic-Extensions-Supplement
 218     :Combining-Diacritical-Marks-Supplement :Latin-Extended-Additional
 219     :Greek-Extended :General-Punctuation :Superscripts-and-Subscripts
 220     :Currency-Symbols :Combining-Diacritical-Marks-for-Symbols
 221     :Letterlike-Symbols :Number-Forms :Arrows :Mathematical-Operators
 222     :Miscellaneous-Technical :Control-Pictures :Optical-Character-Recognition
 223     :Enclosed-Alphanumerics :Box-Drawing :Block-Elements :Geometric-Shapes
 224     :Miscellaneous-Symbols :Dingbats :Miscellaneous-Mathematical-Symbols-A
 225     :Supplemental-Arrows-A :Braille-Patterns :Supplemental-Arrows-B
 226     :Miscellaneous-Mathematical-Symbols-B :Supplemental-Mathematical-Operators
 227     :Miscellaneous-Symbols-and-Arrows :Glagolitic :Latin-Extended-C :Coptic
 228     :Georgian-Supplement :Tifinagh :Ethiopic-Extended :Cyrillic-Extended-A
 229     :Supplemental-Punctuation :CJK-Radicals-Supplement :Kangxi-Radicals
 230     :Ideographic-Description-Characters :CJK-Symbols-and-Punctuation :Hiragana
 231     :Katakana :Bopomofo :Hangul-Compatibility-Jamo :Kanbun :Bopomofo-Extended
 232     :CJK-Strokes :Katakana-Phonetic-Extensions :Enclosed-CJK-Letters-and-Months
 233     :CJK-Compatibility :CJK-Unified-Ideographs-Extension-A
 234     :Yijing-Hexagram-Symbols :CJK-Unified-Ideographs :Yi-Syllables :Yi-Radicals
 235     :Lisu :Vai :Cyrillic-Extended-B :Bamum :Modifier-Tone-Letters
 236     :Latin-Extended-D :Syloti-Nagri :Common-Indic-Number-Forms :Phags-pa
 237     :Saurashtra :Devanagari-Extended :Kayah-Li :Rejang :Hangul-Jamo-Extended-A
 238     :Javanese :Myanmar-Extended-B :Cham :Myanmar-Extended-A :Tai-Viet
 239     :Meetei-Mayek-Extensions :Ethiopic-Extended-A :Latin-Extended-E
 240     :Meetei-Mayek :Hangul-Syllables :Hangul-Jamo-Extended-B :High-Surrogates
 241     :High-Private-Use-Surrogates :Low-Surrogates :Private-Use-Area
 242     :CJK-Compatibility-Ideographs :Alphabetic-Presentation-Forms
 243     :Arabic-Presentation-Forms-A :Variation-Selectors :Vertical-Forms
 244     :Combining-Half-Marks :CJK-Compatibility-Forms :Small-Form-Variants
 245     :Arabic-Presentation-Forms-B :Halfwidth-and-Fullwidth-Forms :Specials
 246     :Linear-B-Syllabary :Linear-B-Ideograms :Aegean-Numbers
 247     :Ancient-Greek-Numbers :Ancient-Symbols :Phaistos-Disc :Lycian :Carian
 248     :Coptic-Epact-Numbers :Old-Italic :Gothic :Old-Permic :Ugaritic :Old-Persian
 249     :Deseret :Shavian :Osmanya :Elbasan :Caucasian-Albanian :Linear-A
 250     :Cypriot-Syllabary :Imperial-Aramaic :Palmyrene :Nabataean :Phoenician
 251     :Lydian :Meroitic-Hieroglyphs :Meroitic-Cursive :Kharoshthi
 252     :Old-South-Arabian :Old-North-Arabian :Manichaean :Avestan
 253     :Inscriptional-Parthian :Inscriptional-Pahlavi :Psalter-Pahlavi :Old-Turkic
 254     :Rumi-Numeral-Symbols :Brahmi :Kaithi :Sora-Sompeng :Chakma :Mahajani
 255     :Sharada :Sinhala-Archaic-Numbers :Khojki :Khudawadi :Grantha :Tirhuta
 256     :Siddham :Modi :Takri :Warang-Citi :Pau-Cin-Hau :Cuneiform
 257     :Cuneiform-Numbers-and-Punctuation :Egyptian-Hieroglyphs :Bamum-Supplement
 258     :Mro :Bassa-Vah :Pahawh-Hmong :Miao :Kana-Supplement :Duployan
 259     :Shorthand-Format-Controls :Byzantine-Musical-Symbols :Musical-Symbols
 260     :Ancient-Greek-Musical-Notation :Tai-Xuan-Jing-Symbols
 261     :Counting-Rod-Numerals :Mathematical-Alphanumeric-Symbols :Mende-Kikakui
 262     :Arabic-Mathematical-Alphabetic-Symbols :Mahjong-Tiles :Domino-Tiles
 263     :Playing-Cards :Enclosed-Alphanumeric-Supplement
 264     :Enclosed-Ideographic-Supplement :Miscellaneous-Symbols-and-Pictographs
 265     :Emoticons :Ornamental-Dingbats :Transport-and-Map-Symbols
 266     :Alchemical-Symbols :Geometric-Shapes-Extended :Supplemental-Arrows-C
 267     :CJK-Unified-Ideographs-Extension-B :CJK-Unified-Ideographs-Extension-C
 268     :CJK-Unified-Ideographs-Extension-D :CJK-Compatibility-Ideographs-Supplement
 269     :Tags :Variation-Selectors-Supplement :Supplementary-Private-Use-Area-A
 270     :Supplementary-Private-Use-Area-B))
 271
 272 (declaim (inline svref-or-null))
 273 (defun svref-or-null (vector index)
 274   (and (< index (length vector))
 275        (svref vector index)))
 276
 277 (defun general-category (character)
 278   "Returns the general category of CHARACTER as it appears in UnicodeData.txt"
 279   (svref-or-null *general-categories* (sb!impl::ucd-general-category character)))
 280
 281 (defun bidi-class (character)
 282   "Returns the bidirectional class of CHARACTER"
 283   (if (and (eql (general-category character) :Cn)
 284            (default-ignorable-p character))
 285       :Bn
 286       (svref-or-null
 287        *bidi-classes*
 288        (aref **character-misc-database** (1+ (misc-index character))))))
 289
 290 (declaim (inline combining-class))
 291 (defun combining-class (character)
 292   "Returns the canonical combining class (CCC) of CHARACTER"
 293   (aref **character-misc-database** (+ 2 (misc-index character))))
 294
 295 (defun decimal-value (character)
 296   "Returns the decimal digit value associated with CHARACTER or NIL if
 297 there is no such value.
 298
 299 The only characters in Unicode with a decimal digit value are those
 300 that are part of a range of characters that encode the digits 0-9.
 301 Because of this, `(decimal-digit c) <=> (digit-char-p c 10)` in
 302 #+sb-unicode builds"
 303   (sb!impl::ucd-decimal-digit character))
 304
 305 (defun digit-value (character)
 306   "Returns the Unicode digit value of CHARACTER or NIL if it doesn't exist.
 307
 308 Digit values are guaranteed to be integers between 0 and 9 inclusive.
 309 All characters with decimal digit values have the same digit value,
 310 but there are characters (such as digits of number systems without a 0 value)
 311 that have a digit value but no decimal digit value"
 312   (let ((%digit (clear-flag 6
 313                             (aref **character-misc-database**
 314                                   (+ 3 (misc-index character))))))
 315     (if (< %digit 10) %digit nil)))
 316
 317 (defun numeric-value (character)
 318   "Returns the numeric value of CHARACTER or NIL if there is no such value.
 319 Numeric value is the most general of the Unicode numeric properties.
 320 The only constraint on the numeric value is that it be a rational number."
 321   (or (double-vector-binary-search (char-code character)
 322                                    **special-numerics**)
 323       (digit-value character)))
 324
 325 (defun mirrored-p (character)
 326   "Returns T if CHARACTER needs to be mirrored in bidirectional text.
 327 Otherwise, returns NIL."
 328   (logbitp 5 (aref **character-misc-database**
 329                     (+ 5 (misc-index character)))))
 330
 331 (defun bidi-mirroring-glyph (character)
 332   "Returns the mirror image of CHARACTER if it exists.
 333 Otherwise, returns NIL."
 334   (when (mirrored-p character)
 335     (let ((ret (gethash (char-code character) **bidi-mirroring-glyphs**)))
 336       (when ret (code-char ret)))))
 337
 338 (defun east-asian-width (character)
 339   "Returns the East Asian Width property of CHARACTER as
 340 one of the keywords :N (Narrow), :A (Ambiguous), :H (Halfwidth),
 341 :W (Wide), :F (Fullwidth), or :NA (Not applicable)"
 342   (svref-or-null *east-asian-widths*
 343                  (ldb (byte 3 0)
 344                       (aref **character-misc-database**
 345                             (+ 5 (misc-index character))))))
 346
 347 (defun script (character)
 348   "Returns the Script property of CHARACTER as a keyword.
 349 If CHARACTER does not have a known script, returns :UNKNOWN"
 350   (svref-or-null *scripts*
 351                  (aref **character-misc-database** (+ 6 (misc-index character)))))
 352
 353 (defun char-block (character)
 354   "Returns the Unicode block in which CHARACTER resides as a keyword.
 355 If CHARACTER does not have a known block, returns :NO-BLOCK"
 356   (let* ((code (char-code character))
 357          (block-index (ordered-ranges-position code **block-ranges**)))
 358     (if block-index
 359         (aref *blocks* block-index) :no-block)))
 360
 361 (defun unicode-1-name (character)
 362   "Returns the name assigned to CHARACTER in Unicode 1.0 if it is distinct
 363 from the name currently assigned to CHARACTER. Otherwise, returns NIL.
 364 This property has been officially obsoleted by the Unicode standard, and
 365 is only included for backwards compatibility."
 366   (let* ((char-code (char-code character))
 367          (h-code (double-vector-binary-search char-code
 368                                               **unicode-1-char-name-database**)))
 369     (when h-code
 370       (huffman-decode h-code **unicode-character-name-huffman-tree**))))
 371
 372 (defun age (character)
 373   "Returns the version of Unicode in which CHARACTER was assigned as a pair
 374 of values, both integers, representing the major and minor version respectively.
 375 If CHARACTER is not assigned in Unicode, returns NIL for both values."
 376   (let* ((value (aref **character-misc-database** (+ 8 (misc-index character))))
 377          (major (ash value -3))
 378          (minor (ldb (byte 3 0) value)))
 379     (if (zerop value) (values nil nil) (values major minor))))
 380
 381 (defun hangul-syllable-type (character)
 382   "Returns the Hangul syllable type of CHARACTER.
 383 The syllable type can be one of :L, :V, :T, :LV, or :LVT.
 384 If the character is not a Hangul syllable or Jamo, returns NIL"
 385   (let ((cp (char-code character)))
 386     (cond
 387       ((or
 388         (and (<= #x1100 cp) (<= cp #x115f))
 389         (and (<= #xa960 cp) (<= cp #xa97c))) :L)
 390       ((or
 391         (and (<= #x1160 cp) (<= cp #x11a7))
 392         (and (<= #xd7B0 cp) (<= cp #xd7C6))) :V)
 393       ((or
 394         (and (<= #x11a8 cp) (<= cp #x11ff))
 395         (and (<= #xd7c8 cp) (<= cp #xd7fb))) :T)
 396       ((and (<= #xac00 cp) (<= cp #xd7a3))
 397        (if (= 0 (rem (- cp #xac00) 28)) :LV :LVT)))))
 398
 399 (defun line-break-class (character &key resolve)
 400   "Returns the line breaking class of CHARACTER, as specified in UAX #14.
 401 If :RESOLVE is NIL, returns the character class found in the property file.
 402 If :RESOLVE is non-NIL, centain line-breaking classes will be mapped to othec
 403 classes as specified in the applicable standards. Addinionally, if :RESOLVE
 404 is :EAST-ASIAN, Ambigious (class :AI) characters will be mapped to the
 405 Ideographic (:ID) class instead of Alphabetic (:AL)."
 406   (when (and resolve (listp character)) (setf character (car character)))
 407   (when (and resolve (not character)) (return-from line-break-class :nil))
 408   (let ((raw-class
 409          (svref-or-null *line-break-classes*
 410                         (aref **character-misc-database** (+ 7 (misc-index character)))))
 411         (syllable-type (hangul-syllable-type character)))
 412     (when syllable-type
 413       (setf raw-class
 414             (cdr (assoc syllable-type
 415                         '((:l . :JL) (:v . :JV) (:t . :JT)
 416                           (:lv . :H2) (:lvt . :H3))))))
 417     (when resolve
 418       (setf raw-class
 419             (case raw-class
 420               (:ai (if (eql resolve :east-asion) :ID :AL))
 421               ; If we see :CM when resolving, we have a CM that isn't subject
 422               ; to LB9, so we do LB10
 423               ((:xx :cm) :al)
 424               (:sa (if (member (general-category character) '(:Mn :Mc))
 425                        :CM :AL))
 426               (:cj :ns)
 427               (:sg (error "The character ~S is a surrogate, which should not
 428 appear in an SBCL string. The line-breaking behavior of surrogates is undefined."
 429                           character))
 430               (t raw-class))))
 431     raw-class))
 432
 433 (defun uppercase-p (character)
 434   "Returns T if CHARACTER has the Unicode property Uppercase and NIL otherwise"
 435   (or (eql (general-category character) :Lu) (proplist-p character :other-uppercase)))
 436
 437 (defun lowercase-p (character)
 438   "Returns T if CHARACTER has the Unicode property Lowercase and NIL otherwise"
 439   (or (eql (general-category character) :Ll) (proplist-p character :other-lowercase)))
 440
 441 (defun cased-p (character)
 442   "Returns T if CHARACTER has a (Unicode) case, and NIL otherwise"
 443   (or (uppercase-p character) (lowercase-p character)
 444       (eql (general-category character) :Lt)))
 445
 446 (defun case-ignorable-p (character)
 447   "Returns T if CHARACTER is Case Ignorable as defined in Unicode 6.3, Chapter
 448 3"
 449   (or (member (general-category character)
 450               '(:Mn :Me :Cf :Lm :Sk))
 451       (member (word-break-class character)
 452               '(:midletter :midnumlet :single-quote))))
 453
 454 (defun alphabetic-p (character)
 455   "Returns T if CHARACTER is Alphabetic according to the Unicode standard
 456 and NIL otherwise"
 457   (or (member (general-category character) '(:Lu :Ll :Lt :Lm :Lo :Nl))
 458       (proplist-p character :other-alphabetic)))
 459
 460 (defun ideographic-p (character)
 461   "Returns T if CHARACTER has the Unicode property Ideographic,
 462 which loosely corresponds to the set of \"Chinese characters\""
 463   (proplist-p character :ideographic))
 464
 465 (defun math-p (character)
 466   "Returns T if CHARACTER is a mathematical symbol according to Unicode and
 467 NIL otherwise"
 468   (or (eql (general-category character) :sm) (proplist-p character :other-math)))
 469
 470 (defun whitespace-p (character)
 471   "Returns T if CHARACTER is whitespace according to Unicode
 472 and NIL otherwise"
 473   (proplist-p character :white-space))
 474
 475 (defun hex-digit-p (character &key ascii)
 476   "Returns T if CHARACTER is a hexadecimal digit and NIL otherwise.
 477 If :ASCII is non-NIL, fullwidth equivalents of the Latin letters A through F
 478 are excluded."
 479   (proplist-p character (if ascii :ascii-hex-digit :hex-digit)))
 480
 481 (defun soft-dotted-p (character)
 482   "Returns T if CHARACTER has a soft dot (such as the dots on i and j) which
 483 disappears when accents are placed on top of it. and NIL otherwise"
 484   (proplist-p character :soft-dotted))
 485
 486 (defun default-ignorable-p (character)
 487   "Returns T if CHARACTER is a Default_Ignorable_Code_Point"
 488   (and
 489    (or (proplist-p character :other-default-ignorable-code-point)
 490        (eql (general-category character) :cf)
 491        (proplist-p character :variation-selector))
 492    (not
 493     (or (whitespace-p character)
 494         (ordered-ranges-member
 495          (char-code character)
 496          #(#x0600 #x0604 #x06DD #x06DD #x070F #x070F #xFFF9 #xFFFB
 497            #x110BD #x110BD))))))
 498
 499 \f
 500 ;;; Implements UAX#15: Normalization Forms
 501 (declaim (inline char-decomposition-info))
 502 (defun char-decomposition-info (char)
 503   (let ((value (aref **character-misc-database**
 504                      (+ 4 (misc-index char)))))
 505     (values (clear-flag 7 value) (logbitp 7 value))))
 506
 507 (defun char-decomposition (char length callback)
 508   (declare (function callback))
 509   ;; Caller should have gotten length from char-decomposition-info
 510   (let* ((cp (char-code char))
 511          (cp-high (ash cp -8))
 512          (decompositions **character-decompositions**)
 513          (high-page (aref **character-high-pages** cp-high))
 514          (index (unless (logbitp 15 high-page) ;; Hangul syllable
 515                   (aref **character-low-pages**
 516                         (+ 1 (* 2 (+ (ldb (byte 8 0) cp) (ash high-page 8))))))))
 517     (cond ((= length 1)
 518            (funcall callback (code-char (aref decompositions index))))
 519           ((<= #xac00 cp #xd7a3)
 520            ;; see Unicode 6.2, section 3-12
 521            (let* ((sbase #xac00)
 522                   (lbase #x1100)
 523                   (vbase #x1161)
 524                   (tbase #x11a7)
 525                   (vcount 21)
 526                   (tcount 28)
 527                   (ncount (* vcount tcount))
 528                   (sindex (- cp sbase))
 529                   (lindex (floor sindex ncount))
 530                   (vindex (floor (mod sindex ncount) tcount))
 531                   (tindex (mod sindex tcount)))
 532              (funcall callback (code-char (+ lbase lindex)))
 533              (funcall callback (code-char (+ vbase vindex)))
 534              (when (> tindex 0)
 535                (funcall callback  (code-char (+ tbase tindex))))))
 536
 537           (t
 538            (loop for i below length
 539                  do
 540                  (funcall callback (code-char (aref decompositions (+ index i)))))))))
 541
 542 (defun decompose-char (char compatibility callback)
 543   (declare (function callback))
 544   (multiple-value-bind (info compat) (char-decomposition-info char)
 545     (if (and (plusp info)
 546              (or compatibility
 547                  (not compat)))
 548         (if compatibility
 549             (dx-flet ((callback (char)
 550                         (decompose-char char t callback)))
 551               (char-decomposition char info #'callback))
 552             (char-decomposition char info callback))
 553         (funcall callback char))))
 554
 555 (defun decompose-string (string compatibility filter)
 556   (let (chars
 557         (length 0)
 558         (previous-combining-class 0))
 559     (declare (type index length))
 560     (dx-flet ((callback (char)
 561                         (let ((combining-class (combining-class char)))
 562                           (incf length)
 563                           (cond ((< 0 combining-class previous-combining-class)
 564                                  ;; Ensure it's sorted
 565                                  (loop for cons on chars
 566                                        for next-char = (cadr cons)
 567                                        when (or (not next-char)
 568                                                 (<= 0 (combining-class next-char) combining-class))
 569                                        do (setf (cdr cons)
 570                                                 (cons char (cdr cons)))
 571                                           (return)))
 572                                 (t
 573                                  (push char chars)
 574                                  (setf previous-combining-class combining-class))))))
 575       (sb!kernel:with-array-data ((string string) (start) (end)
 576                                   :check-fill-pointer t)
 577         (let ((calback (if filter
 578                            (let ((filter (sb!kernel:%coerce-callable-to-fun filter)))
 579                              (lambda (char)
 580                                (when (funcall filter char)
 581                                  (callback char))))
 582                            #'callback)))
 583           (loop for i from start below end
 584                 for char = (schar string i)
 585                 do
 586                 (decompose-char char compatibility calback))))
 587       (let ((result (make-string length)))
 588         (loop for char in chars
 589               for i from (1- length) downto 0
 590               do (setf (schar result i) char))
 591         result))))
 592
 593 (defun composition-hangul-syllable-type (cp)
 594   (cond
 595     ((and (<= #x1100 cp) (<= cp #x1112)) :L)
 596     ((and (<= #x1161 cp) (<= cp #x1175)) :V)
 597     ((and (<= #x11a8 cp) (<= cp #x11c2)) :T)
 598     ((and (<= #xac00 cp) (<= cp #.(+ #xac00 11171)))
 599      (if (= 0 (rem (- cp #xac00) 28)) :LV :LVT))))
 600
 601 (defun primary-composition (char1 char2)
 602   (flet ((maybe (fn x) (when x (funcall fn x))))
 603     (let ((c1 (char-code char1))
 604           (c2 (char-code char2)))
 605       (maybe
 606        #'code-char
 607        (cond
 608          ((gethash (dpb c1 (byte 21 21) c2)
 609                    **character-primary-compositions**))
 610          ((and (eql (composition-hangul-syllable-type c1) :L)
 611                (eql (composition-hangul-syllable-type c2) :V))
 612           (let ((lindex (- c1 #x1100))
 613                 (vindex (- c2 #x1161)))
 614             (+ #xac00 (* lindex 588) (* vindex 28))))
 615          ((and (eql (composition-hangul-syllable-type c1) :LV)
 616                (eql (composition-hangul-syllable-type c2) :T))
 617           (+ c1 (- c2 #x11a7))))))))
 618
 619 ;;; This implements a sequence data structure, specialized for
 620 ;;; efficient deletion of characters at an index, along with tolerable
 621 ;;; random access.  The purpose is to support the canonical
 622 ;;; composition algorithm from Unicode, which involves replacing (not
 623 ;;; necessarily consecutive) pairs of code points with a single code
 624 ;;; point (e.g. [#\e #\combining_acute_accent] with
 625 ;;; #\latin_small_letter_e_with_acute).  The data structure is a list
 626 ;;; of three-element lists, each denoting a chunk of string data
 627 ;;; starting at the first index and ending at the second.
 628 ;;;
 629 ;;; Actually, the implementation isn't particularly efficient, and
 630 ;;; would probably benefit from being rewritten in terms of displaced
 631 ;;; arrays, which would substantially reduce copying.
 632 ;;;
 633 ;;; (also, generic sequences.  *sigh*.)
 634 (defun lref (lstring index)
 635   (dolist (l lstring)
 636     (when (and (<= (first l) index)
 637                (< index (second l)))
 638       (return (aref (third l) (- index (first l)))))))
 639
 640 (defun (setf lref) (newchar lstring index)
 641   (dolist (l lstring)
 642     (when (and (<= (first l) index)
 643                (< index (second l)))
 644       (return (setf (aref (third l) (- index (first l))) newchar)))))
 645
 646 (defun llength (lstring)
 647   (second (first (last lstring))))
 648
 649 (defun lstring (lstring)
 650   (let ((result (make-string (llength lstring))))
 651     (dolist (l lstring result)
 652       (replace result (third l) :start1 (first l) :end1 (second l)))))
 653
 654 (defun ldelete (lstring index)
 655   (do* ((ls lstring (cdr ls))
 656         (l (car ls) (car ls))
 657         so-fars)
 658        ((and (<= (first l) index)
 659              (< index (second l)))
 660         (append
 661          (nreverse so-fars)
 662          (cond
 663            ((= (first l) index)
 664             (list (list (first l) (1- (second l)) (subseq (third l) 1))))
 665            ((= index (1- (second l)))
 666             (list (list (first l) (1- (second l)) (subseq (third l) 0 (1- (length (third l)))))))
 667            (t
 668             (list
 669              (list (first l) index
 670                    (subseq (third l) 0 (- index (first l))))
 671              (list index (1- (second l))
 672                    (subseq (third l) (1+ (- index (first l))))))))
 673          (mapcar (lambda (x) (list (1- (first x)) (1- (second x)) (third x)))
 674                  (cdr ls))))
 675     (push l so-fars)))
 676
 677 (defun canonically-compose (string)
 678   (let* ((result (list (list 0 (length string) string)))
 679          (previous-starter-index (position 0 string :key #'combining-class))
 680          (i (and previous-starter-index (1+ previous-starter-index))))
 681     (when (or (not i) (= i (length string)))
 682       (return-from canonically-compose string))
 683     (tagbody
 684      again
 685        (when (and (>= (- i previous-starter-index) 2)
 686                   ;; test for Blocked (Unicode 3.11 para. D115)
 687                   ;;
 688                   ;; (assumes here that string has sorted combiners,
 689                   ;; so can look back just one step)
 690                   (>= (combining-class (lref result (1- i)))
 691                       (combining-class (lref result i))))
 692          (when (= (combining-class (lref result i)) 0)
 693            (setf previous-starter-index i))
 694          (incf i)
 695          (go next))
 696
 697        (let ((comp (primary-composition (lref result previous-starter-index)
 698                                         (lref result i))))
 699          (cond
 700            (comp
 701             (setf (lref result previous-starter-index) comp)
 702             (setf result (ldelete result i)))
 703            (t
 704             (when (= (combining-class (lref result i)) 0)
 705               (setf previous-starter-index i))
 706             (incf i))))
 707      next
 708        (unless (= i (llength result))
 709          (go again)))
 710     (if (= i (length string))
 711         string
 712         (lstring result))))
 713
 714 (defun normalize-string (string &optional (form :nfd)
 715                                           filter)
 716   "Normalize STRING to the Unicode normalization form form.
 717 Acceptable values for form are :NFD, :NFC, :NFKD, and :NFKC.
 718 If FILTER is a function it is called on each decomposed character and
 719 only characters for which it returns T are collected."
 720   (declare (type (member :nfd :nfkd :nfc :nfkc) form))
 721   #!-sb-unicode
 722   (declare (ignore filter))
 723   #!-sb-unicode
 724   (etypecase string
 725     ((array nil (*)) string)
 726     (string
 727      (ecase form
 728        ((:nfc :nfkc) string)
 729        ((:nfd :nfkd) (error "Cannot normalize to ~A form in #-SB-UNICODE builds" form)))))
 730   #!+sb-unicode
 731   (etypecase string
 732     (base-string string)
 733     ((array character (*))
 734      (ecase form
 735        ((:nfc)
 736         (canonically-compose (decompose-string string nil filter)))
 737        ((:nfd)
 738         (decompose-string string nil filter))
 739        ((:nfkc)
 740         (canonically-compose (decompose-string string t filter)))
 741        ((:nfkd)
 742         (decompose-string string t filter))))
 743     ((array nil (*)) string)))
 744
 745 (defun normalized-p (string &optional (form :nfd))
 746   "Tests if STRING is normalized to FORM"
 747   ;; FIXME: can be optimized
 748   (string= string (normalize-string string form)))
 749
 750 \f
 751 ;;; Unicode case algorithms
 752 ;; FIXME: Make these parts less redundant (macro?)
 753 (defparameter **special-titlecases**
 754   '#.(with-open-file (stream
 755                      (merge-pathnames
 756                       (make-pathname
 757                        :directory
 758                        '(:relative :up :up "output")
 759                        :name "titlecases" :type "lisp-expr")
 760                       sb!xc:*compile-file-truename*)
 761                      :direction :input
 762                      :element-type 'character)
 763         (read stream)))
 764
 765 (defparameter **special-casefolds**
 766   '#.(with-open-file (stream
 767                      (merge-pathnames
 768                       (make-pathname
 769                        :directory
 770                        '(:relative :up :up "output")
 771                        :name "foldcases" :type "lisp-expr")
 772                       sb!xc:*compile-file-truename*)
 773                      :direction :input
 774                      :element-type 'character)
 775         (read stream)))
 776
 777 (defun has-case-p (char)
 778   ;; Bit 6 is the Unicode case flag, as opposed to the Common Lisp one
 779   (logbitp 6 (aref **character-misc-database** (+ 5 (misc-index char)))))
 780
 781 (defun char-uppercase (char)
 782   (if (has-case-p char)
 783       (let ((cp (car (char-case-info char))))
 784         (if (atom cp) (list (code-char cp)) (mapcar #'code-char cp)))
 785       (list char)))
 786
 787 (defun char-lowercase (char)
 788   (if (has-case-p char)
 789       (let ((cp (cdr (char-case-info char))))
 790         (if (atom cp) (list (code-char cp)) (mapcar #'code-char cp)))
 791       (list char)))
 792
 793 (defun char-titlecase (char)
 794   (unless (has-case-p char) (return-from char-titlecase (list char)))
 795   (let* ((cp (char-code char))
 796          (value (assoc cp **special-titlecases**)))
 797     (if value
 798         (if (atom (cdr value))
 799             (list (code-char (cdr value)))
 800             (mapcar #'code-char (cdr value)))
 801         (char-uppercase char))))
 802
 803 (defun char-foldcase (char)
 804   (unless (has-case-p char) (return-from char-foldcase (list char)))
 805   (let* ((cp (char-code char))
 806          (value (assoc cp **special-casefolds**)))
 807     (if value
 808         (if (atom (cdr value))
 809             (list (code-char (cdr value)))
 810             (mapcar #'code-char (cdr value)))
 811         (char-lowercase char))))
 812
 813 (defun string-somethingcase (fn string special-fn)
 814   (let (result (len (length string)))
 815     (loop for index from 0 below len
 816        for char = (char string index)
 817        for cased = (or (funcall special-fn char index len)
 818                        (funcall fn char))
 819        do (loop for c in (remove :none cased) do (push c result)))
 820     (setf result (nreverse result))
 821     (coerce result 'string)))
 822
 823 (declaim (type function sb!unix::posix-getenv))
 824 (defun get-user-locale ()
 825   (let ((raw-locale
 826          #!+(or win32 unix) (or (sb!unix::posix-getenv "LC_ALL")
 827                                 (sb!unix::posix-getenv "LANG"))
 828          #!-(or win32 unix) nil))
 829     (when raw-locale
 830       (let ((lang-code (string-upcase
 831                         (subseq raw-locale 0 (position #\_ raw-locale)))))
 832         (when lang-code
 833           (intern lang-code "KEYWORD"))))))
 834
 835
 836 (defun uppercase (string &key locale)
 837   "Returns the full uppercase of STRING according to the Unicode standard.
 838 The result is not guaranteed to have the same length as the input. If :LOCALE
 839 is NIL, no language-specific case transformations are applied. If :LOCALE is a
 840 keyword representing a two-letter ISO country code, the case transforms of that
 841 locale are used. If :LOCALE is T, the user's current locale is used (Unix and
 842 Win32 only)."
 843   (when (eq locale t) (setf locale (get-user-locale)))
 844   (string-somethingcase
 845    #'char-uppercase string
 846    #!-sb-unicode (constantly nil)
 847    #!+sb-unicode ;; code-char with a constant > 255 breaks the build
 848    #'(lambda (char index len)
 849        (declare (ignore len))
 850        (cond
 851          ((and (eql locale :lt) (char= char (code-char #x0307))
 852                   (loop for i from (1- index) downto 0
 853                      for c = (char string i)
 854                      do (case (combining-class c)
 855                           (0 (return (soft-dotted-p c)))
 856                           (230 (return nil))
 857                           (t t))
 858                      finally (return nil)))
 859           '(:none))
 860          ((and (or (eql locale :tr) (eql locale :az))
 861                (char= char #\i))
 862           (list (code-char #x0130)))
 863          (t nil)))))
 864
 865 (defun lowercase (string &key locale)
 866   "Returns the full lowercase of STRING according to the Unicode standard.
 867 The result is not guaranteed to have the same length as the input.
 868 :LOCALE has the same semantics as the :LOCALE argument to UPPERCASE."
 869   (when (eq locale t) (setf locale (get-user-locale)))
 870   (string-somethingcase
 871    #'char-lowercase string
 872    #!-sb-unicode (constantly nil)
 873    #!+sb-unicode
 874    #'(lambda (char index len)
 875        (cond
 876          ((and (char= char (code-char #x03A3))
 877                (loop for i from (1- index) downto 0
 878                   for c = (char string i)
 879                   do (cond ((cased-p c) (return t))
 880                            ((case-ignorable-p c))
 881                            (t (return nil)))
 882                   finally (return nil))
 883                (loop for i from (1+ index) below len
 884                   for c = (char string i)
 885                   do (cond ((cased-p c) (return nil))
 886                            ((case-ignorable-p c))
 887                            (t (return t)))
 888                   finally (return t)))
 889           (list (code-char #x03C2)))
 890        ((eql locale :lt)
 891         (mapcar
 892          #'code-char
 893          (cdr (or
 894                (assoc (char-code char)
 895                       '((#x00CC . (#x0069 #x0307 #x0300))
 896                         (#x00CD . (#x0069 #x0307 #x0301))
 897                         (#x0128 . (#x0069 #x0307 #x0303))))
 898                (and (loop for i from (1+ index) below len
 899                        for c = (char string i)
 900                        do (case (combining-class c)
 901                             (230 (return t))
 902                             (0 (return nil))
 903                             (t t))
 904                        finally (return nil))
 905                     (assoc (char-code char)
 906                            '((#x0049 . (#x0069 #x0307))
 907                              (#x004A . (#x006A #x0307))
 908                              (#x012E . (#x012F #x0307)))))))))
 909        ((or (eql locale :tr) (eql locale :az))
 910         (cond
 911           ((char= char (code-char #x0130)) (list #\i))
 912           ((and (char= char (code-char #x0307))
 913                 (loop for i from (1- index) downto 0
 914                    for c = (char string i)
 915                    do (case (combining-class c)
 916                         (0 (return (char= c #\I)))
 917                         (230 (return nil))
 918                         (t t))
 919                    finally (return nil)))
 920            '(:none))
 921           ((and (char= char #\I)
 922                 (loop for i from (1+ index) below len
 923                    for c = (char string i)
 924                    do (case (combining-class c)
 925                         (0 (return t))
 926                         (230 (return (char/= c (code-char #x0307))))
 927                         (t t))
 928                    finally (return t)))
 929            (list (code-char #x0131)))
 930           (t nil)))
 931        (t nil)))))
 932
 933 (defun titlecase (string &key locale)
 934   "Returns the titlecase of STRING. The resulting string can
 935 be longer than the input.
 936 :LOCALE has the same semantics as the :LOCALE argument to UPPERCASE."
 937   (when (eq locale t) (setf locale (get-user-locale)))
 938   (let ((words (words string))
 939         (cased nil))
 940    (loop for word in words
 941       for first-cased = (or (position-if #'cased-p word) 0)
 942       for pre = (subseq word 0 first-cased)
 943       for initial = (char word first-cased)
 944       for rest = (subseq word (1+ first-cased))
 945       do (let ((up (char-titlecase initial)) (down (lowercase rest)))
 946            #!+sb-unicode
 947            (when (and (or (eql locale :tr) (eql locale :az))
 948                       (eql initial #\i))
 949              (setf up (list (code-char #x0130))))
 950            #!+sb-unicode
 951            (when (and (eql locale :lt)
 952                       (soft-dotted-p initial)
 953                       (eql (char down
 954                                  (position-if
 955                                   #'(lambda (c)
 956                                       (or (eql (combining-class c) 0)
 957                                           (eql (combining-class c) 230))) down))
 958                            (code-char #x0307)))
 959              (setf down (delete (code-char #x0307) down :count 1)))
 960            (push (concatenate 'string pre up down) cased)))
 961    (apply #'concatenate 'string (nreverse cased))))
 962
 963 (defun casefold (string)
 964   "Returns the full casefolding of STRING according to the Unicode standard.
 965 Casefolding removes case information in a way that allows the results to be used
 966 for case-insensitive comparisons.
 967 The result is not guaranteed to have the same length as the input."
 968   (string-somethingcase #'char-foldcase string (constantly nil)))
 969
 970 \f
 971 ;;; Unicode break algorithms
 972 ;;; In all the breaking methods:
 973 ;;; (brk) establishes a break between `first` and `second`
 974 ;;; (nobrk) prevents a break between `first` and `second`
 975 ;;; Setting flag=T/state=:nobrk-next prevents a break between `second` and `htird`
 976
 977 ;; Word breaking sets this to make their algorithms less tricky
 978 (defvar *other-break-special-graphemes* nil)
 979 (defun grapheme-break-class (char)
 980   "Returns the grapheme breaking class of CHARACTER, as specified in UAX #29."
 981   (let ((cp (when char (char-code char)))
 982         (gc (when char (general-category char)))
 983         (not-spacing-mark
 984          #(#x102B #x102C #x1038 #x1062 #x1063 #x1064 #x1067 #x1068 #x1069
 985            #x106A #x106B #x106C #x106D #x1083 #x1087 #x1088 #x1089 #x108A
 986            #x108B #x108C #x108F #x109A #x109B #x109C #x19B0 #x19B1 #x19B2
 987            #x19B3 #x19B4 #x19B8 #x19B9 #x19BB #x19BC #x19BD #x19BE #x19BF
 988            #x19C0 #x19C8 #x19C9 #x1A61 #x1A63 #x1A64 #xAA7B #xAA7D)))
 989     (cond
 990       ((not char) nil)
 991       ((= cp 10) :LF)
 992       ((= cp 13) :CR)
 993       ((or (member gc '(:Mn :Me))
 994            (proplist-p char :other-grapheme-extend)
 995            (and *other-break-special-graphemes*
 996                 (member gc '(:Mc :Cf)) (not (<= #x200B cp #x200D))))
 997        :extend)
 998       ((or (member gc '(:Zl :Zp :Cc :Cs :Cf))
 999            ;; From Cn and Default_Ignorable_Code_Point
1000            (eql cp #x2065) (eql cp #xE0000)
1001            (<= #xFFF0 cp #xFFF8)
1002            (<= #xE0002 cp #xE001F)
1003            (<= #xE0080 cp #xE00FF)
1004            (<= #xE01F0 cp #xE0FFF)) :control)
1005       ((<= #x1F1E6 cp #x1F1FF) :regional-indicator)
1006       ((and (or (eql gc :Mc)
1007                 (eql cp #x0E33) (eql cp #x0EB3))
1008             (not (binary-search cp not-spacing-mark))) :spacing-mark)
1009       (t (hangul-syllable-type char)))))
1010
1011 (macrolet ((def (name extendedp)
1012              `(defun ,name (function string)
1013                 (do ((length (length string))
1014                      (start 0)
1015                      (end 1 (1+ end))
1016                      (c1 nil)
1017                      (c2 (and (> (length string) 0) (grapheme-break-class (char string 0)))))
1018                     ((>= end length)
1019                      (if (= end length) (progn (funcall function string start end) nil)))
1020                   (flet ((brk () (funcall function string start end) (setf start end)))
1021                     (declare (truly-dynamic-extent #'brk))
1022                     (shiftf c1 c2 (grapheme-break-class (char string end)))
1023                     (cond
1024                       ((and (eql c1 :cr) (eql c2 :lf)))
1025                       ((or (member c1 '(:control :cr :lf))
1026                            (member c2 '(:control :cr :lf)))
1027                        (brk))
1028                       ((or (and (eql c1 :l) (member c2 '(:l :v :lv :lvt)))
1029                            (and (or (eql c1 :v) (eql c1 :lv))
1030                                 (or (eql c2 :v) (eql c2 :t)))
1031                            (and (eql c2 :t) (or (eql c1 :lvt) (eql c1 :t)))))
1032                       ((and (eql c1 :regional-indicator) (eql c2 :regional-indicator)))
1033                       ((eql c2 :extend))
1034                       ,@(when extendedp
1035                               `(((or (eql c2 :spacing-mark) (eql c1 :prepend)))))
1036                       (t (brk))))))))
1037   (def map-legacy-grapheme-boundaries nil)
1038   (def map-grapheme-boundaries t))
1039
1040 (macrolet ((def (name mapper)
1041              `(defun ,name (function string)
1042                 (let ((array (make-array 0 :element-type (array-element-type string) :adjustable t :displaced-to string)))
1043                   (flet ((fun (string start end)
1044                            (declare (type string string))
1045                            (funcall function (adjust-array array (- end start) :displaced-to string :displaced-index-offset start))))
1046                     (declare (truly-dynamic-extent #'fun))
1047                     (,mapper #'fun string))))))
1048   (def map-legacy-graphemes map-legacy-grapheme-boundaries)
1049   (def map-graphemes map-grapheme-boundaries))
1050
1051 (defun graphemes (string)
1052   "Breaks STRING into graphemes acording to the default
1053 grapheme breaking rules specified in UAX #29, returning a list of strings."
1054   (let (result)
1055     (map-graphemes (lambda (a) (push (subseq a 0) result)) string)
1056     (nreverse result)))
1057
1058 (defun word-break-class (char)
1059   "Returns the word breaking class of CHARACTER, as specified in UAX #29."
1060   ;; Words use graphemes as characters to deal with the ignore rule
1061   (when (listp char) (setf char (car char)))
1062   (let ((cp (when char (char-code char)))
1063         (gc (when char (general-category char)))
1064         (newlines #(#xB #xC #x0085 #x0085 #x2028 #x2029))
1065         (also-katakana
1066          #(#x3031 #x3035 #x309B #x309C
1067            #x30A0 #x30A0 #x30FC #x30FC
1068            #xFF70 #xFF70))
1069         (midnumlet #(#x002E #x2018 #x2019 #x2024 #xFE52 #xFF07 #xFF0E))
1070         (midletter
1071          #(#x003A #x00B7 #x002D7 #x0387 #x05F4 #x2027 #xFE13 #xFE55 #xFF1A))
1072         (midnum
1073          ;; Grepping of Line_Break = IS adjusted per UAX #29
1074          #(#x002C #x003B #x037E #x0589 #x060C #x060D #x066C #x07F8 #x2044
1075            #xFE10 #xFE14 #xFE50 #xFE54 #xFF0C #xFF1B)))
1076     (cond
1077       ((not char) nil)
1078       ((= cp 10) :LF)
1079       ((= cp 13) :CR)
1080       ((= cp #x27) :single-quote)
1081       ((= cp #x22) :double-quote)
1082       ((ordered-ranges-member cp newlines) :newline)
1083       ((or (eql (grapheme-break-class char) :extend)
1084            (eql gc :mc)) :extend)
1085       ((<= #x1F1E6 cp #x1F1FF) :regional-indicator)
1086       ((and (eql gc :Cf) (not (<= #x200B cp #x200D))) :format)
1087       ((or (eql (script char) :katakana)
1088            (ordered-ranges-member cp also-katakana)) :katakana)
1089       ((and (eql (script char) :Hebrew) (eql gc :lo)) :hebrew-letter)
1090       ((and (or (alphabetic-p char) (= cp #x05F3))
1091             (not (or (ideographic-p char)
1092                      (eql (line-break-class char) :sa)
1093                      (eql (script char) :hiragana)))) :aletter)
1094       ((binary-search cp midnumlet) :midnumlet)
1095       ((binary-search cp midletter) :midletter)
1096       ((binary-search cp midnum) :midnum)
1097       ((or (and (eql gc :Nd) (not (<= #xFF10 cp #xFF19))) ;Fullwidth digits
1098            (eql cp #x066B)) :numeric)
1099       ((eql gc :Pc) :extendnumlet)
1100       (t nil))))
1101
1102 (defmacro flatpush (thing list)
1103   (let ((%thing (gensym)) (%i (gensym)))
1104     `(let ((,%thing ,thing))
1105        (if (listp ,%thing)
1106            (dolist (,%i ,%thing)
1107              (push ,%i ,list))
1108            (push ,%thing ,list)))))
1109
1110 (defun words (string)
1111   "Breaks STRING into words acording to the default
1112 word breaking rules specified in UAX #29. Returns a list of strings"
1113   (let ((chars (mapcar
1114                  #'(lambda (s)
1115                      (let ((l (coerce s 'list)))
1116                        (if (cdr l) l (car l))))
1117                  (let ((*other-break-special-graphemes* t)) (graphemes string))))
1118          words word flag)
1119     (flatpush (car chars) word)
1120     (do ((first (car chars) second)
1121          (tail (cdr chars) (cdr tail))
1122          (second (cadr chars) (cadr tail)))
1123         ((not first) (nreverse (mapcar #'(lambda (l) (coerce l 'string)) words)))
1124       (flet ((brk () (push (nreverse word) words) (setf word nil) (flatpush second word))
1125              (nobrk () (flatpush second word)))
1126         (let ((c1 (word-break-class first))
1127               (c2 (word-break-class second))
1128               (c3 (when (and tail (cdr tail)) (word-break-class (cadr tail)))))
1129           (cond
1130             (flag (nobrk) (setf flag nil))
1131             ;; CR+LF are bound together by the grapheme clustering
1132             ((or (eql c1 :newline) (eql c1 :cr) (eql c1 :lf)
1133                  (eql c2 :newline) (eql c2 :cr) (eql c2 :lf)) (brk))
1134             ((or (eql c2 :format) (eql c2 :extend)) (nobrk))
1135             ((and (or (eql c1 :aletter) (eql c1 :hebrew-letter))
1136                   (or (eql c2 :aletter) (eql c2 :hebrew-letter))) (nobrk))
1137             ((and (or (eql c1 :aletter) (eql c1 :hebrew-letter))
1138                   (member c2 '(:midletter :midnumlet :single-quote))
1139                   (or (eql c3 :aletter) (eql c3 :hebrew-letter)))
1140              (nobrk) (setf flag t)) ; Handle the multiple breaks from this rule
1141             ((and (eql c1 :hebrew-letter) (eql c2 :double-quote)
1142                   (eql c3 :hebrew-letter))
1143              (nobrk) (setf flag t))
1144             ((and (eql c1 :hebrew-letter) (eql c2 :single-quote)) (nobrk))
1145             ((or (and (eql c1 :numeric) (member c2 '(:numeric :aletter :hebrew-letter)))
1146                  (and (eql c2 :numeric) (member c1 '(:numeric :aletter :hebrew-letter))))
1147              (nobrk))
1148             ((and (eql c1 :numeric)
1149                   (member c2 '(:midnum :midnumlet :single-quote))
1150                   (eql c3 :numeric))
1151              (nobrk) (setf flag t))
1152             ((and (eql c1 :katakana) (eql c2 :katakana)) (nobrk))
1153             ((or (and (member c1
1154                               '(:aletter :hebrew-letter :katakana
1155                                 :numeric :extendnumlet)) (eql c2 :extendnumlet))
1156                  (and (member c2
1157                               '(:aletter :hebrew-letter :katakana
1158                                 :numeric :extendnumlet)) (eql c1 :extendnumlet)))
1159              (nobrk))
1160             ((and (eql c1 :regional-indicator) (eql c2 :regional-indicator)) (nobrk))
1161             (t (brk))))))))
1162
1163 (defun sentence-break-class (char)
1164   "Returns the sentence breaking class of CHARACTER, as specified in UAX #29."
1165   (when (listp char) (setf char (car char)))
1166   (let ((cp (when char (char-code char)))
1167         (gc (when char (general-category char)))
1168         (aterms #(#x002E #x2024 #xFE52 #xFF0E))
1169         (scontinues
1170          #(#x002C #x002D #x003A #x055D #x060C #x060D #x07F8 #x1802 #x1808
1171            #x2013 #x2014 #x3001 #xFE10 #xFE11 #xFE13 #xFE31 #xFE32 #xFE50
1172            #xFE51 #xFE55 #xFE58 #xFE63 #xFF0C #xFF0D #xFF1A #xFF64)))
1173     (cond
1174       ((not char) nil)
1175       ((= cp 10) :LF)
1176       ((= cp 13) :CR)
1177       ((or (eql (grapheme-break-class char) :extend)
1178            (eql gc :mc)) :extend)
1179       ((or (eql cp #x0085) (<= #x2028 cp #x2029)) :sep)
1180       ((and (eql gc :Cf) (not (<= #x200C cp #x200D))) :format)
1181       ((whitespace-p char) :sp)
1182       ((lowercase-p char) :lower)
1183       ((or (uppercase-p char) (eql gc :Lt)) :upper)
1184       ((or (alphabetic-p char) (eql cp #x00A0) (eql cp #x05F3)) :oletter)
1185       ((or (and (eql gc :Nd) (not (<= #xFF10 cp #xFF19))) ;Fullwidth digits
1186            (<= #x066B cp #x066C)) :numeric)
1187       ((binary-search cp aterms) :aterm)
1188       ((binary-search cp scontinues) :scontinue)
1189       ((proplist-p char :sterm) :sterm)
1190       ((and (or (member gc '(:Po :Ps :Pe :Pf :Pi))
1191                 (eql (line-break-class char) :qu))
1192             (not (eql cp #x05F3))) :close)
1193       (t nil))))
1194
1195 (defun sentence-prebreak (string)
1196   "Pre-combines some sequences of characters to make the sentence-break
1197 algorithm simpler..
1198 Specifically,
1199 - Combines any character with the following extend of format characters
1200 - Combines CR + LF into '(CR LF)
1201 - Combines any run of :cp*:close* into one character"
1202   (let ((chars (coerce string 'list))
1203         cluster clusters last-seen sp-run)
1204     (labels ((flush () (if (cdr cluster) (push (nreverse cluster) clusters)
1205                            (if cluster (push (car cluster) clusters)))
1206                     (setf cluster nil))
1207              (brk (x)
1208                (flush) (push x clusters))
1209              (nobrk (x) (push x cluster)))
1210     (loop for ch in chars
1211        for type = (sentence-break-class ch)
1212        do (cond
1213             ((and (eql last-seen :cr) (eql type :lf)) (nobrk ch) (flush) (setf last-seen nil))
1214             ((eql last-seen :cr) (brk ch) (setf last-seen nil))
1215             ((eql type :cr) (nobrk ch) (setf last-seen :cr))
1216             ((eql type :lf) (brk ch) (setf last-seen nil))
1217             ((eql type :sep) (brk ch) (setf last-seen nil))
1218             ((and last-seen (or (eql type :extend) (eql type :format)))
1219              (nobrk ch))
1220             ((eql type :close)
1221              (unless (eql last-seen :close) (flush))
1222              (nobrk ch) (setf last-seen :close sp-run nil))
1223             ((eql type :sp)
1224              (unless (or (and (not sp-run) (eql last-seen :close)) (eql last-seen :sp))
1225                (flush) (setf sp-run t))
1226              (nobrk ch) (setf last-seen :sp))
1227             (t (flush) (nobrk ch) (setf last-seen type sp-run nil))))
1228     (flush) (nreverse clusters))))
1229
1230 (defun sentences (string)
1231   "Breaks STRING into sentences acording to the default
1232 sentence breaking rules specified in UAX #29"
1233   (let ((special-handling '(:close :sp :sep :cr :lf :scontinue :sterm :aterm))
1234         (chars (sentence-prebreak string))
1235         sentence sentences state)
1236     (flatpush (car chars) sentence)
1237     (do ((first (car chars) second)
1238          (tail (cdr chars) (cdr tail))
1239          (second (cadr chars) (cadr tail))
1240          (third (caddr chars) (caddr tail)))
1241         ((not first)
1242          (progn
1243            ; Shake off last sentence
1244            (when sentence (push (nreverse sentence) sentences))
1245            (nreverse (mapcar #'(lambda (l) (coerce l 'string)) sentences))))
1246       (flet ((brk () (push (nreverse sentence) sentences)
1247                   (setf sentence nil) (flatpush second sentence))
1248              (nobrk () (flatpush second sentence)))
1249       (let ((c1 (sentence-break-class first))
1250             (c2 (sentence-break-class second))
1251             (c3 (sentence-break-class third)))
1252         (cond
1253           ((eql state :brk-next) (brk) (setf state nil))
1254           ((eql state :nobrk-next) (nobrk) (setf state nil))
1255           ((member c1 '(:sep :cr :lf)) (brk))
1256           ((and (eql c1 :aterm) (eql c2 :numeric)) (nobrk))
1257           ((and (eql c1 :upper) (eql c2 :aterm)
1258                 (eql c3 :upper)) (nobrk) (setf state :nobrk-next))
1259           ((or (and (member c1 '(:sterm :aterm)) (member c2 '(:close :sp))
1260                     (member c3 '(:scontinue :sterm :aterm)))
1261                (and (member c1 '(:sterm :aterm))
1262                     (member c2 '(:scontinue :sterm :aterm))))
1263            (nobrk) (when (member c2 '(:close :sp)) (setf state :nobrk-next)))
1264           ((and (member c1 '(:sterm :aterm)) (member c2 '(:close :sp))
1265                 (member c3 '(:sep :cr :lf)))
1266            (nobrk) (setf state :nobrk-next)) ;; Let the linebreak call (brk)
1267           ((and (member c1 '(:sterm :aterm)) (member c2 '(:sep :cr :lf)))
1268            (nobrk)) ; Doesn't trigger rule 8
1269           ((eql c1 :sterm) ; Not ambiguous anymore, rule 8a already handled
1270            (if (member c2 '(:close :sp))
1271                (progn (nobrk) (setf state :brk-next))
1272                (brk)))
1273           ((and (eql c2 :sterm) third (not (member c3 special-handling)))
1274            (nobrk) (setf state :brk-next)) ; STerm followed by nothing important
1275           ((or (eql c1 :aterm)
1276                (and (eql c2 :aterm) third
1277                     (not (member c3 special-handling)) (not (eql c3 :numeric))))
1278            ; Finally handle rule 8
1279            (if (loop for c in
1280                     (if (and third (not (or (member c3 special-handling)
1281                                             (eql c3 :numeric))))
1282                         (cdr tail) tail)
1283                   for type = (sentence-break-class c) do
1284                     (when (member type '(:oletter :upper :sep :cr :lf
1285                                          :sterm :aterm))
1286                       (return nil))
1287                     (when (eql type :lower) (return t)) finally (return nil))
1288                ; Ambiguous case
1289                (progn (nobrk) (setf state :nobrk-next))
1290                ; Otherwise
1291                (if (member c2 '(:close :sp :aterm))
1292                    (progn (nobrk) (setf state :brk-next))
1293                    (brk))))
1294           (t (nobrk))))))))
1295
1296 (defun line-prebreak (string)
1297   (let ((chars (coerce string 'list))
1298         cluster clusters last-seen)
1299     (loop for char in chars
1300        for type = (line-break-class char)
1301        do
1302          (when
1303              (and cluster
1304                   (or
1305                    (not (eql type :cm))
1306                    (and (eql type :cm)
1307                         (member last-seen '(nil :BK :CR :LF :NL :SP :ZW)))))
1308            (if (cdr cluster)
1309                (push (nreverse cluster) clusters)
1310                (push (car cluster) clusters))
1311            (setf cluster nil))
1312          (unless (eql type :cm) (setf last-seen type))
1313          (push char cluster))
1314     (if (cdr cluster)
1315         (push (nreverse cluster) clusters)
1316         (push (car cluster) clusters))
1317     (nreverse clusters)))
1318
1319 (defun line-break-annotate (string)
1320   (let ((chars (line-prebreak string))
1321         first second t1 t2 tail (ret (list :cant))
1322         state after-spaces)
1323     (macrolet ((cmpush (thing)
1324                  (let ((gthing (gensym)))
1325                    `(let ((,gthing ,thing))
1326                       (if (listp ,gthing)
1327                           (loop for (c next) on ,gthing do
1328                                (push c ret)
1329                                (when next (push :cant ret)))
1330                           (push ,thing ret)))))
1331                (between (a b action)
1332                  (let ((atest (if (eql a :any) t
1333                                   (if (listp a)
1334                                       `(member t1 ,a)
1335                                       `(eql t1 ,a))))
1336                        (btest (if (eql b :any) t
1337                                   (if (listp b)
1338                                       `(member t2 ,b)
1339                                       `(eql t2 ,b)))))
1340                  `(when (and ,atest ,btest)
1341                     (cmpush ,action)
1342                     (cmpush second)
1343                     (go tail))))
1344                (after-spaces (a b action)
1345                  (let ((atest (if (eql a :any) t
1346                                   (if (listp a)
1347                                       `(member t1 ,a)
1348                                       `(eql t1 ,a))))
1349                        (btest (if (eql b :any) t
1350                                   (if (listp b)
1351                                       `(member type ,b)
1352                                       `(eql type ,b)))))
1353                    `(when
1354                         (and ,atest
1355                              (loop for c in tail
1356                                 for type = (line-break-class c :resolve t)
1357                                 do
1358                                   (when (not (eql type :sp))
1359                                     (return ,btest))))
1360                       (if (eql t2 :sp)
1361                          (progn (cmpush :cant)
1362                                 (cmpush second)
1363                                 (setf state :eat-spaces)
1364                                 (setf after-spaces ,action)
1365                                 (go tail))
1366                          (progn (cmpush ,action)
1367                                 (cmpush second)
1368                                 (go tail)))))))
1369
1370       (cmpush (car chars))
1371       (setf first (car chars))
1372       (setf tail (cdr chars))
1373       (setf second (car tail))
1374       (tagbody
1375          top
1376          (when (not first) (go end))
1377          (setf t1 (line-break-class first :resolve t))
1378          (setf t2 (line-break-class second :resolve t))
1379          (between :any :nil :must)
1380          (when (and (eql state :eat-spaces) (eql t2 :sp))
1381             (cmpush :cant) (cmpush second) (go tail))
1382          (between :bk :any :must)
1383          (between :cr :lf :cant)
1384          (between '(:cr :lf :nl) :any :must)
1385          (between :any '(:zw :bk :cr :lf :nl) :cant)
1386          (when after-spaces (cmpush after-spaces) (cmpush second)
1387                (setf state nil after-spaces nil) (go tail))
1388          (after-spaces :zw :any :can)
1389          (between :any :wj :cant)
1390          (between :wj :any :cant)
1391          (between :gl :any :cant)
1392          (between '(:ZW :WJ :SY :SG :SA :RI :QU :PR :PO :OP :NU :NS :NL
1393                     :LF :IS :IN :ID :HL :GL :EX :CR :CP :CM :CL :CJ :CB
1394                     :BK :BB :B2 :AL :AI :JL :JV :JT :H2 :H3 :XX)
1395                   :gl :cant)
1396          (between :any '(:cl :cp :ex :is :sy) :cant)
1397          (after-spaces :op :any :cant)
1398          (after-spaces :qu :op :cant)
1399          (after-spaces '(:cl :cp) :ns :cant)
1400          (after-spaces :b2 :b2 :cant)
1401          (between :any :sp :cant) ;; Goes here to deal with after-spaces
1402          (between :sp :any :can)
1403          (between :any :qu :cant)
1404          (between :qu :any :cant)
1405          (between :any :cb :can)
1406          (between :cb :any :can)
1407          (between :any '(:ba :hy :ns) :cant)
1408          (between :bb :any :cant)
1409          (when (and (eql t1 :hl) (eql t2 :hy))
1410            (cmpush :cant) (cmpush second)
1411            (setf after-spaces :can) (go tail))
1412          (between '(:al :hl :id :in :nu) :in :cant)
1413          (between :id :po :cant)
1414          (between '(:al :hl) :nu :cant)
1415          (between '(:nu :po) '(:al :hl) :cant)
1416          (between :pr '(:id :al :hl) :cant)
1417          (between '(:cl :cp :nu) '(:po :pr) :cant)
1418          (between :nu '(:po :pr :nu) :cant)
1419          (between '(:po :pr) :op :cant)
1420          (between '(:po :pr :hy :is :sy) :nu :cant)
1421          (between :jl '(:jl :jv :h2 :h3) :cant)
1422          (between '(:jv :h2) '(:jv :jt) :cant)
1423          (between '(:jt :h3) :jt :cant)
1424          (between '(:jl :jv :jt :h2 :h3) '(:in :po) :cant)
1425          (between :pr '(:jl :jv :jt :h2 :h3) :cant)
1426          (between '(:al :hl :is) '(:al :hl) :cant)
1427          (between '(:al :hl :nu) :op :cant)
1428          (between :cp '(:al :hl :nu) :cant)
1429          (between :ri :ri :cant)
1430          (between :any :any :can)
1431          tail
1432          (setf first second)
1433          (setf tail (cdr tail))
1434          (setf second (car tail))
1435          (go top)
1436          end)
1437       ;; LB3 satisfied by (:any :nil) -> :must
1438       (setf ret (nreverse ret))
1439       ret)))
1440
1441 (defun break-list-at (list n)
1442   (let ((tail list) (pre-tail nil))
1443     (loop repeat n do (setf pre-tail tail) (setf tail (cdr tail)))
1444     (setf (cdr pre-tail) nil)
1445     (values list tail)))
1446
1447 (defun lines (string &key (margin *print-right-margin*))
1448   "Breaks STRING into lines that are no wider than :MARGIN according to the
1449 line breaking rules outlined in UAX #14. Combining marks will always be kept
1450 together with their base characters, and spaces (but not other types of
1451 whitespace) will be removed from the end of lines. If :MARGIN is unspecified,
1452 it defaults to 80 characters"
1453   (when (string= string "") (return-from lines (list "")))
1454   (unless margin (setf margin 80))
1455   (do* ((chars (line-break-annotate string))
1456         line lines (filled 0) last-break-distance
1457         (break-type (car chars) (car tail))
1458         (char (cadr chars) (cadr tail))
1459         (tail (cddr chars) (cddr tail)))
1460        ((not break-type)
1461         (mapcar #'(lambda (s) (coerce s 'string)) (nreverse lines)))
1462     (ecase break-type
1463       (:cant
1464        (push char line)
1465        (unless (eql (line-break-class char) :CM)
1466          (incf filled))
1467        (when last-break-distance (incf last-break-distance)))
1468       (:can
1469        (push char line)
1470        (setf last-break-distance 1)
1471        (incf filled))
1472       (:must
1473        (push char line)
1474        (setf last-break-distance 1)
1475        (incf filled)
1476        (go break)))
1477     (if (> filled margin)
1478         (go break)
1479         (go next))
1480    break
1481     (when (not last-break-distance)
1482       ;; If we don't have any line breaks, remove the last thing we added that
1483       ;; takes up space, and all its combining marks
1484       (setf last-break-distance
1485             (1+ (loop for c in line while (eql (line-break-class c) :cm) summing 1))))
1486     (multiple-value-bind (next-line this-line) (break-list-at line last-break-distance)
1487       (loop while (eql (line-break-class (car this-line)) :sp)
1488          do (setf this-line (cdr this-line)))
1489       (push (nreverse this-line) lines)
1490       (setf line next-line)
1491       (setf filled (length line))
1492       (setf last-break-distance nil))
1493    next))
1494
1495 \f
1496 ;;; Collation
1497 (defconstant +maximum-variable-primary-element+
1498   #.(with-open-file (stream
1499                      (merge-pathnames
1500                       (make-pathname
1501                        :directory
1502                        '(:relative :up :up "output")
1503                        :name "other-collation-info" :type "lisp-expr")
1504                       sb!xc:*compile-file-truename*)
1505                      :direction :input
1506                      :element-type 'character)
1507       (read stream)))
1508
1509 (defun unpack-collation-key (key)
1510   (declare (type (simple-array (unsigned-byte 32) (*)) key))
1511   (loop for value across key
1512         collect
1513         (list (ldb (byte 16 16) value)
1514               (ldb (byte 11 5) value)
1515               (ldb (byte 5 0) value))))
1516
1517 (declaim (inline variable-p))
1518 (defun variable-p (x)
1519   (<= 1 x +maximum-variable-primary-element+))
1520
1521 (defun collation-key (string start end)
1522   (let (char1
1523         (char2 (code-char 0))
1524         (char3 (code-char 0)))
1525     (case (- end start)
1526       (1 (setf char1 (char string start)))
1527       (2 (setf char1 (char string start)
1528                char2 (char string (+ start 1))))
1529       (3 (setf char1 (char string start)
1530                char2 (char string (+ start 1))
1531                char3 (char string (+ start 2))))
1532       (t
1533        ;; There are never more than three characters in a contraction, right?
1534        (return-from collation-key nil)))
1535     (let ((packed-key (gethash (pack-3-codepoints
1536                                 (char-code char1)
1537                                 (char-code char2)
1538                                 (char-code char3))
1539                                **character-collations**)))
1540       (if packed-key
1541           (unpack-collation-key packed-key)
1542           (when (char= (code-char 0) char2 char3)
1543             (let* ((cp (char-code char1))
1544                    (base
1545                      (cond ((not (proplist-p char1 :unified-ideograph))
1546                             #xFBC0)
1547                            ((or (<= #x4E00 cp #x9FFF)
1548                                 (<= #xF900 cp #xFAFF))
1549                             #xFB40)
1550                            (t
1551                             #xFB80)))
1552                    (a (+ base (ash cp -15)))
1553                    (b (logior #.(ash 1 15) (logand cp #x7FFFF))))
1554               (list (list a #x20 #x2) (list b 0 0))))))))
1555
1556 (defun sort-key (string)
1557   (let* ((str (normalize-string string :nfd))
1558          (i 0) (len (length str)) max-match new-i
1559          sort-key
1560          after-variable)
1561     (loop while (< i len)
1562           do
1563           (loop for offset from 1 to 3
1564                 for index = (+ i offset)
1565                 while (<= index len)
1566                 do
1567                 (let ((key (collation-key str i index)))
1568                   (when key
1569                     (setf max-match key
1570                           new-i index))))
1571           (loop for index from new-i below len
1572                 for char = (char str index)
1573                 for previous-combining-class = combining-class
1574                 for combining-class = (combining-class char)
1575                 until (eql combining-class 0)
1576                 unless (and (>= (- index new-i) 1)
1577                             ;; Combiners are sorted, we only have to look back
1578                             ;; one step (see canonically-compose)
1579                             (>= (combining-class (char str (1- index)))
1580                                 combining-class))
1581                 do
1582                 (rotatef (char str new-i) (char str index))
1583                 (let ((key (collation-key str i (1+ new-i))))
1584                   (if key
1585                       (setf max-match key
1586                             new-i (1+ new-i))
1587                       (rotatef (char str new-i) (char str index)))))
1588           (loop for key in max-match do (push key sort-key))
1589           (setf i new-i))
1590     (macrolet ((push-non-zero (obj place)
1591                  `(when (/= ,obj 0)
1592                     (push ,obj ,place))))
1593       (let (primary secondary tertiary quatenary)
1594         (loop for (k1 k2 k3) in (nreverse sort-key)
1595               do
1596               (cond
1597                 ((= k1 k2 k3 0))
1598                 ((variable-p k1)
1599                  (setf after-variable t)
1600                  (push k1 quatenary))
1601                 ((/= k1 0)
1602                  (setf after-variable nil)
1603                  (push k1 primary)
1604                  (push-non-zero k2 secondary)
1605                  (push-non-zero k3 tertiary)
1606                  (push #xFFFF quatenary))
1607                 ((/= k3 0)
1608                  (unless after-variable
1609                    (push-non-zero k2 secondary)
1610                    (push k3 tertiary)
1611                    (push #xFFFF quatenary)))))
1612         (concatenate 'vector
1613                      (nreverse primary) #(0) (nreverse secondary) #(0)
1614                      (nreverse tertiary) #(0) (nreverse quatenary))))))
1615
1616 (defun vector< (vector1 vector2)
1617   (loop for i across vector1
1618         for j across vector2
1619         do
1620         (cond ((< i j) (return-from vector< t))
1621               ((> i j) (return-from vector< nil))))
1622   ;; If there's no differences, shortest vector wins
1623   (< (length vector1) (length vector2)))
1624
1625 (defun unicode= (string1 string2 &key (start1 0) end1 (start2 0) end2 (strict t))
1626   "Determines whether STRING1 and STRING2 are canonically equivalent according
1627 to Unicode. The START and END arguments behave like the arguments to STRING=.
1628 If :STRICT is NIL, UNICODE= tests compatibility equavalence instead."
1629   (let ((str1 (normalize-string (subseq string1 start1 end1) (if strict :nfd :nfkd)))
1630         (str2 (normalize-string (subseq string2 start2 end2) (if strict :nfd :nfkd))))
1631     (string= str1 str2)))
1632
1633 (defun unicode-equal (string1 string2 &key (start1 0) end1 (start2 0) end2 (strict t))
1634   "Determines whether STRING1 and STRING2 are canonically equivalent after
1635 casefoldin8 (that is, ignoring case differences) according to Unicode. The
1636 START and END arguments behave like the arguments to STRING=. If :STRICT is
1637 NIL, UNICODE= tests compatibility equavalence instead."
1638   (let ((str1 (normalize-string (subseq string1 start1 end1) (if strict :nfd :nfkd)))
1639         (str2 (normalize-string (subseq string2 start2 end2) (if strict :nfd :nfkd))))
1640     (string=
1641      (normalize-string (casefold str1) (if strict :nfd :nfkd))
1642      (normalize-string (casefold str2) (if strict :nfd :nfkd)))))
1643
1644 (defun unicode< (string1 string2 &key (start1 0) end1 (start2 0) end2)
1645   "Determines whether STRING1 sorts before STRING2 using the Unicode Collation
1646 Algorithm, The function uses an untailored Default Unicode Collation Element Table
1647 to produce the sort keys. The function uses the Shifted method for dealing
1648 with variable-weight characters, as described in UTS #10"
1649   (let* ((s1 (subseq string1 start1 end1))
1650          (s2 (subseq string2 start2 end2))
1651          (k1 (sort-key s1)) (k2 (sort-key s2)))
1652     (if (equalp k1 k2)
1653         (string< (normalize-string s1 :nfd) (normalize-string s2 :nfd))
1654         (vector< k1 k2))))
1655
1656 (defun unicode<= (string1 string2 &key (start1 0) end1 (start2 0) end2)
1657   "Tests if STRING1 and STRING2 are either UNICODE< or UNICODE="
1658   (or
1659    (unicode= string1 string2 :start1 start1 :end1 end1
1660              :start2 start2 :end2 end2)
1661    (unicode< string1 string2 :start1 start1 :end1 end1
1662              :start2 start2 :end2 end2)))
1663
1664 (defun unicode> (string1 string2 &key (start1 0) end1 (start2 0) end2)
1665   "Tests if STRING2 is UNICODE< STRING1."
1666    (unicode< string2 string1 :start1 start2 :end1 end2
1667              :start2 start1 :end2 end1))
1668
1669 (defun unicode>= (string1 string2 &key (start1 0) end1 (start2 0) end2)
1670   "Tests if STRING1 and STRING2 are either UNICODE= or UNICODE>"
1671   (or
1672    (unicode= string1 string2 :start1 start1 :end1 end1
1673              :start2 start2 :end2 end2)
1674    (unicode> string1 string2 :start1 start1 :end1 end1
1675              :start2 start2 :end2 end2)))
1676
1677 \f
1678 ;;; Confusable detection
1679
1680 (defun canonically-deconfuse (string)
1681   (let (ret (i 0) new-i (len (length string))
1682             best-node)
1683     (loop while (< i len) do
1684          (loop for offset from 1 to 5
1685             while (<= (+ i offset) len)
1686             do
1687               (let ((node (gethash (subseq string i (+ i offset))
1688                                    **confusables**)))
1689                 (when node (setf best-node node new-i (+ i offset)))))
1690          (cond
1691            (best-node (push best-node ret) (setf i new-i))
1692            (t (push (subseq string i (1+ i)) ret) (incf i)))
1693          (setf best-node nil new-i nil))
1694     (apply #'concatenate 'string (nreverse ret))))
1695
1696 (defun confusable-p (string1 string2 &key (start1 0) end1 (start2 0) end2)
1697   "Determines whether STRING1 and STRING2 could be visually confusable
1698 according to the IDNA confusableSummary.txt table"
1699     (let* ((form #!+sb-unicode :nfd #!-sb-unicode :nfc)
1700            (str1 (normalize-string (subseq string1 start1 end1) form))
1701            (str2 (normalize-string (subseq string2 start2 end2) form))
1702            (skeleton1 (normalize-string (canonically-deconfuse str1) form))
1703            (skeleton2 (normalize-string (canonically-deconfuse str2) form)))
1704       (string= skeleton1 skeleton2)))