src/code/target-unicode.lisp

   1 ;;;; Unicode functions
   2
   3 ;;;; This software is part of the SBCL system. See the README file for
   4 ;;;; more information.
   5 ;;;;
   6 ;;;; This software is derived from the CMU CL system, which was
   7 ;;;; written at Carnegie Mellon University and released into the
   8 ;;;; public domain. The software is in the public domain and is
   9 ;;;; provided with absolutely no warranty. See the COPYING and CREDITS
  10 ;;;; files for more information.
  11
  12 (in-package "SB!UNICODE")
  13
;;; Numeric-value table.  The value is read at read time (via #.) from
;;; the file "numerics.lisp-expr" in the "output" directory two levels
;;; above the file being compiled, and is declared to be a SIMPLE-VECTOR.
;;; NUMERIC-VALUE searches it by code point using
;;; DOUBLE-VECTOR-BINARY-SEARCH.
(declaim (type simple-vector **special-numerics**))
(sb!impl::defglobal **special-numerics**
  #.(with-open-file (stream
                     (merge-pathnames
                      (make-pathname
                       :directory
                       '(:relative :up :up "output")
                       :name "numerics" :type "lisp-expr")
                      sb!xc:*compile-file-truename*)
                     :direction :input
                     :element-type 'character)
      (read stream)))
  26
  27
;;; Code point ranges of the Unicode blocks.  The data is read at read
;;; time (via #.) from "blocks.lisp-expr" in the "output" directory two
;;; levels above the file being compiled, then coerced to a vector
;;; specialized on (UNSIGNED-BYTE 32).  CHAR-BLOCK searches it with
;;; ORDERED-RANGES-POSITION, which reads it as consecutive
;;; (first last) pairs.
(declaim (type (simple-array (unsigned-byte 32) (*)) **block-ranges**))
(sb!impl::defglobal **block-ranges**
  #.(sb!int:!coerce-to-specialized
     (with-open-file (stream
                      (merge-pathnames
                       (make-pathname
                        :directory
                        '(:relative :up :up "output")
                        :name "blocks" :type "lisp-expr")
                       sb!xc:*compile-file-truename*)
                      :direction :input
                      :element-type 'character)
       (read stream))
     '(unsigned-byte 32)))
  42
;;; At macroexpansion time, read three generated data files
;;; ("misc-properties", "confusables" and "bidi-mirrors", each of type
;;; "lisp-expr", in the "output" directory two levels above the file being
;;; compiled).  The expansion defines three globals initialized to the raw
;;; data as read, plus !UNICODE-PROPERTIES-COLD-INIT, which replaces the
;;; value of each global with a hash table built from the same data.
(macrolet ((unicode-property-init ()
             (let ((proplist-dump
                    (with-open-file (stream
                                     (merge-pathnames
                                      (make-pathname
                                       :directory
                                       '(:relative :up :up "output")
                                       :name "misc-properties" :type "lisp-expr")
                                      sb!xc:*compile-file-truename*)
                                     :direction :input
                                     :element-type 'character)
                      (read stream)))
                   (confusable-sets
                    (with-open-file (stream
                                     (merge-pathnames
                                      (make-pathname
                                       :directory
                                       '(:relative :up :up "output")
                                       :name "confusables" :type "lisp-expr")
                                      sb!xc:*compile-file-truename*)
                                     :direction :input
                                     :element-type 'character)
                      (read stream)))
                   (bidi-mirroring-list
                    (with-open-file (stream
                                     (merge-pathnames
                                      (make-pathname
                                       :directory
                                       '(:relative :up :up "output")
                                       :name "bidi-mirrors" :type "lisp-expr")
                                      sb!xc:*compile-file-truename*)
                                     :direction :input
                                     :element-type 'character)
                      (read stream))))
               `(progn
                  (sb!impl::defglobal **proplist-properties** ',proplist-dump)
                  (sb!impl::defglobal **confusables** ',confusable-sets)
                  (sb!impl::defglobal **bidi-mirroring-glyphs** ',bidi-mirroring-list)
                  (defun !unicode-properties-cold-init ()
                    ;; The property dump is walked two elements at a time:
                    ;; a key followed by its value.
                    (let ((hash (make-hash-table)) (list ',proplist-dump))
                      (do ((k (car list) (car list)) (v (cadr list) (cadr list)))
                          ((not list) hash)
                        (setf (gethash k hash) v)
                        (setf list (cddr list)))
                      (setf **proplist-properties** hash))
                    ;; Each confusable set is a list of code point sequences.
                    ;; Every sequence is turned into a string, and each string
                    ;; of a set is mapped to the first string of that set.
                    (let ((hash (make-hash-table :test #'equal)))
                      (loop for set in ',confusable-sets
                         for items = (mapcar #'(lambda (item)
                                                 (map 'simple-string
                                                      #'code-char item))
                                             #!+sb-unicode set
                                             ;; Without sb-unicode, drop the
                                             ;; sequences containing a code at
                                             ;; or above CHAR-CODE-LIMIT.
                                             #!-sb-unicode
                                             (remove-if-not
                                              #'(lambda (item)
                                                  (every
                                                   #'(lambda (x)
                                                       (< x sb!xc:char-code-limit))
                                                   item)) set))
                         do (dolist (i items)
                              (setf (gethash i hash) (first items))))
                      (setf **confusables** hash))
                    ;; The mirroring data is a list of two-element
                    ;; (key value) lists.
                    (let ((hash (make-hash-table)) (list ',bidi-mirroring-list))
                      (loop for (k v) in list do
                           (setf (gethash k hash) v))
                      (setf **bidi-mirroring-glyphs** hash)))))))
  (unicode-property-init))
 109
 110 ;;; Unicode property access
 111 (defun ordered-ranges-member (item vector)
 112   (declare (type simple-vector vector)
 113            (type fixnum item)
 114            (optimize speed))
 115   (labels ((recurse (start end)
 116              (declare (type index start end))
 117              (when (< start end)
 118                (let* ((i (+ start (truncate (- end start) 2)))
 119                       (index (* 2 i))
 120                       (elt1 (svref vector index))
 121                       (elt2 (svref vector (1+ index))))
 122                  (declare (type index i)
 123                           (fixnum elt1 elt2))
 124                  (cond ((< item elt1)
 125                         (recurse start i))
 126                        ((> item elt2)
 127                         (recurse (+ 1 i) end))
 128                        (t
 129                         item))))))
 130     (recurse 0 (truncate (length vector) 2))))
 131
 132 ;; Returns which range `item` was found in or NIL
 133 ;; First range = 0, second range = 1 ...
 134 (defun ordered-ranges-position (item vector)
 135   (declare (type (simple-array (unsigned-byte 32) (*)) vector)
 136            (type fixnum item))
 137   (labels ((recurse (start end)
 138              (declare (type index start end))
 139              (when (< start end)
 140                (let* ((i (+ start (truncate (- end start) 2)))
 141                       (index (* 2 i))
 142                       (elt1 (aref vector index))
 143                       (elt2 (aref vector (1+ index))))
 144                  (declare (type index i))
 145                  (cond ((< item elt1)
 146                         (recurse start i))
 147                        ((> item elt2)
 148                         (recurse (+ 1 i) end))
 149                        (t
 150                         i))))))
 151     (recurse 0 (truncate (length vector) 2))))
 152
 153 (defun proplist-p (character property)
 154   #!+sb-doc
 155   "Returns T if CHARACTER has the specified PROPERTY.
 156 PROPERTY is a keyword representing one of the properties from PropList.txt,
 157 with underscores replaced by dashes."
 158   (ordered-ranges-member (char-code character)
 159                          (gethash property **proplist-properties**)))
 160
;; WARNING: These have to be manually kept in sync with the values in ucd.lisp
;; Each of the vectors below maps a small integer index to a keyword.
(declaim (type simple-vector *general-categories* *bidi-classes* *east-asian-widths*
               *scripts* *line-break-classes* *blocks*))
;; General category keywords; GENERAL-CATEGORY indexes this vector with the
;; result of SB!IMPL::UCD-GENERAL-CATEGORY.
(sb!impl::defglobal *general-categories*
  #(:Lu :Ll :Lt :Lm :Lo :Cc :Cf :Co :Cs :Cn :Mc :Me :Mn :Nd
    :Nl :No :Pc :Pd :Pe :Pf :Pi :Po :Ps :Sc :Sk :Sm :So :Zl
    :Zp :Zs))
 168
;; Bidirectional class keywords; BIDI-CLASS indexes this vector with the
;; misc-database entry at offset 1 from the character's misc index.
(sb!impl::defglobal *bidi-classes*
  #(:BN :AL :AN :B :CS :EN :ES :ET :L :LRE :LRO :NSM :ON
    :PDF :R :RLE :RLO :S :WS :LRI :RLI :FSI :PDI))
 172
;; East Asian Width keywords; EAST-ASIAN-WIDTH indexes this vector with
;; the low three bits of the misc-database entry at offset 5.
(sb!impl::defglobal *east-asian-widths*
  #(:N :A :H :W :F :Na))
 175
;; Script keywords; SCRIPT indexes this vector with the misc-database
;; entry at offset 6 from the character's misc index.
(sb!impl::defglobal *scripts*
  #(:Unknown :Common :Latin :Greek :Cyrillic :Armenian :Hebrew :Arabic :Syriac
    :Thaana :Devanagari :Bengali :Gurmukhi :Gujarati :Oriya :Tamil :Telugu
    :Kannada :Malayalam :Sinhala :Thai :Lao :Tibetan :Myanmar :Georgian :Hangul
    :Ethiopic :Cherokee :Canadian-Aboriginal :Ogham :Runic :Khmer :Mongolian
    :Hiragana :Katakana :Bopomofo :Han :Yi :Old-Italic :Gothic :Deseret
    :Inherited :Tagalog :Hanunoo :Buhid :Tagbanwa :Limbu :Tai-Le :Linear-B
    :Ugaritic :Shavian :Osmanya :Cypriot :Braille :Buginese :Coptic :New-Tai-Lue
    :Glagolitic :Tifinagh :Syloti-Nagri :Old-Persian :Kharoshthi :Balinese
    :Cuneiform :Phoenician :Phags-Pa :Nko :Sundanese :Lepcha :Ol-Chiki :Vai
    :Saurashtra :Kayah-Li :Rejang :Lycian :Carian :Lydian :Cham :Tai-Tham
    :Tai-Viet :Avestan :Egyptian-Hieroglyphs :Samaritan :Lisu :Bamum :Javanese
    :Meetei-Mayek :Imperial-Aramaic :Old-South-Arabian :Inscriptional-Parthian
    :Inscriptional-Pahlavi :Old-Turkic :Kaithi :Batak :Brahmi :Mandaic :Chakma
    :Meroitic-Cursive :Meroitic-Hieroglyphs :Miao :Sharada :Sora-Sompeng
    :Takri :Bassa-Vah :Mahajani :Pahawh-Hmong :Caucasian-Albanian :Manichaean
    :Palmyrene :Duployan :Mende-Kikakui :Pau-Cin-Hau :Elbasan :Modi
    :Psalter-Pahlavi :Grantha :Mro :Siddham :Khojki :Nabataean :Tirhuta
    :Khudawadi :Old-North-Arabian :Warang-Citi :Linear-A :Old-Permic))
 195
;; Line break class keywords; LINE-BREAK-CLASS indexes this vector with the
;; misc-database entry at offset 7 from the character's misc index.
(sb!impl::defglobal *line-break-classes*
    #(:XX :AI :AL :B2 :BA :BB :BK :CB :CJ :CL :CM :CP :CR :EX :GL
      :HL :HY :ID :IN :IS :LF :NL :NS :NU :OP :PO :PR :QU :RI :SA
      :SG :SP :SY :WJ :ZW))
 200
;; Block name keywords.  CHAR-BLOCK indexes this vector with the range
;; number that ORDERED-RANGES-POSITION finds in **BLOCK-RANGES**, so the
;; order here must correspond to the order of the ranges in that vector.
(sb!impl::defglobal *blocks*
  #(:Basic-Latin :Latin-1-Supplement :Latin-Extended-A :Latin-Extended-B
    :IPA-Extensions :Spacing-Modifier-Letters :Combining-Diacritical-Marks
    :Greek-and-Coptic :Cyrillic :Cyrillic-Supplement :Armenian :Hebrew :Arabic
    :Syriac :Arabic-Supplement :Thaana :NKo :Samaritan :Mandaic
    :Arabic-Extended-A :Devanagari :Bengali :Gurmukhi :Gujarati :Oriya :Tamil
    :Telugu :Kannada :Malayalam :Sinhala :Thai :Lao :Tibetan :Myanmar :Georgian
    :Hangul-Jamo :Ethiopic :Ethiopic-Supplement :Cherokee
    :Unified-Canadian-Aboriginal-Syllabics :Ogham :Runic :Tagalog :Hanunoo
    :Buhid :Tagbanwa :Khmer :Mongolian
    :Unified-Canadian-Aboriginal-Syllabics-Extended :Limbu :Tai-Le :New-Tai-Lue
    :Khmer-Symbols :Buginese :Tai-Tham :Combining-Diacritical-Marks-Extended
    :Balinese :Sundanese :Batak :Lepcha :Ol-Chiki :Sundanese-Supplement
    :Vedic-Extensions :Phonetic-Extensions :Phonetic-Extensions-Supplement
    :Combining-Diacritical-Marks-Supplement :Latin-Extended-Additional
    :Greek-Extended :General-Punctuation :Superscripts-and-Subscripts
    :Currency-Symbols :Combining-Diacritical-Marks-for-Symbols
    :Letterlike-Symbols :Number-Forms :Arrows :Mathematical-Operators
    :Miscellaneous-Technical :Control-Pictures :Optical-Character-Recognition
    :Enclosed-Alphanumerics :Box-Drawing :Block-Elements :Geometric-Shapes
    :Miscellaneous-Symbols :Dingbats :Miscellaneous-Mathematical-Symbols-A
    :Supplemental-Arrows-A :Braille-Patterns :Supplemental-Arrows-B
    :Miscellaneous-Mathematical-Symbols-B :Supplemental-Mathematical-Operators
    :Miscellaneous-Symbols-and-Arrows :Glagolitic :Latin-Extended-C :Coptic
    :Georgian-Supplement :Tifinagh :Ethiopic-Extended :Cyrillic-Extended-A
    :Supplemental-Punctuation :CJK-Radicals-Supplement :Kangxi-Radicals
    :Ideographic-Description-Characters :CJK-Symbols-and-Punctuation :Hiragana
    :Katakana :Bopomofo :Hangul-Compatibility-Jamo :Kanbun :Bopomofo-Extended
    :CJK-Strokes :Katakana-Phonetic-Extensions :Enclosed-CJK-Letters-and-Months
    :CJK-Compatibility :CJK-Unified-Ideographs-Extension-A
    :Yijing-Hexagram-Symbols :CJK-Unified-Ideographs :Yi-Syllables :Yi-Radicals
    :Lisu :Vai :Cyrillic-Extended-B :Bamum :Modifier-Tone-Letters
    :Latin-Extended-D :Syloti-Nagri :Common-Indic-Number-Forms :Phags-pa
    :Saurashtra :Devanagari-Extended :Kayah-Li :Rejang :Hangul-Jamo-Extended-A
    :Javanese :Myanmar-Extended-B :Cham :Myanmar-Extended-A :Tai-Viet
    :Meetei-Mayek-Extensions :Ethiopic-Extended-A :Latin-Extended-E
    :Meetei-Mayek :Hangul-Syllables :Hangul-Jamo-Extended-B :High-Surrogates
    :High-Private-Use-Surrogates :Low-Surrogates :Private-Use-Area
    :CJK-Compatibility-Ideographs :Alphabetic-Presentation-Forms
    :Arabic-Presentation-Forms-A :Variation-Selectors :Vertical-Forms
    :Combining-Half-Marks :CJK-Compatibility-Forms :Small-Form-Variants
    :Arabic-Presentation-Forms-B :Halfwidth-and-Fullwidth-Forms :Specials
    :Linear-B-Syllabary :Linear-B-Ideograms :Aegean-Numbers
    :Ancient-Greek-Numbers :Ancient-Symbols :Phaistos-Disc :Lycian :Carian
    :Coptic-Epact-Numbers :Old-Italic :Gothic :Old-Permic :Ugaritic :Old-Persian
    :Deseret :Shavian :Osmanya :Elbasan :Caucasian-Albanian :Linear-A
    :Cypriot-Syllabary :Imperial-Aramaic :Palmyrene :Nabataean :Phoenician
    :Lydian :Meroitic-Hieroglyphs :Meroitic-Cursive :Kharoshthi
    :Old-South-Arabian :Old-North-Arabian :Manichaean :Avestan
    :Inscriptional-Parthian :Inscriptional-Pahlavi :Psalter-Pahlavi :Old-Turkic
    :Rumi-Numeral-Symbols :Brahmi :Kaithi :Sora-Sompeng :Chakma :Mahajani
    :Sharada :Sinhala-Archaic-Numbers :Khojki :Khudawadi :Grantha :Tirhuta
    :Siddham :Modi :Takri :Warang-Citi :Pau-Cin-Hau :Cuneiform
    :Cuneiform-Numbers-and-Punctuation :Egyptian-Hieroglyphs :Bamum-Supplement
    :Mro :Bassa-Vah :Pahawh-Hmong :Miao :Kana-Supplement :Duployan
    :Shorthand-Format-Controls :Byzantine-Musical-Symbols :Musical-Symbols
    :Ancient-Greek-Musical-Notation :Tai-Xuan-Jing-Symbols
    :Counting-Rod-Numerals :Mathematical-Alphanumeric-Symbols :Mende-Kikakui
    :Arabic-Mathematical-Alphabetic-Symbols :Mahjong-Tiles :Domino-Tiles
    :Playing-Cards :Enclosed-Alphanumeric-Supplement
    :Enclosed-Ideographic-Supplement :Miscellaneous-Symbols-and-Pictographs
    :Emoticons :Ornamental-Dingbats :Transport-and-Map-Symbols
    :Alchemical-Symbols :Geometric-Shapes-Extended :Supplemental-Arrows-C
    :CJK-Unified-Ideographs-Extension-B :CJK-Unified-Ideographs-Extension-C
    :CJK-Unified-Ideographs-Extension-D :CJK-Compatibility-Ideographs-Supplement
    :Tags :Variation-Selectors-Supplement :Supplementary-Private-Use-Area-A
    :Supplementary-Private-Use-Area-B))
 268
 269 (declaim (inline svref-or-null))
 270 (defun svref-or-null (vector index)
 271   (and (< index (length vector))
 272        (svref vector index)))
 273
 274 (defun general-category (character)
 275   #!+sb-doc
 276   "Returns the general category of CHARACTER as it appears in UnicodeData.txt"
 277   (svref-or-null *general-categories* (sb!impl::ucd-general-category character)))
 278
 279 (defun bidi-class (character)
 280   #!+sb-doc
 281   "Returns the bidirectional class of CHARACTER"
 282   (if (and (eql (general-category character) :Cn)
 283            (default-ignorable-p character))
 284       :Bn
 285       (svref-or-null
 286        *bidi-classes*
 287        (aref **character-misc-database** (1+ (misc-index character))))))
 288
(defun combining-class (character)
  #!+sb-doc
  "Returns the canonical combining class (CCC) of CHARACTER"
  ;; The value is stored directly in the misc database, at offset 2 from
  ;; the character's misc index.
  (aref **character-misc-database** (+ 2 (misc-index character))))
 293
(defun decimal-value (character)
  #!+sb-doc
  "Returns the decimal digit value associated with CHARACTER or NIL if
there is no such value.

The only characters in Unicode with a decimal digit value are those
that are part of a range of characters that encode the digits 0-9.
Because of this, `(decimal-value c) <=> (digit-char-p c 10)` in
#+sb-unicode builds"
  ;; Delegates entirely to SB!IMPL::UCD-DECIMAL-DIGIT.
  (sb!impl::ucd-decimal-digit character))
 304
 305 (defun digit-value (character)
 306   #!+sb-doc
 307   "Returns the Unicode digit value of CHARACTER or NIL if it doesn't exist.
 308
 309 Digit values are guaranteed to be integers between 0 and 9 inclusive.
 310 All characters with decimal digit values have the same digit value,
 311 but there are characters (such as digits of number systems without a 0 value)
 312 that have a digit value but no decimal digit value"
 313   (let ((%digit (clear-flag 6
 314                             (aref **character-misc-database**
 315                                   (+ 3 (misc-index character))))))
 316     (if (< %digit 10) %digit nil)))
 317
 318 (defun numeric-value (character)
 319   #!+sb-doc
 320   "Returns the numeric value of CHARACTER or NIL if there is no such value.
 321 Numeric value is the most general of the Unicode numeric properties.
 322 The only constraint on the numeric value is that it be a rational number."
 323   (or (double-vector-binary-search (char-code character)
 324                                    **special-numerics**)
 325       (digit-value character)))
 326
(defun mirrored-p (character)
  #!+sb-doc
  "Returns T if CHARACTER needs to be mirrored in bidirectional text.
Otherwise, returns NIL."
  ;; Tests bit 5 of the misc-database entry at offset 5 from the
  ;; character's misc index.
  (logbitp 5 (aref **character-misc-database**
                    (+ 5 (misc-index character)))))
 333
 334 (defun bidi-mirroring-glyph (character)
 335   #!+sb-doc
 336   "Returns the mirror image of CHARACTER if it exists.
 337 Otherwise, returns NIL."
 338   (when (mirrored-p character)
 339     (let ((ret (gethash (char-code character) **bidi-mirroring-glyphs**)))
 340       (when ret (code-char ret)))))
 341
(defun east-asian-width (character)
  #!+sb-doc
  "Returns the East Asian Width property of CHARACTER as
one of the keywords :N (Neutral), :A (Ambiguous), :H (Halfwidth),
:W (Wide), :F (Fullwidth), or :NA (Narrow)"
  ;; The width index is the low three bits of the misc-database entry at
  ;; offset 5 from the character's misc index.
  (svref-or-null *east-asian-widths*
                 (ldb (byte 3 0)
                      (aref **character-misc-database**
                            (+ 5 (misc-index character))))))
 351
(defun script (character)
  #!+sb-doc
  "Returns the Script property of CHARACTER as a keyword.
If CHARACTER does not have a known script, returns :UNKNOWN"
  ;; The script index is the misc-database entry at offset 6 from the
  ;; character's misc index.
  (svref-or-null *scripts*
                 (aref **character-misc-database** (+ 6 (misc-index character)))))
 358
 359 (defun char-block (character)
 360   #!+sb-doc
 361   "Returns the Unicode block in which CHARACTER resides as a keyword.
 362 If CHARACTER does not have a known block, returns :NO-BLOCK"
 363   (let* ((code (char-code character))
 364          (block-index (ordered-ranges-position code **block-ranges**)))
 365     (if block-index
 366         (aref *blocks* block-index) :no-block)))
 367
 368 (defun unicode-1-name (character)
 369   #!+sb-doc
 370   "Returns the name assigned to CHARACTER in Unicode 1.0 if it is distinct
 371 from the name currently assigned to CHARACTER. Otherwise, returns NIL.
 372 This property has been officially obsoleted by the Unicode standard, and
 373 is only included for backwards compatibility."
 374   (let* ((char-code (char-code character))
 375          (h-code (double-vector-binary-search char-code
 376                                               **unicode-1-char-name-database**)))
 377     (when h-code
 378       (huffman-decode h-code **unicode-character-name-huffman-tree**))))
 379
 380 (defun age (character)
 381   #!+sb-doc
 382   "Returns the version of Unicode in which CHARACTER was assigned as a pair
 383 of values, both integers, representing the major and minor version respectively.
 384 If CHARACTER is not assigned in Unicode, returns NIL for both values."
 385   (let* ((value (aref **character-misc-database** (+ 8 (misc-index character))))
 386          (major (ash value -3))
 387          (minor (ldb (byte 3 0) value)))
 388     (if (zerop value) (values nil nil) (values major minor))))
 389
 390 (defun hangul-syllable-type (character)
 391   #!+sb-doc
 392   "Returns the Hangul syllable type of CHARACTER.
 393 The syllable type can be one of :L, :V, :T, :LV, or :LVT.
 394 If the character is not a Hangul syllable or Jamo, returns NIL"
 395   (let ((cp (char-code character)))
 396     (cond
 397       ((or
 398         (and (<= #x1100 cp) (<= cp #x115f))
 399         (and (<= #xa960 cp) (<= cp #xa97c))) :L)
 400       ((or
 401         (and (<= #x1160 cp) (<= cp #x11a7))
 402         (and (<= #xd7B0 cp) (<= cp #xd7C6))) :V)
 403       ((or
 404         (and (<= #x11a8 cp) (<= cp #x11ff))
 405         (and (<= #xd7c8 cp) (<= cp #xd7fb))) :T)
 406       ((and (<= #xac00 cp) (<= cp #xd7a3))
 407        (if (= 0 (rem (- cp #xac00) 28)) :LV :LVT)))))
 408
 409 (defun line-break-class (character &key resolve)
 410   #!+sb-doc
 411   "Returns the line breaking class of CHARACTER, as specified in UAX #14.
 412 If :RESOLVE is NIL, returns the character class found in the property file.
 413 If :RESOLVE is non-NIL, centain line-breaking classes will be mapped to othec
 414 classes as specified in the applicable standards. Addinionally, if :RESOLVE
 415 is :EAST-ASIAN, Ambigious (class :AI) characters will be mapped to the
 416 Ideographic (:ID) class instead of Alphabetic (:AL)."
 417   (when (and resolve (listp character)) (setf character (car character)))
 418   (when (and resolve (not character)) (return-from line-break-class :nil))
 419   (let ((raw-class
 420          (svref-or-null *line-break-classes*
 421                         (aref **character-misc-database** (+ 7 (misc-index character)))))
 422         (syllable-type (hangul-syllable-type character)))
 423     (when syllable-type
 424       (setf raw-class
 425             (cdr (assoc syllable-type
 426                         '((:l . :JL) (:v . :JV) (:t . :JT)
 427                           (:lv . :H2) (:lvt . :H3))))))
 428     (when resolve
 429       (setf raw-class
 430             (case raw-class
 431               (:ai (if (eql resolve :east-asion) :ID :AL))
 432               ; If we see :CM when resolving, we have a CM that isn't subject
 433               ; to LB9, so we do LB10
 434               ((:xx :cm) :al)
 435               (:sa (if (member (general-category character) '(:Mn :Mc))
 436                        :CM :AL))
 437               (:cj :ns)
 438               (:sg (error "The character ~S is a surrogate, which should not
 439 appear in an SBCL string. The line-breaking behavior of surrogates is undefined."
 440                           character))
 441               (t raw-class))))
 442     raw-class))
 443
 444 (defun uppercase-p (character)
 445   #!+sb-doc
 446   "Returns T if CHARACTER has the Unicode property Uppercase and NIL otherwise"
 447   (or (eql (general-category character) :Lu) (proplist-p character :other-uppercase)))
 448
 449 (defun lowercase-p (character)
 450   #!+sb-doc
 451   "Returns T if CHARACTER has the Unicode property Lowercase and NIL otherwise"
 452   (or (eql (general-category character) :Ll) (proplist-p character :other-lowercase)))
 453
 454 (defun cased-p (character)
 455   #!+sb-doc
 456   "Returns T if CHARACTER has a (Unicode) case, and NIL otherwise"
 457   (or (uppercase-p character) (lowercase-p character)
 458       (eql (general-category character) :Lt)))
 459
 460 (defun case-ignorable-p (character)
 461   #!+sb-doc
 462   "Returns T if CHARACTER is Case Ignorable as defined in Unicode 6.3, Chapter
 463 3"
 464   (or (member (general-category character)
 465               '(:Mn :Me :Cf :Lm :Sk))
 466       (member (word-break-class character)
 467               '(:midletter :midnumlet :single-quote))))
 468
 469 (defun alphabetic-p (character)
 470   #!+sb-doc
 471   "Returns T if CHARACTER is Alphabetic according to the Unicode standard
 472 and NIL otherwise"
 473   (or (member (general-category character) '(:Lu :Ll :Lt :Lm :Lo :Nl))
 474       (proplist-p character :other-alphabetic)))
 475
(defun ideographic-p (character)
  #!+sb-doc
  "Returns T if CHARACTER has the Unicode property Ideographic,
which loosely corresponds to the set of \"Chinese characters\""
  ;; Plain lookup of the :IDEOGRAPHIC entry of the property table.
  (proplist-p character :ideographic))
 481
 482 (defun math-p (character)
 483   #!+sb-doc
 484   "Returns T if CHARACTER is a mathematical symbol according to Unicode and
 485 NIL otherwise"
 486   (or (eql (general-category character) :sm) (proplist-p character :other-math)))
 487
(defun whitespace-p (character)
  #!+sb-doc
  "Returns T if CHARACTER is whitespace according to Unicode
and NIL otherwise"
  ;; Plain lookup of the :WHITE-SPACE entry of the property table.
  (proplist-p character :white-space))
 493
 494 (defun hex-digit-p (character &key ascii)
 495   #!+sb-doc
 496   "Returns T if CHARACTER is a hexadecimal digit and NIL otherwise.
 497 If :ASCII is non-NIL, fullwidth equivalents of the Latin letters A through F
 498 are excluded."
 499   (proplist-p character (if ascii :ascii-hex-digit :hex-digit)))
 500
(defun soft-dotted-p (character)
  #!+sb-doc
  "Returns T if CHARACTER has a soft dot (such as the dots on i and j) which
disappears when accents are placed on top of it, and NIL otherwise"
  ;; Plain lookup of the :SOFT-DOTTED entry of the property table.
  (proplist-p character :soft-dotted))
 506
 507 (defun default-ignorable-p (character)
 508   #!+sb-doc
 509   "Returns T if CHARACTER is a Default_Ignorable_Code_Point"
 510   (and
 511    (or (proplist-p character :other-default-ignorable-code-point)
 512        (eql (general-category character) :cf)
 513        (proplist-p character :variation-selector))
 514    (not
 515     (or (whitespace-p character)
 516         (ordered-ranges-member
 517          (char-code character)
 518          #(#x0600 #x0604 #x06DD #x06DD #x070F #x070F #xFFF9 #xFFFB
 519            #x110BD #x110BD))))))
 520
 521 \f
 522 ;;; Implements UAX#15: Normalization Forms
 523 (defun char-decomposition-info (char)
 524   (let ((value (aref **character-misc-database**
 525                      (+ 4 (misc-index char)))))
 526     (values (clear-flag 7 value) (logbitp 7 value))))
 527
 528 (defun char-decomposition (char length callback)
 529   (declare (function callback))
 530   ;; Caller should have gotten length from char-decomposition-info
 531   (let* ((cp (char-code char))
 532          (cp-high (ash cp -8))
 533          (decompositions **character-decompositions**)
 534          (high-page (aref **character-high-pages** cp-high))
 535          (index (unless (logbitp 15 high-page) ;; Hangul syllable
 536                   (aref **character-low-pages**
 537                         (+ 1 (* 2 (+ (ldb (byte 8 0) cp) (ash high-page 8))))))))
 538     (cond ((= length 1)
 539            (funcall callback (code-char (aref decompositions index))))
 540           ((<= #xac00 cp #xd7a3)
 541            ;; see Unicode 6.2, section 3-12
 542            (let* ((sbase #xac00)
 543                   (lbase #x1100)
 544                   (vbase #x1161)
 545                   (tbase #x11a7)
 546                   (vcount 21)
 547                   (tcount 28)
 548                   (ncount (* vcount tcount))
 549                   (sindex (- cp sbase))
 550                   (lindex (floor sindex ncount))
 551                   (vindex (floor (mod sindex ncount) tcount))
 552                   (tindex (mod sindex tcount)))
 553              (funcall callback (code-char (+ lbase lindex)))
 554              (funcall callback (code-char (+ vbase vindex)))
 555              (when (> tindex 0)
 556                (funcall callback  (code-char (+ tbase tindex))))))
 557
 558           (t
 559            (loop for i below length
 560                  do
 561                  (funcall callback (code-char (aref decompositions (+ index i)))))))))
 562
 563 (defun decompose-char (char compatibility callback)
 564   (declare (function callback))
 565   (multiple-value-bind (info compat) (char-decomposition-info char)
 566     (if (and (plusp info)
 567              (or compatibility
 568                  (not compat)))
 569         (if compatibility
 570             (dx-flet ((callback (char)
 571                         (decompose-char char t callback)))
 572               (char-decomposition char info #'callback))
 573             (char-decomposition char info callback))
 574         (funcall callback char))))
 575
;;; Returns a fresh string holding the decomposition of STRING.  KIND must
;;; be :CANONICAL (the default) or :COMPATIBILITY; anything else signals
;;; an error via ECASE.
(defun decompose-string (string &optional (kind :canonical))
  (let ((compatibility (ecase kind
                         (:compatibility t)
                         (:canonical nil))))
    ;; CHARS accumulates the output in reverse order, so its head is the
    ;; most recently emitted character.  PREVIOUS-CHAR is assigned below
    ;; but never read within this function.
    (let (chars
          (length 0)
          previous-char
          (previous-combining-class 0))
      (dx-flet ((callback (char)
                  (let ((combining-class (combining-class char)))
                    (incf length)
                    ;; A non-zero combining class lower than that of the
                    ;; list head is out of order: walk down the reversed
                    ;; list and splice CHAR in after the first cons whose
                    ;; successor is missing or has a combining class no
                    ;; greater than CHAR's.
                    (cond ((< 0 combining-class previous-combining-class)
                           ;; Ensure it's sorted
                           (loop for cons on chars
                                 for next-char = (cadr cons)
                                 when (or (not next-char)
                                          (<= 0 (combining-class next-char) combining-class))
                                 do (setf (cdr cons)
                                          (cons char (cdr cons)))
                                    (return)))
                          (t
                           (push char chars)
                           (setf previous-char char
                                 previous-combining-class combining-class))))))
        (loop for char across string
              do
              (decompose-char char compatibility #'callback))
        ;; Reverse the accumulated list and copy it into the result string.
        (let ((result (make-string length)))
          (loop for char in (nreverse chars)
                for i from 0
                do (setf (schar result i) char))
          result)))))
 608
 609 (defun composition-hangul-syllable-type (cp)
 610   (cond
 611     ((and (<= #x1100 cp) (<= cp #x1112)) :L)
 612     ((and (<= #x1161 cp) (<= cp #x1175)) :V)
 613     ((and (<= #x11a8 cp) (<= cp #x11c2)) :T)
 614     ((and (<= #xac00 cp) (<= cp #.(+ #xac00 11171)))
 615      (if (= 0 (rem (- cp #xac00) 28)) :LV :LVT))))
 616
 617 (defun primary-composition (char1 char2)
 618   (flet ((maybe (fn x) (when x (funcall fn x))))
 619     (let ((c1 (char-code char1))
 620           (c2 (char-code char2)))
 621       (maybe
 622        #'code-char
 623        (cond
 624          ((gethash (dpb c1 (byte 21 21) c2)
 625                    **character-primary-compositions**))
 626          ((and (eql (composition-hangul-syllable-type c1) :L)
 627                (eql (composition-hangul-syllable-type c2) :V))
 628           (let ((lindex (- c1 #x1100))
 629                 (vindex (- c2 #x1161)))
 630             (+ #xac00 (* lindex 588) (* vindex 28))))
 631          ((and (eql (composition-hangul-syllable-type c1) :LV)
 632                (eql (composition-hangul-syllable-type c2) :T))
 633           (+ c1 (- c2 #x11a7))))))))
 634
 635 ;;; This implements a sequence data structure, specialized for
 636 ;;; efficient deletion of characters at an index, along with tolerable
 637 ;;; random access.  The purpose is to support the canonical
 638 ;;; composition algorithm from Unicode, which involves replacing (not
 639 ;;; necessarily consecutive) pairs of code points with a single code
 640 ;;; point (e.g. [#\e #\combining_acute_accent] with
;;; #\latin_small_letter_e_with_acute).  The data structure is a list
;;; of three-element lists (START END DATA), each denoting a chunk of
;;; string data: DATA holds the characters for the logical indices
;;; from START (inclusive) up to END (exclusive).
 644 ;;;
 645 ;;; Actually, the implementation isn't particularly efficient, and
 646 ;;; would probably benefit from being rewritten in terms of displaced
 647 ;;; arrays, which would substantially reduce copying.
 648 ;;;
 649 ;;; (also, generic sequences.  *sigh*.)
 650 (defun lref (lstring index)
 651   (dolist (l lstring)
 652     (when (and (<= (first l) index)
 653                (< index (second l)))
 654       (return (aref (third l) (- index (first l)))))))
 655
 656 (defun (setf lref) (newchar lstring index)
 657   (dolist (l lstring)
 658     (when (and (<= (first l) index)
 659                (< index (second l)))
 660       (return (setf (aref (third l) (- index (first l))) newchar)))))
 661
 662 (defun llength (lstring)
 663   (second (first (last lstring))))
 664
 665 (defun lstring (lstring)
 666   (let ((result (make-string (llength lstring))))
 667     (dolist (l lstring result)
 668       (replace result (third l) :start1 (first l) :end1 (second l)))))
 669
 670 (defun ldelete (lstring index)
 671   (do* ((ls lstring (cdr ls))
 672         (l (car ls) (car ls))
 673         so-fars)
 674        ((and (<= (first l) index)
 675              (< index (second l)))
 676         (append
 677          (nreverse so-fars)
 678          (cond
 679            ((= (first l) index)
 680             (list (list (first l) (1- (second l)) (subseq (third l) 1))))
 681            ((= index (1- (second l)))
 682             (list (list (first l) (1- (second l)) (subseq (third l) 0 (1- (length (third l)))))))
 683            (t
 684             (list
 685              (list (first l) index
 686                    (subseq (third l) 0 (- index (first l))))
 687              (list index (1- (second l))
 688                    (subseq (third l) (1+ (- index (first l))))))))
 689          (mapcar (lambda (x) (list (1- (first x)) (1- (second x)) (third x)))
 690                  (cdr ls))))
 691     (push l so-fars)))
 692
 693 (defun canonically-compose (string)
 694   (let* ((result (list (list 0 (length string) string)))
 695          (previous-starter-index (position 0 string :key #'combining-class))
 696          (i (and previous-starter-index (1+ previous-starter-index))))
 697     (when (or (not i) (= i (length string)))
 698       (return-from canonically-compose string))
 699     (tagbody
 700      again
 701        (when (and (>= (- i previous-starter-index) 2)
 702                   ;; test for Blocked (Unicode 3.11 para. D115)
 703                   ;;
 704                   ;; (assumes here that string has sorted combiners,
 705                   ;; so can look back just one step)
 706                   (>= (combining-class (lref result (1- i)))
 707                       (combining-class (lref result i))))
 708          (when (= (combining-class (lref result i)) 0)
 709            (setf previous-starter-index i))
 710          (incf i)
 711          (go next))
 712
 713        (let ((comp (primary-composition (lref result previous-starter-index)
 714                                         (lref result i))))
 715          (cond
 716            (comp
 717             (setf (lref result previous-starter-index) comp)
 718             (setf result (ldelete result i)))
 719            (t
 720             (when (= (combining-class (lref result i)) 0)
 721               (setf previous-starter-index i))
 722             (incf i))))
 723      next
 724        (unless (= i (llength result))
 725          (go again)))
 726     (if (= i (length string))
 727         string
 728         (lstring result))))
 729
(defun normalize-string (string &optional (form :nfd))
  #!+sb-doc
  "Normalize STRING to the Unicode normalization form FORM.
   Acceptable values for FORM are :NFD, :NFC, :NFKD, and :NFKC."
  (declare (type (member :nfd :nfkd :nfc :nfkc) form))
  ;; Without Unicode support the composed forms return STRING as is,
  ;; while the decomposed forms signal an error.
  #!-sb-unicode
  (etypecase string
    ((array nil (*)) string)
    (string
     (ecase form
       ((:nfc :nfkc) string)
       ((:nfd :nfkd) (error "Cannot normalize to ~A form in #-SB-UNICODE builds" form)))))
  ;; With Unicode support, base strings and (ARRAY NIL (*)) are
  ;; returned unchanged.  Character strings are decomposed (with the
  ;; :COMPATIBILITY flag for the K forms) and, for the C forms,
  ;; recomposed afterwards.
  #!+sb-unicode
  (etypecase string
    (base-string string)
    ((array character (*))
     (ecase form
       ((:nfc)
        (canonically-compose (decompose-string string)))
       ((:nfd)
        (decompose-string string))
       ((:nfkc)
        (canonically-compose (decompose-string string :compatibility)))
       ((:nfkd)
        (decompose-string string :compatibility))))
    ((array nil (*)) string)))
 756
 757 (defun normalized-p (string &optional (form :nfd))
 758   #!+sb-doc
 759   "Tests if STRING is normalized to FORM"
 760   ;; FIXME: can be optimized
 761   (string= string (normalize-string string form)))
 762
 763 \f
 764 ;;; Unicode case algorithms
 765 ;; FIXME: Make these parts less redundant (macro?)
;;; Title-casing exceptions, READ at read time (#.) from the file
;;; "titlecases.lisp-expr" in the "output" directory two levels above
;;; the file being compiled, and embedded as a quoted constant.
;;; CHAR-TITLECASE looks entries up with ASSOC on the code point and
;;; treats the CDR as either one code point or a list of code points.
(defparameter **special-titlecases**
  '#.(with-open-file (stream
                     (merge-pathnames
                      (make-pathname
                       :directory
                       '(:relative :up :up "output")
                       :name "titlecases" :type "lisp-expr")
                      sb!xc:*compile-file-truename*)
                     :direction :input
                     :element-type 'character)
        (read stream)))
 777
;;; Case-folding exceptions, READ at read time (#.) from the file
;;; "foldcases.lisp-expr" in the "output" directory two levels above
;;; the file being compiled, and embedded as a quoted constant.
;;; CHAR-FOLDCASE looks entries up with ASSOC on the code point and
;;; treats the CDR as either one code point or a list of code points.
(defparameter **special-casefolds**
  '#.(with-open-file (stream
                     (merge-pathnames
                      (make-pathname
                       :directory
                       '(:relative :up :up "output")
                       :name "foldcases" :type "lisp-expr")
                      sb!xc:*compile-file-truename*)
                     :direction :input
                     :element-type 'character)
        (read stream)))
 789
 790 (defun has-case-p (char)
 791   ;; Bit 6 is the Unicode case flag, as opposed to the Common Lisp one
 792   (logbitp 6 (aref **character-misc-database** (+ 5 (misc-index char)))))
 793
 794 (defun char-uppercase (char)
 795   (if (has-case-p char)
 796       (let ((cp (car (char-case-info char))))
 797         (if (atom cp) (list (code-char cp)) (mapcar #'code-char cp)))
 798       (list char)))
 799
 800 (defun char-lowercase (char)
 801   (if (has-case-p char)
 802       (let ((cp (cdr (char-case-info char))))
 803         (if (atom cp) (list (code-char cp)) (mapcar #'code-char cp)))
 804       (list char)))
 805
 806 (defun char-titlecase (char)
 807   (unless (has-case-p char) (return-from char-titlecase (list char)))
 808   (let* ((cp (char-code char))
 809          (value (assoc cp **special-titlecases**)))
 810     (if value
 811         (if (atom (cdr value))
 812             (list (code-char (cdr value)))
 813             (mapcar #'code-char (cdr value)))
 814         (char-uppercase char))))
 815
 816 (defun char-foldcase (char)
 817   (unless (has-case-p char) (return-from char-foldcase (list char)))
 818   (let* ((cp (char-code char))
 819          (value (assoc cp **special-casefolds**)))
 820     (if value
 821         (if (atom (cdr value))
 822             (list (code-char (cdr value)))
 823             (mapcar #'code-char (cdr value)))
 824         (char-lowercase char))))
 825
 826 (defun string-somethingcase (fn string special-fn)
 827   (let (result (len (length string)))
 828     (loop for index from 0 below len
 829        for char = (char string index)
 830        for cased = (or (funcall special-fn char index len)
 831                        (funcall fn char))
 832        do (loop for c in (remove :none cased) do (push c result)))
 833     (setf result (nreverse result))
 834     (coerce result 'string)))
 835
 836 (declaim (type function sb!unix::posix-getenv))
 837 (defun get-user-locale ()
 838   (let ((raw-locale
 839          #!+(or win32 unix) (or (sb!unix::posix-getenv "LC_ALL")
 840                                 (sb!unix::posix-getenv "LANG"))
 841          #!-(or win32 unix) nil))
 842     (when raw-locale
 843       (let ((lang-code (string-upcase
 844                         (subseq raw-locale 0 (position #\_ raw-locale)))))
 845         (when lang-code
 846           (intern lang-code "KEYWORD"))))))
 847
 848
 849 (defun uppercase (string &key locale)
 850   #!+sb-doc
 851   "Returns the full uppercase of STRING according to the Unicode standard.
 852 The result is not guaranteed to have the same length as the input. If :LOCALE
 853 is NIL, no language-specific case transformations are applied. If :LOCALE is a
 854 keyword representing a two-letter ISO country code, the case transforms of that
 855 locale are used. If :LOCALE is T, the user's current locale is used (Unix and
 856 Win32 only)."
 857   (when (eq locale t) (setf locale (get-user-locale)))
 858   (string-somethingcase
 859    #'char-uppercase string
 860    #!-sb-unicode (constantly nil)
 861    #!+sb-unicode ;; code-char with a constant > 255 breaks the build
 862    #'(lambda (char index len)
 863        (declare (ignore len))
 864        (cond
 865          ((and (eql locale :lt) (char= char (code-char #x0307))
 866                   (loop for i from (1- index) downto 0
 867                      for c = (char string i)
 868                      do (case (combining-class c)
 869                           (0 (return (soft-dotted-p c)))
 870                           (230 (return nil))
 871                           (t t))
 872                      finally (return nil)))
 873           '(:none))
 874          ((and (or (eql locale :tr) (eql locale :az))
 875                (char= char #\i))
 876           (list (code-char #x0130)))
 877          (t nil)))))
 878
(defun lowercase (string &key locale)
  #!+sb-doc
  "Returns the full lowercase of STRING according to the Unicode standard.
The result is not guaranteed to have the same length as the input.
:LOCALE has the same semantics as the :LOCALE argument to UPPERCASE."
  (when (eq locale t) (setf locale (get-user-locale)))
  (string-somethingcase
   #'char-lowercase string
   #!-sb-unicode (constantly nil)
   #!+sb-unicode
   #'(lambda (char index len)
       ;; Context-sensitive mappings.  Returning NIL hands the
       ;; character to CHAR-LOWERCASE; '(:NONE) deletes it.
       (cond
         ;; U+03A3 becomes U+03C2 when, skipping case-ignorable
         ;; characters, a cased character precedes it and no cased
         ;; character follows it.  (This looks like the Final_Sigma
         ;; condition of SpecialCasing.txt.)
         ((and (char= char (code-char #x03A3))
               (loop for i from (1- index) downto 0
                  for c = (char string i)
                  do (cond ((cased-p c) (return t))
                           ((case-ignorable-p c))
                           (t (return nil)))
                  finally (return nil))
               (loop for i from (1+ index) below len
                  for c = (char string i)
                  do (cond ((cased-p c) (return nil))
                           ((case-ignorable-p c))
                           (t (return t)))
                  finally (return t)))
          (list (code-char #x03C2)))
       ;; :LT - the first table always applies; the second applies
       ;; only when a character of combining class 230 follows before
       ;; any character of class 0.  When neither ASSOC matches, the
       ;; MAPCAR yields NIL and the default mapping is used.
       ((eql locale :lt)
        (mapcar
         #'code-char
         (cdr (or
               (assoc (char-code char)
                      '((#x00CC . (#x0069 #x0307 #x0300))
                        (#x00CD . (#x0069 #x0307 #x0301))
                        (#x0128 . (#x0069 #x0307 #x0303))))
               (and (loop for i from (1+ index) below len
                       for c = (char string i)
                       do (case (combining-class c)
                            (230 (return t))
                            (0 (return nil))
                            (t t))
                       finally (return nil))
                    (assoc (char-code char)
                           '((#x0049 . (#x0069 #x0307))
                             (#x004A . (#x006A #x0307))
                             (#x012E . (#x012F #x0307)))))))))
       ;; :TR and :AZ - dotted and dotless i handling.
       ((or (eql locale :tr) (eql locale :az))
        (cond
          ;; U+0130 lowercases to a plain i.
          ((char= char (code-char #x0130)) (list #\i))
          ;; U+0307 is deleted when the nearest preceding character of
          ;; combining class 0 is I and no class-230 character comes
          ;; between them.
          ((and (char= char (code-char #x0307))
                (loop for i from (1- index) downto 0
                   for c = (char string i)
                   do (case (combining-class c)
                        (0 (return (char= c #\I)))
                        (230 (return nil))
                        (t t))
                   finally (return nil)))
           '(:none))
          ;; I lowercases to U+0131 unless the first following
          ;; character of class 0 or 230 is U+0307.
          ((and (char= char #\I)
                (loop for i from (1+ index) below len
                   for c = (char string i)
                   do (case (combining-class c)
                        (0 (return t))
                        (230 (return (char/= c (code-char #x0307))))
                        (t t))
                   finally (return t)))
           (list (code-char #x0131)))
          (t nil)))
       (t nil)))))
 947
 948 (defun titlecase (string &key locale)
 949   #!+sb-doc
 950   "Returns the titlecase of STRING. The resulting string can
 951 be longer than the input.
 952 :LOCALE has the same semantics as the :LOCALE argument to UPPERCASE."
 953   (when (eq locale t) (setf locale (get-user-locale)))
 954   (let ((words (words string))
 955         (cased nil))
 956    (loop for word in words
 957       for first-cased = (or (position-if #'cased-p word) 0)
 958       for pre = (subseq word 0 first-cased)
 959       for initial = (char word first-cased)
 960       for rest = (subseq word (1+ first-cased))
 961       do (let ((up (char-titlecase initial)) (down (lowercase rest)))
 962            #!+sb-unicode
 963            (when (and (or (eql locale :tr) (eql locale :az))
 964                       (eql initial #\i))
 965              (setf up (list (code-char #x0130))))
 966            #!+sb-unicode
 967            (when (and (eql locale :lt)
 968                       (soft-dotted-p initial)
 969                       (eql (char down
 970                                  (position-if
 971                                   #'(lambda (c)
 972                                       (or (eql (combining-class c) 0)
 973                                           (eql (combining-class c) 230))) down))
 974                            (code-char #x0307)))
 975              (setf down (delete (code-char #x0307) down :count 1)))
 976            (push (concatenate 'string pre up down) cased)))
 977    (apply #'concatenate 'string (nreverse cased))))
 978
 979 (defun casefold (string)
 980   #!+sb-doc
 981   "Returns the full casefolding of STRING according to the Unicode standard.
 982 Casefolding remove case information in a way that allaws the results to be used
 983 for case-insensitive comparisons.
 984 The result is not guaranteed to have the same length as the input."
 985   (string-somethingcase #'char-foldcase string (constantly nil)))
 986
 987 \f
 988 ;;; Unicode break algorithms
 989 ;;; In all the breaking methods:
 990 ;;; (brk) establishes a break between `first` and `second`
 991 ;;; (nobrk) prevents a break between `first` and `second`
;;; Setting flag=T/state=:nobrk-next prevents a break between `second` and `third`
 993
 994 ;; Word breaking sets this to make their algorithms less tricky
 995 (defvar *other-break-special-graphemes* nil)
 996 (defun grapheme-break-class (char)
 997   #!+sb-doc
 998   "Returns the grapheme breaking class of CHARACTER, as specified in UAX #29."
 999   (let ((cp (when char (char-code char)))
1000         (gc (when char (general-category char)))
1001         (not-spacing-mark
1002          #(#x102B #x102C #x1038 #x1062 #x1063 #x1064 #x1067 #x1068 #x1069
1003            #x106A #x106B #x106C #x106D #x1083 #x1087 #x1088 #x1089 #x108A
1004            #x108B #x108C #x108F #x109A #x109B #x109C #x19B0 #x19B1 #x19B2
1005            #x19B3 #x19B4 #x19B8 #x19B9 #x19BB #x19BC #x19BD #x19BE #x19BF
1006            #x19C0 #x19C8 #x19C9 #x1A61 #x1A63 #x1A64 #xAA7B #xAA7D)))
1007     (cond
1008       ((not char) nil)
1009       ((= cp 10) :LF)
1010       ((= cp 13) :CR)
1011       ((or (member gc '(:Mn :Me))
1012            (proplist-p char :other-grapheme-extend)
1013            (and *other-break-special-graphemes*
1014                 (member gc '(:Mc :Cf)) (not (<= #x200B cp #x200D))))
1015        :extend)
1016       ((or (member gc '(:Zl :Zp :Cc :Cs :Cf))
1017            ;; From Cn and Default_Ignorable_Code_Point
1018            (eql cp #x2065) (eql cp #xE0000)
1019            (<= #xFFF0 cp #xFFF8)
1020            (<= #xE0002 cp #xE001F)
1021            (<= #xE0080 cp #xE00FF)
1022            (<= #xE01F0 cp #xE0FFF)) :control)
1023       ((<= #x1F1E6 cp #x1F1FF) :regional-indicator)
1024       ((and (or (eql gc :Mc)
1025                 (eql cp #x0E33) (eql cp #x0EB3))
1026             (not (binary-search cp not-spacing-mark))) :spacing-mark)
1027       (t (hangul-syllable-type char)))))
1028
(defun graphemes (string)
  #!+sb-doc
  "Breaks STRING into graphemes according to the default
grapheme breaking rules specified in UAX #29, returning a list of strings."
  (let* ((chars (coerce string 'list))
         (current (list (car chars)))   ; cluster being built, reversed
         (finished '()))                ; completed clusters, reversed
    ;; Walk over adjacent pairs; NEXT is NIL on the last character.
    (loop for (this next) on chars
          for class1 = (grapheme-break-class this)
          for class2 = (grapheme-break-class next)
          do (if (cond
                   ;; The COND is true when THIS and NEXT must stay in
                   ;; the same cluster.
                   ((and (eql class1 :cr) (eql class2 :lf)) t)
                   ((or (member class1 '(:control :cr :lf))
                        (member class2 '(:control :cr :lf)))
                    nil)
                   ((and (eql class1 :l)
                         (member class2 '(:l :v :lv :lvt)))
                    t)
                   ((and (member class1 '(:v :lv))
                         (member class2 '(:v :t)))
                    t)
                   ((and (member class1 '(:lvt :t))
                         (eql class2 :t))
                    t)
                   ((and (eql class1 :regional-indicator)
                         (eql class2 :regional-indicator))
                    t)
                   ((or (member class2 '(:extend :spacing-mark))
                        (eql class1 :prepend))
                    t)
                   (t nil))
                 (push next current)
                 (progn
                   (push (nreverse current) finished)
                   (setf current (list next)))))
    (nreverse (mapcar #'(lambda (l) (coerce l 'string)) finished))))
1054
(defun word-break-class (char)
  #!+sb-doc
  "Returns the word breaking class of CHAR, as specified in UAX #29.
CHAR may also be a list of characters, in which case its first element is
classified, or NIL, in which case NIL is returned."
  ;; Words use graphemes as characters to deal with the ignore rule
  (when (listp char) (setf char (car char)))
  (let ((cp (when char (char-code char)))
        (gc (when char (general-category char)))
        ;; NEWLINES and ALSO-KATAKANA are handed to
        ;; ORDERED-RANGES-MEMBER; they look like flattened start/end
        ;; pairs (note the repeated entries) - confirm against that
        ;; helper.  The remaining vectors are probed with BINARY-SEARCH.
        (newlines #(#xB #xC #x0085 #x0085 #x2028 #x2029))
        (also-katakana
         #(#x3031 #x3035 #x309B #x309C
           #x30A0 #x30A0 #x30FC #x30FC
           #xFF70 #xFF70))
        (midnumlet #(#x002E #x2018 #x2019 #x2024 #xFE52 #xFF07 #xFF0E))
        (midletter
         #(#x003A #x00B7 #x002D7 #x0387 #x05F4 #x2027 #xFE13 #xFE55 #xFF1A))
        (midnum
         ;; Grepping of Line_Break = IS adjusted per UAX #29
         #(#x002C #x003B #x037E #x0589 #x060C #x060D #x066C #x07F8 #x2044
           #xFE10 #xFE14 #xFE50 #xFE54 #xFF0C #xFF1B)))
    ;; The clauses are tried in order and the first match wins, so
    ;; their order is significant.
    (cond
      ((not char) nil)
      ((= cp 10) :LF)
      ((= cp 13) :CR)
      ((= cp #x27) :single-quote)
      ((= cp #x22) :double-quote)
      ((ordered-ranges-member cp newlines) :newline)
      ((or (eql (grapheme-break-class char) :extend)
           (eql gc :mc)) :extend)
      ((<= #x1F1E6 cp #x1F1FF) :regional-indicator)
      ((and (eql gc :Cf) (not (<= #x200B cp #x200D))) :format)
      ((or (eql (script char) :katakana)
           (ordered-ranges-member cp also-katakana)) :katakana)
      ((and (eql (script char) :Hebrew) (eql gc :lo)) :hebrew-letter)
      ((and (or (alphabetic-p char) (= cp #x05F3))
            (not (or (ideographic-p char)
                     (eql (line-break-class char) :sa)
                     (eql (script char) :hiragana)))) :aletter)
      ((binary-search cp midnumlet) :midnumlet)
      ((binary-search cp midletter) :midletter)
      ((binary-search cp midnum) :midnum)
      ((or (and (eql gc :Nd) (not (<= #xFF10 cp #xFF19))) ;Fullwidth digits
           (eql cp #x066B)) :numeric)
      ((eql gc :Pc) :extendnumlet)
      (t nil))))
1099
;;; Push THING onto the place LIST.  When THING evaluates to a list,
;;; its elements are pushed one by one in order (so NIL pushes
;;; nothing); any other value is pushed as a single element.  THING is
;;; evaluated once.
(defmacro flatpush (thing list)
  (let ((value (gensym "THING"))
        (item (gensym "ITEM")))
    `(let ((,value ,thing))
       (cond ((listp ,value)
              (dolist (,item ,value)
                (push ,item ,list)))
             (t
              (push ,value ,list))))))
1107
(defun words (string)
  #!+sb-doc
  "Breaks STRING into words according to the default
word breaking rules specified in UAX #29. Returns a list of strings"
  ;; STRING is first split into graphemes (with
  ;; *OTHER-BREAK-SPECIAL-GRAPHEMES* bound to T).  A one-character
  ;; grapheme is represented by the character itself, a longer one by
  ;; a list of its characters; WORD-BREAK-CLASS accepts both.
  (let ((chars (mapcar
                 #'(lambda (s)
                     (let ((l (coerce s 'list)))
                       (if (cdr l) l (car l))))
                 (let ((*other-break-special-graphemes* t)) (graphemes string))))
         words word flag)
    ;; WORD collects the current word in reverse order; WORDS collects
    ;; the finished words.  FLAG, when set, forces the next step to
    ;; join without consulting the rules.
    (flatpush (car chars) word)
    ;; FIRST and SECOND are adjacent units; the car of TAIL is SECOND,
    ;; so (CADR TAIL) is the unit after it.
    (do ((first (car chars) second)
         (tail (cdr chars) (cdr tail))
         (second (cadr chars) (cadr tail)))
        ((not first) (nreverse (mapcar #'(lambda (l) (coerce l 'string)) words)))
      (flet ((brk () (push (nreverse word) words) (setf word nil) (flatpush second word))
             (nobrk () (flatpush second word)))
        (let ((c1 (word-break-class first))
              (c2 (word-break-class second))
              (c3 (when (and tail (cdr tail)) (word-break-class (cadr tail)))))
          ;; The first matching clause decides whether there is a
          ;; break between FIRST and SECOND.
          (cond
            (flag (nobrk) (setf flag nil))
            ;; CR+LF are bound together by the grapheme clustering
            ((or (eql c1 :newline) (eql c1 :cr) (eql c1 :lf)
                 (eql c2 :newline) (eql c2 :cr) (eql c2 :lf)) (brk))
            ((or (eql c2 :format) (eql c2 :extend)) (nobrk))
            ((and (or (eql c1 :aletter) (eql c1 :hebrew-letter))
                  (or (eql c2 :aletter) (eql c2 :hebrew-letter))) (nobrk))
            ((and (or (eql c1 :aletter) (eql c1 :hebrew-letter))
                  (member c2 '(:midletter :midnumlet :single-quote))
                  (or (eql c3 :aletter) (eql c3 :hebrew-letter)))
             (nobrk) (setf flag t)) ; Handle the multiple breaks from this rule
            ((and (eql c1 :hebrew-letter) (eql c2 :double-quote)
                  (eql c3 :hebrew-letter))
             (nobrk) (setf flag t))
            ((and (eql c1 :hebrew-letter) (eql c2 :single-quote)) (nobrk))
            ((or (and (eql c1 :numeric) (member c2 '(:numeric :aletter :hebrew-letter)))
                 (and (eql c2 :numeric) (member c1 '(:numeric :aletter :hebrew-letter))))
             (nobrk))
            ((and (eql c1 :numeric)
                  (member c2 '(:midnum :midnumlet :single-quote))
                  (eql c3 :numeric))
             (nobrk) (setf flag t))
            ((and (eql c1 :katakana) (eql c2 :katakana)) (nobrk))
            ((or (and (member c1
                              '(:aletter :hebrew-letter :katakana
                                :numeric :extendnumlet)) (eql c2 :extendnumlet))
                 (and (member c2
                              '(:aletter :hebrew-letter :katakana
                                :numeric :extendnumlet)) (eql c1 :extendnumlet)))
             (nobrk))
            ((and (eql c1 :regional-indicator) (eql c2 :regional-indicator)) (nobrk))
            (t (brk))))))))
1161
(defun sentence-break-class (char)
  #!+sb-doc
  "Returns the sentence breaking class of CHAR, as specified in UAX #29.
CHAR may also be a list of characters, in which case its first element is
classified, or NIL, in which case NIL is returned."
  (when (listp char) (setf char (car char)))
  (let ((cp (when char (char-code char)))
        (gc (when char (general-category char)))
        ;; Both vectors are probed with BINARY-SEARCH.
        (aterms #(#x002E #x2024 #xFE52 #xFF0E))
        (scontinues
         #(#x002C #x002D #x003A #x055D #x060C #x060D #x07F8 #x1802 #x1808
           #x2013 #x2014 #x3001 #xFE10 #xFE11 #xFE13 #xFE31 #xFE32 #xFE50
           #xFE51 #xFE55 #xFE58 #xFE63 #xFF0C #xFF0D #xFF1A #xFF64)))
    ;; The clauses are tried in order and the first match wins, so
    ;; their order is significant.
    (cond
      ((not char) nil)
      ((= cp 10) :LF)
      ((= cp 13) :CR)
      ((or (eql (grapheme-break-class char) :extend)
           (eql gc :mc)) :extend)
      ((or (eql cp #x0085) (<= #x2028 cp #x2029)) :sep)
      ((and (eql gc :Cf) (not (<= #x200C cp #x200D))) :format)
      ((whitespace-p char) :sp)
      ((lowercase-p char) :lower)
      ((or (uppercase-p char) (eql gc :Lt)) :upper)
      ((or (alphabetic-p char) (eql cp #x00A0) (eql cp #x05F3)) :oletter)
      ((or (and (eql gc :Nd) (not (<= #xFF10 cp #xFF19))) ;Fullwidth digits
           (<= #x066B cp #x066C)) :numeric)
      ((binary-search cp aterms) :aterm)
      ((binary-search cp scontinues) :scontinue)
      ((proplist-p char :sterm) :sterm)
      ((and (or (member gc '(:Po :Ps :Pe :Pf :Pi))
                (eql (line-break-class char) :qu))
            (not (eql cp #x05F3))) :close)
      (t nil))))
1194
(defun sentence-prebreak (string)
  #!+sb-doc
  "Pre-combines some sequences of characters to make the sentence-break
algorithm simpler.
Specifically,
- Combines any character with the following extend or format characters
- Combines CR + LF into '(CR LF)
- Combines a run of :close characters and the :sp characters following it
  into one unit
Returns a list whose elements are either a single character or a list of
the characters that were combined."
  (let ((chars (coerce string 'list))
        cluster clusters last-seen sp-run)
    ;; CLUSTER holds the unit being built, in reverse order; CLUSTERS
    ;; the finished units.  LAST-SEEN is the class that later
    ;; characters are compared against, and SP-RUN is set once a run
    ;; of spaces has started a unit of its own.
    (labels ((flush () (if (cdr cluster) (push (nreverse cluster) clusters)
                           (if cluster (push (car cluster) clusters)))
                    (setf cluster nil))
             ;; Finish the pending unit and emit X as a unit of its own.
             (brk (x)
               (flush) (push x clusters))
             ;; Add X to the pending unit.
             (nobrk (x) (push x cluster)))
    (loop for ch in chars
       for type = (sentence-break-class ch)
       do (cond
            ((and (eql last-seen :cr) (eql type :lf)) (nobrk ch) (flush) (setf last-seen nil))
            ((eql last-seen :cr) (brk ch) (setf last-seen nil))
            ((eql type :cr) (nobrk ch) (setf last-seen :cr))
            ((eql type :lf) (brk ch) (setf last-seen nil))
            ((eql type :sep) (brk ch) (setf last-seen nil))
            ;; Extend and format characters attach to the pending unit
            ;; without changing LAST-SEEN.
            ((and last-seen (or (eql type :extend) (eql type :format)))
             (nobrk ch))
            ((eql type :close)
             (unless (eql last-seen :close) (flush))
             (nobrk ch) (setf last-seen :close sp-run nil))
            ((eql type :sp)
             ;; A space joins the pending unit when it follows another
             ;; space, or follows a :close while SP-RUN is unset.
             (unless (or (and (not sp-run) (eql last-seen :close)) (eql last-seen :sp))
               (flush) (setf sp-run t))
             (nobrk ch) (setf last-seen :sp))
            (t (flush) (nobrk ch) (setf last-seen type sp-run nil))))
    (flush) (nreverse clusters))))
1230
(defun sentences (string)
  #!+sb-doc
  "Breaks STRING into sentences according to the default
sentence breaking rules specified in UAX #29. Returns a list of strings."
  ;; Works on the units produced by SENTENCE-PREBREAK.  SENTENCE
  ;; collects the current sentence in reverse order; SENTENCES the
  ;; finished ones.  STATE carries a decision over to the next step:
  ;; :BRK-NEXT forces a break there, :NOBRK-NEXT forces a join.
  (let ((special-handling '(:close :sp :sep :cr :lf :scontinue :sterm :aterm))
        (chars (sentence-prebreak string))
        sentence sentences state)
    (flatpush (car chars) sentence)
    ;; FIRST, SECOND and THIRD are three consecutive units; the car of
    ;; TAIL is SECOND.
    (do ((first (car chars) second)
         (tail (cdr chars) (cdr tail))
         (second (cadr chars) (cadr tail))
         (third (caddr chars) (caddr tail)))
        ((not first)
         (progn
           ; Shake off last sentence
           (when sentence (push (nreverse sentence) sentences))
           (nreverse (mapcar #'(lambda (l) (coerce l 'string)) sentences))))
      (flet ((brk () (push (nreverse sentence) sentences)
                  (setf sentence nil) (flatpush second sentence))
             (nobrk () (flatpush second sentence)))
      (let ((c1 (sentence-break-class first))
            (c2 (sentence-break-class second))
            (c3 (sentence-break-class third)))
        ;; The first matching clause decides whether there is a break
        ;; between FIRST and SECOND.
        (cond
          ((eql state :brk-next) (brk) (setf state nil))
          ((eql state :nobrk-next) (nobrk) (setf state nil))
          ((member c1 '(:sep :cr :lf)) (brk))
          ((and (eql c1 :aterm) (eql c2 :numeric)) (nobrk))
          ((and (eql c1 :upper) (eql c2 :aterm)
                (eql c3 :upper)) (nobrk) (setf state :nobrk-next))
          ((or (and (member c1 '(:sterm :aterm)) (member c2 '(:close :sp))
                    (member c3 '(:scontinue :sterm :aterm)))
               (and (member c1 '(:sterm :aterm))
                    (member c2 '(:scontinue :sterm :aterm))))
           (nobrk) (when (member c2 '(:close :sp)) (setf state :nobrk-next)))
          ((and (member c1 '(:sterm :aterm)) (member c2 '(:close :sp))
                (member c3 '(:sep :cr :lf)))
           (nobrk) (setf state :nobrk-next)) ;; Let the linebreak call (brk)
          ((and (member c1 '(:sterm :aterm)) (member c2 '(:sep :cr :lf)))
           (nobrk)) ; Doesn't trigger rule 8
          ((eql c1 :sterm) ; Not ambiguous anymore, rule 8a already handled
           (if (member c2 '(:close :sp))
               (progn (nobrk) (setf state :brk-next))
               (brk)))
          ((and (eql c2 :sterm) third (not (member c3 special-handling)))
           (nobrk) (setf state :brk-next)) ; STerm followed by nothing important
          ((or (eql c1 :aterm)
               (and (eql c2 :aterm) third
                    (not (member c3 special-handling)) (not (eql c3 :numeric))))
           ; Finally handle rule 8
           ;; Look ahead: the scan returns true only if a :LOWER unit
           ;; is reached before any :OLETTER, :UPPER, :SEP, :CR, :LF,
           ;; :STERM or :ATERM unit.
           (if (loop for c in
                    (if (and third (not (or (member c3 special-handling)
                                            (eql c3 :numeric))))
                        (cdr tail) tail)
                  for type = (sentence-break-class c) do
                    (when (member type '(:oletter :upper :sep :cr :lf
                                         :sterm :aterm))
                      (return nil))
                    (when (eql type :lower) (return t)) finally (return nil))
               ; Ambiguous case
               (progn (nobrk) (setf state :nobrk-next))
               ; Otherwise
               (if (member c2 '(:close :sp :aterm))
                   (progn (nobrk) (setf state :brk-next))
                   (brk))))
          (t (nobrk))))))))
1297
;;; Group the characters of STRING into line-breaking units.  Returns a
;;; list with one entry per unit: a bare character when the unit is a
;;; single character, or a list of characters (in string order) when
;;; characters of class :CM were attached to the character before them.
;;; A :CM character is NOT attached when the most recent non-:CM class
;;; seen is one of BK, CR, LF, NL, SP, ZW, or when nothing has been seen
;;; yet.  For an empty STRING the result is (NIL).
(defun line-prebreak (string)
  (let ((pending '())          ; current unit, most recent character first
        (groups '())           ; finished units, most recent first
        (previous-base nil))   ; class of the last non-:CM character
    (flet ((flush ()
             ;; Single characters are stored bare, longer units as lists.
             (push (if (rest pending)
                       (nreverse pending)
                       (first pending))
                   groups)
             (setf pending '())))
      (dolist (ch (coerce string 'list))
        (let ((class (line-break-class ch)))
          (when (and pending
                     (or (not (eql class :cm))
                         (member previous-base
                                 '(nil :BK :CR :LF :NL :SP :ZW))))
            (flush))
          (unless (eql class :cm)
            (setf previous-base class))
          (push ch pending)))
      ;; Emit whatever is left over (NIL when STRING was empty).
      (flush)
      (nreverse groups))))
1320
;;; Annotate STRING with line-break opportunities.  Returns a flat list
;;; that starts with :CANT and then alternates characters with break
;;; annotations:
;;;
;;;   (:cant c1 <ann> c2 <ann> c3 ... :must)
;;;
;;; where each annotation is :CANT (no break allowed), :CAN (break
;;; allowed) or :MUST (mandatory break) and describes the position before
;;; the character that follows it.  Units produced by LINE-PREBREAK that
;;; are lists (a character plus attached combining marks) are flattened
;;; with :CANT between their members.
;;;
;;; The rules are tried in order inside the TAGBODY; the first one that
;;; matches the classes of the current pair (T1, T2) emits its annotation
;;; and jumps to TAIL, so the ORDER OF THE RULES IS SIGNIFICANT.
;;; NOTE(review): the rule sequence presumably mirrors the numbered rules
;;; of UAX #14 -- confirm against the annex before reordering anything.
(defun line-break-annotate (string)
  (let ((chars (line-prebreak string))
        first second t1 t2 tail (ret (list :cant))
        state after-spaces)
    (macrolet ((cmpush (thing)
                 ;; Push THING onto RET.  A list (character + combining
                 ;; marks) is pushed element by element with :CANT between
                 ;; the elements.  THING is evaluated exactly once.
                 (let ((gthing (gensym)))
                   `(let ((,gthing ,thing))
                      (if (listp ,gthing)
                          (loop for (c next) on ,gthing do
                               (push c ret)
                               (when next (push :cant ret)))
                          (push ,gthing ret)))))
               (between (a b action)
                 ;; When T1 matches A and T2 matches B (:ANY matches
                 ;; everything, a list form matches by MEMBER), emit
                 ;; ACTION followed by SECOND and move on to the next pair.
                 (let ((atest (if (eql a :any) t
                                  (if (listp a)
                                      `(member t1 ,a)
                                      `(eql t1 ,a))))
                       (btest (if (eql b :any) t
                                  (if (listp b)
                                      `(member t2 ,b)
                                      `(eql t2 ,b)))))
                   `(when (and ,atest ,btest)
                      (cmpush ,action)
                      (cmpush second)
                      (go tail))))
               (after-spaces (a b action)
                 ;; Like BETWEEN, but B is tested against the first
                 ;; non-space unit found in TAIL.  If spaces intervene
                 ;; (T2 is :SP) the spaces are emitted with :CANT, and
                 ;; ACTION is remembered in the AFTER-SPACES variable so
                 ;; that it is applied once the spaces have been consumed.
                 (let ((atest (if (eql a :any) t
                                  (if (listp a)
                                      `(member t1 ,a)
                                      `(eql t1 ,a))))
                       (btest (if (eql b :any) t
                                  (if (listp b)
                                      `(member type ,b)
                                      `(eql type ,b)))))
                   `(when
                        (and ,atest
                             (loop for c in tail
                                for type = (line-break-class c :resolve t)
                                do
                                  (when (not (eql type :sp))
                                    (return ,btest))))
                      (if (eql t2 :sp)
                          (progn (cmpush :cant)
                                 (cmpush second)
                                 (setf state :eat-spaces)
                                 (setf after-spaces ,action)
                                 (go tail))
                          (progn (cmpush ,action)
                                 (cmpush second)
                                 (go tail)))))))

      (cmpush (car chars))
      (setf first (car chars))
      (setf tail (cdr chars))
      (setf second (car tail))
      (tagbody
         top
         (when (not first) (go end))
         (setf t1 (line-break-class first :resolve t))
         (setf t2 (line-break-class second :resolve t))
         ;; End of text: the class of a missing SECOND is matched as :NIL.
         (between :any :nil :must)
         ;; While a pending AFTER-SPACES action is waiting, keep swallowing
         ;; spaces without allowing a break.
         (when (and (eql state :eat-spaces) (eql t2 :sp))
            (cmpush :cant) (cmpush second) (go tail))
         ;; Hard line breaks.
         (between :bk :any :must)
         (between :cr :lf :cant)
         (between '(:cr :lf :nl) :any :must)
         (between :any '(:zw :bk :cr :lf :nl) :cant)
         ;; The run of spaces has ended: apply the remembered action.
         (when after-spaces (cmpush after-spaces) (cmpush second)
               (setf state nil after-spaces nil) (go tail))
         (after-spaces :zw :any :can)
         (between :any :wj :cant)
         (between :wj :any :cant)
         (between :gl :any :cant)
         (between '(:ZW :WJ :SY :SG :SA :RI :QU :PR :PO :OP :NU :NS :NL
                    :LF :IS :IN :ID :HL :GL :EX :CR :CP :CM :CL :CJ :CB
                    :BK :BB :B2 :AL :AI :JL :JV :JT :H2 :H3 :XX)
                  :gl :cant)
         (between :any '(:cl :cp :ex :is :sy) :cant)
         (after-spaces :op :any :cant)
         (after-spaces :qu :op :cant)
         (after-spaces '(:cl :cp) :ns :cant)
         (after-spaces :b2 :b2 :cant)
         (between :any :sp :cant) ;; Goes here to deal with after-spaces
         (between :sp :any :can)
         (between :any :qu :cant)
         (between :qu :any :cant)
         (between :any :cb :can)
         (between :cb :any :can)
         (between :any '(:ba :hy :ns) :cant)
         (between :bb :any :cant)
         ;; HL followed by HY: no break here, and the annotation for the
         ;; following position is forced through AFTER-SPACES.
         (when (and (eql t1 :hl) (eql t2 :hy))
           (cmpush :cant) (cmpush second)
           (setf after-spaces :can) (go tail))
         (between '(:al :hl :id :in :nu) :in :cant)
         (between :id :po :cant)
         (between '(:al :hl) :nu :cant)
         (between '(:nu :po) '(:al :hl) :cant)
         (between :pr '(:id :al :hl) :cant)
         (between '(:cl :cp :nu) '(:po :pr) :cant)
         (between :nu '(:po :pr :nu) :cant)
         (between '(:po :pr) :op :cant)
         (between '(:po :pr :hy :is :sy) :nu :cant)
         (between :jl '(:jl :jv :h2 :h3) :cant)
         (between '(:jv :h2) '(:jv :jt) :cant)
         (between '(:jt :h3) :jt :cant)
         (between '(:jl :jv :jt :h2 :h3) '(:in :po) :cant)
         (between :pr '(:jl :jv :jt :h2 :h3) :cant)
         (between '(:al :hl :is) '(:al :hl) :cant)
         (between '(:al :hl :nu) :op :cant)
         (between :cp '(:al :hl :nu) :cant)
         (between :ri :ri :cant)
         ;; Default: a break is allowed everywhere else.
         (between :any :any :can)
         tail
         ;; Advance the (FIRST, SECOND) window by one unit.
         (setf first second)
         (setf tail (cdr tail))
         (setf second (car tail))
         (go top)
         end)
      ;; LB3 satisfied by (:any :nil) -> :must
      (setf ret (nreverse ret))
      ret)))
1442
;;; Destructively split LIST after its first N conses.
;;; Returns two values: the first N elements (sharing structure with
;;; LIST, whose Nth cons gets its CDR set to NIL) and the remaining tail.
;;; Boundary cases: if N is zero or negative, or LIST is empty, nothing is
;;; modified and the values are NIL and LIST; if N is at least the length
;;; of LIST, the values are LIST and NIL.
(defun break-list-at (list n)
  (if (or (<= n 0) (null list))
      (values nil list)
      (let ((last-kept (nthcdr (1- n) list)))
        (if (null last-kept)
            ;; N runs past the end of LIST: everything stays in the head.
            (values list nil)
            (let ((tail (cdr last-kept)))
              (setf (cdr last-kept) nil)
              (values list tail))))))
1448
(defun lines (string &key (margin *print-right-margin*))
  #!+sb-doc
  "Breaks STRING into lines that are no wider than :MARGIN according to the
line breaking rules outlined in UAX #14. Combining marks will always be kept
together with their base characters, and spaces (but not other types of
whitespace) will be removed from the end of lines. If :MARGIN is unspecified,
it defaults to 80 characters."
  (when (string= string "") (return-from lines (list "")))
  (unless margin (setf margin 80))
  ;; CHARS is the flat list built by LINE-BREAK-ANNOTATE, consumed two
  ;; elements at a time as (BREAK-TYPE CHAR) pairs.
  ;; LINE holds the characters of the line being built, most recent first;
  ;; LINES holds the finished lines, most recent first.
  ;; FILLED is the running width of LINE.
  ;; LAST-BREAK-DISTANCE is the number of characters pushed onto LINE from
  ;; the most recent break opportunity onwards, or NIL when LINE has none.
  (do* ((chars (line-break-annotate string))
        line lines (filled 0) last-break-distance
        (break-type (car chars) (car tail))
        (char (cadr chars) (cadr tail))
        (tail (cddr chars) (cddr tail)))
       ((not break-type)
        (mapcar #'(lambda (s) (coerce s 'string)) (nreverse lines)))
    (ecase break-type
      (:cant
       (push char line)
       ;; Characters of class :CM do not add to the width.
       (unless (eql (line-break-class char) :CM)
         (incf filled))
       (when last-break-distance (incf last-break-distance)))
      (:can
       (push char line)
       (setf last-break-distance 1)
       (incf filled))
      (:must
       (push char line)
       (setf last-break-distance 1)
       (incf filled)
       (go break)))
    (if (> filled margin)
        (go break)
        (go next))
   break
    (when (not last-break-distance)
      ;; If we don't have any line breaks, remove the last thing we added that
      ;; takes up space, and all its combining marks
      (setf last-break-distance
            (1+ (loop for c in line while (eql (line-break-class c) :cm) summing 1))))
    ;; LINE is reversed, so the first LAST-BREAK-DISTANCE conses are the most
    ;; recently added characters: they carry over to the next line, while the
    ;; rest becomes the finished line.
    (multiple-value-bind (next-line this-line) (break-list-at line last-break-distance)
      ;; THIS-LINE is still reversed, so this strips spaces from the END of
      ;; the finished line.
      (loop while (eql (line-break-class (car this-line)) :sp)
         do (setf this-line (cdr this-line)))
      (push (nreverse this-line) lines)
      (setf line next-line)
      (setf filled (length line))
      (setf last-break-distance nil))
   next))
1497
1498 \f
1499 ;;; Collation
;;; Upper bound used by VARIABLE-P to decide whether a primary weight
;;; belongs to a variable collation element.  The value is loaded at read
;;; time (#.) by READing the first object from the file
;;; "other-collation-info.lisp-expr" in the "output" directory two levels
;;; above the file being compiled.
(defconstant +maximum-variable-primary-element+
  #.(with-open-file (stream
                     (merge-pathnames
                      (make-pathname
                       :directory
                       '(:relative :up :up "output")
                       :name "other-collation-info" :type "lisp-expr")
                      sb!xc:*compile-file-truename*)
                     :direction :input
                     :element-type 'character)
      (read stream)))
1511
;;; Expand a packed collation key into a list of collation elements.
;;; Every 32-bit entry of KEY becomes one three-element list built from
;;; its bit fields: bits 16-31, bits 5-15 and bits 0-4, in that order.
(defun unpack-collation-key (key)
  (declare (type (simple-array (unsigned-byte 32) (*)) key))
  (flet ((unpack (packed)
           (list (ldb (byte 16 16) packed)
                 (ldb (byte 11 5) packed)
                 (ldb (byte 5 0) packed))))
    (map 'list #'unpack key)))
1519
(declaim (inline variable-p))
;;; True when the weight X lies in the closed range
;;; [1, +MAXIMUM-VARIABLE-PRIMARY-ELEMENT+].
(defun variable-p (x)
  (and (>= x 1)
       (<= x +maximum-variable-primary-element+)))
1523
;;; Look up the collation elements for the characters of STRING between
;;; START (inclusive) and END (exclusive).
;;;
;;; Returns a list of (primary secondary tertiary) weight lists, or NIL
;;; when no key exists for that span:
;;;   * spans of length 1 to 3 are looked up in **CHARACTER-COLLATIONS**
;;;     (shorter spans are padded with code point 0);
;;;   * a single character missing from the table gets two implicit
;;;     collation elements computed from its code point;
;;;   * any other span length, or an unknown multi-character span, gives
;;;     NIL.
(defun collation-key (string start end)
  (let (char1
        (char2 (code-char 0))
        (char3 (code-char 0)))
    (case (- end start)
      (1 (setf char1 (char string start)))
      (2 (setf char1 (char string start)
               char2 (char string (+ start 1))))
      (3 (setf char1 (char string start)
               char2 (char string (+ start 1))
               char3 (char string (+ start 2))))
      (t
       ;; There are never more than three characters in a contraction, right?
       (return-from collation-key nil)))
    (let ((packed-key (gethash (pack-3-codepoints
                                (char-code char1)
                                (char-code char2)
                                (char-code char3))
                               **character-collations**)))
      (if packed-key
          (unpack-collation-key packed-key)
          ;; Only a lone character gets implicit weights.
          (when (char= (code-char 0) char2 char3)
            (let* ((cp (char-code char1))
                   ;; The base depends on whether the character is a
                   ;; unified ideograph and on the range it lives in.
                   (base
                     (cond ((not (proplist-p char1 :unified-ideograph))
                            #xFBC0)
                           ((or (<= #x4E00 cp #x9FFF)
                                (<= #xF900 cp #xFAFF))
                            #xFB40)
                           (t
                            #xFB80)))
                   ;; Implicit weights per UTS #10:
                   ;;   AAAA = base + (CP >> 15)
                   ;;   BBBB = (CP & #x7FFF) | #x8000
                   ;; The mask must be 15 bits wide so that BBBB stays a
                   ;; 16-bit weight; the high bits are already in AAAA.
                   (a (+ base (ash cp -15)))
                   (b (logior #x8000 (logand cp #x7FFF))))
              (list (list a #x20 #x2) (list b 0 0))))))))
1558
;;; Build the sort key of STRING: a vector holding the primary weights, a
;;; 0 separator, the secondary weights, a 0 separator, the tertiary
;;; weights, a 0 separator and the quaternary weights.
;;;
;;; STRING is first normalized to NFD.  Collation elements are then
;;; gathered by repeatedly taking the longest span (1 to 3 characters)
;;; for which COLLATION-KEY returns a key, and afterwards trying to extend
;;; the match with later combining characters.  Variable elements (see
;;; VARIABLE-P) contribute their primary weight to the quaternary level
;;; only, and elements with a zero primary weight that directly follow
;;; them are dropped.
(defun sort-key (string)
  (let* ((str (normalize-string string :nfd))
         (i 0) (len (length str)) max-match new-i
         sort-key
         after-variable)
    (loop while (< i len)
          do
          ;; Longest contiguous match starting at I.  A span of length 1
          ;; always yields a key, so MAX-MATCH and NEW-I are always set.
          (loop for offset from 1 to 3
                for index = (+ i offset)
                while (<= index len)
                do
                (let ((key (collation-key str i index)))
                  (when key
                    (setf max-match key
                          new-i index))))
          ;; Discontiguous match: walk over the combining characters that
          ;; follow (stopping at combining class 0), skipping those
          ;; blocked by a predecessor of equal or higher combining class.
          ;; Each candidate is swapped next to the match; the swap is
          ;; undone when the extended span has no collation key.
          (loop for index from new-i below len
                for char = (char str index)
                for combining-class = (combining-class char)
                until (eql combining-class 0)
                unless (and (>= (- index new-i) 1)
                            ;; Combiners are sorted, we only have to look back
                            ;; one step (see canonically-compose)
                            (>= (combining-class (char str (1- index)))
                                combining-class))
                do
                (rotatef (char str new-i) (char str index))
                (let ((key (collation-key str i (1+ new-i))))
                  (if key
                      (setf max-match key
                            new-i (1+ new-i))
                      (rotatef (char str new-i) (char str index)))))
          ;; SORT-KEY accumulates the collation elements in reverse.
          (loop for key in max-match do (push key sort-key))
          (setf i new-i))
    (macrolet ((push-non-zero (obj place)
                 `(when (/= ,obj 0)
                    (push ,obj ,place))))
      (let (primary secondary tertiary quaternary)
        (loop for (k1 k2 k3) in (nreverse sort-key)
              do
              (cond
                ;; Completely ignorable element.
                ((= k1 k2 k3 0))
                ;; Variable element: its primary weight moves to the
                ;; quaternary level.
                ((variable-p k1)
                 (setf after-variable t)
                 (push k1 quaternary))
                ;; Ordinary element with a non-zero primary weight.
                ((/= k1 0)
                 (setf after-variable nil)
                 (push k1 primary)
                 (push-non-zero k2 secondary)
                 (push-non-zero k3 tertiary)
                 (push #xFFFF quaternary))
                ;; Zero primary weight: kept unless it directly follows a
                ;; variable element.
                ((/= k3 0)
                 (unless after-variable
                   (push-non-zero k2 secondary)
                   (push k3 tertiary)
                   (push #xFFFF quaternary)))))
        (concatenate 'vector
                     (nreverse primary) #(0) (nreverse secondary) #(0)
                     (nreverse tertiary) #(0) (nreverse quaternary))))))
1618
;;; Lexicographic "less than" on two vectors of real numbers.
;;; The first position at which the vectors differ decides the result;
;;; when one vector is a prefix of the other, the shorter one sorts first.
(defun vector< (vector1 vector2)
  (let ((length1 (length vector1))
        (length2 (length vector2)))
    (dotimes (pos (min length1 length2)
                  ;; No difference in the common prefix: compare lengths.
                  (< length1 length2))
      (let ((left (aref vector1 pos))
            (right (aref vector2 pos)))
        (cond ((< left right) (return t))
              ((> left right) (return nil)))))))
1627
(defun unicode= (string1 string2 &key (start1 0) end1 (start2 0) end2 (strict t))
  #!+sb-doc
  "Determines whether STRING1 and STRING2 are canonically equivalent according
to Unicode. The START and END arguments behave like the arguments to STRING=.
If :STRICT is NIL, UNICODE= tests compatibility equivalence instead."
  (let ((form (if strict :nfd :nfkd)))
    (flet ((normalized (string start end)
             ;; Bring the selected substring into the chosen decomposed form.
             (normalize-string (subseq string start end) form)))
      (string= (normalized string1 start1 end1)
               (normalized string2 start2 end2)))))
1636
(defun unicode-equal (string1 string2 &key (start1 0) end1 (start2 0) end2 (strict t))
  #!+sb-doc
  "Determines whether STRING1 and STRING2 are canonically equivalent after
casefolding (that is, ignoring case differences) according to Unicode. The
START and END arguments behave like the arguments to STRING=. If :STRICT is
NIL, UNICODE-EQUAL tests compatibility equivalence instead."
  (let ((form (if strict :nfd :nfkd)))
    (flet ((folded (string start end)
             ;; Normalize, casefold, then normalize the folded text again.
             (normalize-string
              (casefold (normalize-string (subseq string start end) form))
              form)))
      (string= (folded string1 start1 end1)
               (folded string2 start2 end2)))))
1648
(defun unicode< (string1 string2 &key (start1 0) end1 (start2 0) end2)
  #!+sb-doc
  "Determines whether STRING1 sorts before STRING2 using the Unicode Collation
Algorithm. The function uses an untailored Default Unicode Collation Element
Table to produce the sort keys, and the Shifted method for dealing with
variable-weight characters, as described in UTS #10."
  (let* ((sub1 (subseq string1 start1 end1))
         (sub2 (subseq string2 start2 end2))
         (key1 (sort-key sub1))
         (key2 (sort-key sub2)))
    (cond ((not (equalp key1 key2))
           (vector< key1 key2))
          (t
           ;; Identical sort keys: fall back to comparing the NFD forms.
           (string< (normalize-string sub1 :nfd)
                    (normalize-string sub2 :nfd))))))
1661
(defun unicode<= (string1 string2 &key (start1 0) end1 (start2 0) end2)
  #!+sb-doc
  "Tests if STRING1 and STRING2 are either UNICODE< or UNICODE="
  (flet ((holds-p (predicate)
           ;; Apply PREDICATE to both strings with the caller's bounds.
           (funcall predicate string1 string2
                    :start1 start1 :end1 end1
                    :start2 start2 :end2 end2)))
    (or (holds-p #'unicode=)
        (holds-p #'unicode<))))
1670
(defun unicode> (string1 string2 &key (start1 0) end1 (start2 0) end2)
  #!+sb-doc
  "Tests if STRING2 is UNICODE< STRING1."
  ;; Swap the two strings together with their bounds and delegate.
  (unicode< string2 string1
            :start2 start1 :end2 end1
            :start1 start2 :end1 end2))
1676
(defun unicode>= (string1 string2 &key (start1 0) end1 (start2 0) end2)
  #!+sb-doc
  "Tests if STRING1 and STRING2 are either UNICODE= or UNICODE>"
  (flet ((holds-p (predicate)
           ;; Apply PREDICATE to both strings with the caller's bounds.
           (funcall predicate string1 string2
                    :start1 start1 :end1 end1
                    :start2 start2 :end2 end2)))
    (or (holds-p #'unicode=)
        (holds-p #'unicode>))))
1685
1686 \f
1687 ;;; Confusable detection
1688
;;; Rewrite STRING by replacing substrings found in **CONFUSABLES** with
;;; the values stored for them.  At each position the substrings of length
;;; 1 to 5 are looked up and the longest one with an entry wins; when none
;;; has an entry the single character at that position is copied through.
;;; Returns a new string.
(defun canonically-deconfuse (string)
  (let ((pieces '())            ; output fragments, most recent first
        (limit (length string))
        (pos 0))
    (loop
      (when (>= pos limit) (return))
      (let ((match nil)
            (match-end nil))
        ;; Probe increasingly long substrings; a later hit replaces an
        ;; earlier one, so the longest match is kept.
        (do ((stop (1+ pos) (1+ stop)))
            ((or (> stop (+ pos 5)) (> stop limit)))
          (let ((entry (gethash (subseq string pos stop) **confusables**)))
            (when entry
              (setf match entry
                    match-end stop))))
        (if match
            (progn (push match pieces)
                   (setf pos match-end))
            (progn (push (subseq string pos (1+ pos)) pieces)
                   (incf pos)))))
    (apply #'concatenate 'string (nreverse pieces))))
1704
(defun confusable-p (string1 string2 &key (start1 0) end1 (start2 0) end2)
  #!+sb-doc
  "Determines whether STRING1 and STRING2 could be visually confusable
according to the IDNA confusableSummary.txt table"
  (let ((form #!+sb-unicode :nfd #!-sb-unicode :nfc))
    (flet ((skeleton (string start end)
             ;; Normalize the substring, map confusable sequences to their
             ;; replacements, then normalize the result again.
             (normalize-string
              (canonically-deconfuse
               (normalize-string (subseq string start end) form))
              form)))
      (string= (skeleton string1 start1 end1)
               (skeleton string2 start2 end2)))))