src/code/target-unicode.lisp

   1 ;;;; Unicode functions
   2
   3 ;;;; This software is part of the SBCL system. See the README file for
   4 ;;;; more information.
   5 ;;;;
   6 ;;;; This software is derived from the CMU CL system, which was
   7 ;;;; written at Carnegie Mellon University and released into the
   8 ;;;; public domain. The software is in the public domain and is
   9 ;;;; provided with absolutely no warranty. See the COPYING and CREDITS
  10 ;;;; files for more information.
  11
  12 (in-package "SB!UNICODE")
  13
  14 (declaim (type simple-vector **special-numerics**))
  15 (sb!impl::defglobal **special-numerics**
  16   #.(with-open-file (stream
  17                      (merge-pathnames
  18                       (make-pathname
  19                        :directory
  20                        '(:relative :up :up "output")
  21                        :name "numerics" :type "lisp-expr")
  22                       sb!xc:*compile-file-truename*)
  23                      :direction :input
  24                      :element-type 'character)
  25       (read stream)))
  26
  27
  28 (declaim (type (simple-array (unsigned-byte 32) (*)) **block-ranges**))
  29 (sb!impl::defglobal **block-ranges**
  30   #.(sb!int:!coerce-to-specialized
  31      (with-open-file (stream
  32                       (merge-pathnames
  33                        (make-pathname
  34                         :directory
  35                         '(:relative :up :up "output")
  36                         :name "blocks" :type "lisp-expr")
  37                        sb!xc:*compile-file-truename*)
  38                       :direction :input
  39                       :element-type 'character)
  40        (read stream))
  41      '(unsigned-byte 32)))
  42
  43 (macrolet ((unicode-property-init ()
  44              (let ((proplist-dump
  45                     (with-open-file (stream
  46                                      (merge-pathnames
  47                                       (make-pathname
  48                                        :directory
  49                                        '(:relative :up :up "output")
  50                                        :name "misc-properties" :type "lisp-expr")
  51                                       sb!xc:*compile-file-truename*)
  52                                      :direction :input
  53                                      :element-type 'character)
  54                       (read stream)))
  55                    (confusable-sets
  56                     (with-open-file (stream
  57                                      (merge-pathnames
  58                                       (make-pathname
  59                                        :directory
  60                                        '(:relative :up :up "output")
  61                                        :name "confusables" :type "lisp-expr")
  62                                       sb!xc:*compile-file-truename*)
  63                                      :direction :input
  64                                      :element-type 'character)
  65                       (read stream)))
  66                    (bidi-mirroring-list
  67                     (with-open-file (stream
  68                                      (merge-pathnames
  69                                       (make-pathname
  70                                        :directory
  71                                        '(:relative :up :up "output")
  72                                        :name "bidi-mirrors" :type "lisp-expr")
  73                                       sb!xc:*compile-file-truename*)
  74                                      :direction :input
  75                                      :element-type 'character)
  76                       (read stream))))
  77                `(progn
  78                   (sb!impl::defglobal **proplist-properties** ',proplist-dump)
  79                   (sb!impl::defglobal **confusables** ',confusable-sets)
  80                   (sb!impl::defglobal **bidi-mirroring-glyphs** ',bidi-mirroring-list)
  81                   (defun !unicode-properties-cold-init ()
  82                     (let ((hash (make-hash-table)) (list ',proplist-dump))
  83                       (do ((k (car list) (car list)) (v (cadr list) (cadr list)))
  84                           ((not list) hash)
  85                         (setf (gethash k hash) v)
  86                         (setf list (cddr list)))
  87                       (setf **proplist-properties** hash))
  88                     (let ((hash (make-hash-table :test #'equal)))
  89                       (loop for set in ',confusable-sets
  90                          for items = (mapcar #'(lambda (item)
  91                                                  (map 'simple-string
  92                                                       #'code-char item))
  93                                              #!+sb-unicode set
  94                                              #!-sb-unicode
  95                                              (remove-if-not
  96                                               #'(lambda (item)
  97                                                   (every
  98                                                    #'(lambda (x)
  99                                                        (< x sb!xc:char-code-limit))
 100                                                    item)) set))
 101                          do (dolist (i items)
 102                               (setf (gethash i hash) (first items))))
 103                       (setf **confusables** hash))
 104                     (let ((hash (make-hash-table)) (list ',bidi-mirroring-list))
 105                       (loop for (k v) in list do
 106                            (setf (gethash k hash) v))
 107                       (setf **bidi-mirroring-glyphs** hash)))))))
 108   (unicode-property-init))
 109
 110 ;;; Unicode property access
 111 (defun ordered-ranges-member (item vector)
 112   (declare (type simple-vector vector)
 113            (type fixnum item)
 114            (optimize speed))
 115   (labels ((recurse (start end)
 116              (declare (type index start end))
 117              (when (< start end)
 118                (let* ((i (+ start (truncate (- end start) 2)))
 119                       (index (* 2 i))
 120                       (elt1 (svref vector index))
 121                       (elt2 (svref vector (1+ index))))
 122                  (declare (type index i)
 123                           (fixnum elt1 elt2))
 124                  (cond ((< item elt1)
 125                         (recurse start i))
 126                        ((> item elt2)
 127                         (recurse (+ 1 i) end))
 128                        (t
 129                         item))))))
 130     (recurse 0 (truncate (length vector) 2))))
 131
 132 ;; Returns which range `item` was found in or NIL
 133 ;; First range = 0, second range = 1 ...
 134 (defun ordered-ranges-position (item vector)
 135   (declare (type (simple-array (unsigned-byte 32) (*)) vector)
 136            (type fixnum item))
 137   (labels ((recurse (start end)
 138              (declare (type index start end))
 139              (when (< start end)
 140                (let* ((i (+ start (truncate (- end start) 2)))
 141                       (index (* 2 i))
 142                       (elt1 (aref vector index))
 143                       (elt2 (aref vector (1+ index))))
 144                  (declare (type index i))
 145                  (cond ((< item elt1)
 146                         (recurse start i))
 147                        ((> item elt2)
 148                         (recurse (+ 1 i) end))
 149                        (t
 150                         i))))))
 151     (recurse 0 (truncate (length vector) 2))))
 152
 153 (defun proplist-p (character property)
 154   #!+sb-doc
 155   "Returns T if CHARACTER has the specified PROPERTY.
 156 PROPERTY is a keyword representing one of the properties from PropList.txt,
 157 with underscores replaced by dashes."
 158   (ordered-ranges-member (char-code character)
 159                          (gethash property **proplist-properties**)))
 160
;; WARNING: These have to be manually kept in sync with the values in ucd.lisp
;;
;; Each of the vectors below maps a small integer stored in the character
;; database to the keyword naming the property value; the accessors in this
;; file index into them with SVREF-OR-NULL or AREF.
(declaim (type simple-vector *general-categories* *bidi-classes* *east-asian-widths*
               *scripts* *line-break-classes* *blocks*))
;; Indexed by the result of SB!IMPL::UCD-GENERAL-CATEGORY (see GENERAL-CATEGORY).
(sb!impl::defglobal *general-categories*
  #(:Lu :Ll :Lt :Lm :Lo :Cc :Cf :Co :Cs :Cn :Mc :Me :Mn :Nd
    :Nl :No :Pc :Pd :Pe :Pf :Pi :Po :Ps :Sc :Sk :Sm :So :Zl
    :Zp :Zs))

;; Indexed by the misc-database entry read in BIDI-CLASS.
(sb!impl::defglobal *bidi-classes*
  #(:BN :AL :AN :B :CS :EN :ES :ET :L :LRE :LRO :NSM :ON
    :PDF :R :RLE :RLO :S :WS :LRI :RLI :FSI :PDI))

;; Indexed by the low three bits of the misc-database entry read in
;; EAST-ASIAN-WIDTH.
(sb!impl::defglobal *east-asian-widths*
  #(:N :A :H :W :F :Na))
 175
;; Script keywords, indexed by the misc-database entry read in SCRIPT.
;; Position 0 is :Unknown.
(sb!impl::defglobal *scripts*
  #(:Unknown :Common :Latin :Greek :Cyrillic :Armenian :Hebrew :Arabic :Syriac
    :Thaana :Devanagari :Bengali :Gurmukhi :Gujarati :Oriya :Tamil :Telugu
    :Kannada :Malayalam :Sinhala :Thai :Lao :Tibetan :Myanmar :Georgian :Hangul
    :Ethiopic :Cherokee :Canadian-Aboriginal :Ogham :Runic :Khmer :Mongolian
    :Hiragana :Katakana :Bopomofo :Han :Yi :Old-Italic :Gothic :Deseret
    :Inherited :Tagalog :Hanunoo :Buhid :Tagbanwa :Limbu :Tai-Le :Linear-B
    :Ugaritic :Shavian :Osmanya :Cypriot :Braille :Buginese :Coptic :New-Tai-Lue
    :Glagolitic :Tifinagh :Syloti-Nagri :Old-Persian :Kharoshthi :Balinese
    :Cuneiform :Phoenician :Phags-Pa :Nko :Sundanese :Lepcha :Ol-Chiki :Vai
    :Saurashtra :Kayah-Li :Rejang :Lycian :Carian :Lydian :Cham :Tai-Tham
    :Tai-Viet :Avestan :Egyptian-Hieroglyphs :Samaritan :Lisu :Bamum :Javanese
    :Meetei-Mayek :Imperial-Aramaic :Old-South-Arabian :Inscriptional-Parthian
    :Inscriptional-Pahlavi :Old-Turkic :Kaithi :Batak :Brahmi :Mandaic :Chakma
    :Meroitic-Cursive :Meroitic-Hieroglyphs :Miao :Sharada :Sora-Sompeng
    :Takri :Bassa-Vah :Mahajani :Pahawh-Hmong :Caucasian-Albanian :Manichaean
    :Palmyrene :Duployan :Mende-Kikakui :Pau-Cin-Hau :Elbasan :Modi
    :Psalter-Pahlavi :Grantha :Mro :Siddham :Khojki :Nabataean :Tirhuta
    :Khudawadi :Old-North-Arabian :Warang-Citi :Linear-A :Old-Permic))
 195
;; Line-breaking class keywords, indexed by the misc-database entry read in
;; LINE-BREAK-CLASS.
(sb!impl::defglobal *line-break-classes*
    #(:XX :AI :AL :B2 :BA :BB :BK :CB :CJ :CL :CM :CP :CR :EX :GL
      :HL :HY :ID :IN :IS :LF :NL :NS :NU :OP :PO :PR :QU :RI :SA
      :SG :SP :SY :WJ :ZW))
 200
;; Block-name keywords. CHAR-BLOCK indexes this vector with the range number
;; that ORDERED-RANGES-POSITION finds in **BLOCK-RANGES**, so the order here
;; has to match the order of the ranges in that table.
(sb!impl::defglobal *blocks*
  #(:Basic-Latin :Latin-1-Supplement :Latin-Extended-A :Latin-Extended-B
    :IPA-Extensions :Spacing-Modifier-Letters :Combining-Diacritical-Marks
    :Greek-and-Coptic :Cyrillic :Cyrillic-Supplement :Armenian :Hebrew :Arabic
    :Syriac :Arabic-Supplement :Thaana :NKo :Samaritan :Mandaic
    :Arabic-Extended-A :Devanagari :Bengali :Gurmukhi :Gujarati :Oriya :Tamil
    :Telugu :Kannada :Malayalam :Sinhala :Thai :Lao :Tibetan :Myanmar :Georgian
    :Hangul-Jamo :Ethiopic :Ethiopic-Supplement :Cherokee
    :Unified-Canadian-Aboriginal-Syllabics :Ogham :Runic :Tagalog :Hanunoo
    :Buhid :Tagbanwa :Khmer :Mongolian
    :Unified-Canadian-Aboriginal-Syllabics-Extended :Limbu :Tai-Le :New-Tai-Lue
    :Khmer-Symbols :Buginese :Tai-Tham :Combining-Diacritical-Marks-Extended
    :Balinese :Sundanese :Batak :Lepcha :Ol-Chiki :Sundanese-Supplement
    :Vedic-Extensions :Phonetic-Extensions :Phonetic-Extensions-Supplement
    :Combining-Diacritical-Marks-Supplement :Latin-Extended-Additional
    :Greek-Extended :General-Punctuation :Superscripts-and-Subscripts
    :Currency-Symbols :Combining-Diacritical-Marks-for-Symbols
    :Letterlike-Symbols :Number-Forms :Arrows :Mathematical-Operators
    :Miscellaneous-Technical :Control-Pictures :Optical-Character-Recognition
    :Enclosed-Alphanumerics :Box-Drawing :Block-Elements :Geometric-Shapes
    :Miscellaneous-Symbols :Dingbats :Miscellaneous-Mathematical-Symbols-A
    :Supplemental-Arrows-A :Braille-Patterns :Supplemental-Arrows-B
    :Miscellaneous-Mathematical-Symbols-B :Supplemental-Mathematical-Operators
    :Miscellaneous-Symbols-and-Arrows :Glagolitic :Latin-Extended-C :Coptic
    :Georgian-Supplement :Tifinagh :Ethiopic-Extended :Cyrillic-Extended-A
    :Supplemental-Punctuation :CJK-Radicals-Supplement :Kangxi-Radicals
    :Ideographic-Description-Characters :CJK-Symbols-and-Punctuation :Hiragana
    :Katakana :Bopomofo :Hangul-Compatibility-Jamo :Kanbun :Bopomofo-Extended
    :CJK-Strokes :Katakana-Phonetic-Extensions :Enclosed-CJK-Letters-and-Months
    :CJK-Compatibility :CJK-Unified-Ideographs-Extension-A
    :Yijing-Hexagram-Symbols :CJK-Unified-Ideographs :Yi-Syllables :Yi-Radicals
    :Lisu :Vai :Cyrillic-Extended-B :Bamum :Modifier-Tone-Letters
    :Latin-Extended-D :Syloti-Nagri :Common-Indic-Number-Forms :Phags-pa
    :Saurashtra :Devanagari-Extended :Kayah-Li :Rejang :Hangul-Jamo-Extended-A
    :Javanese :Myanmar-Extended-B :Cham :Myanmar-Extended-A :Tai-Viet
    :Meetei-Mayek-Extensions :Ethiopic-Extended-A :Latin-Extended-E
    :Meetei-Mayek :Hangul-Syllables :Hangul-Jamo-Extended-B :High-Surrogates
    :High-Private-Use-Surrogates :Low-Surrogates :Private-Use-Area
    :CJK-Compatibility-Ideographs :Alphabetic-Presentation-Forms
    :Arabic-Presentation-Forms-A :Variation-Selectors :Vertical-Forms
    :Combining-Half-Marks :CJK-Compatibility-Forms :Small-Form-Variants
    :Arabic-Presentation-Forms-B :Halfwidth-and-Fullwidth-Forms :Specials
    :Linear-B-Syllabary :Linear-B-Ideograms :Aegean-Numbers
    :Ancient-Greek-Numbers :Ancient-Symbols :Phaistos-Disc :Lycian :Carian
    :Coptic-Epact-Numbers :Old-Italic :Gothic :Old-Permic :Ugaritic :Old-Persian
    :Deseret :Shavian :Osmanya :Elbasan :Caucasian-Albanian :Linear-A
    :Cypriot-Syllabary :Imperial-Aramaic :Palmyrene :Nabataean :Phoenician
    :Lydian :Meroitic-Hieroglyphs :Meroitic-Cursive :Kharoshthi
    :Old-South-Arabian :Old-North-Arabian :Manichaean :Avestan
    :Inscriptional-Parthian :Inscriptional-Pahlavi :Psalter-Pahlavi :Old-Turkic
    :Rumi-Numeral-Symbols :Brahmi :Kaithi :Sora-Sompeng :Chakma :Mahajani
    :Sharada :Sinhala-Archaic-Numbers :Khojki :Khudawadi :Grantha :Tirhuta
    :Siddham :Modi :Takri :Warang-Citi :Pau-Cin-Hau :Cuneiform
    :Cuneiform-Numbers-and-Punctuation :Egyptian-Hieroglyphs :Bamum-Supplement
    :Mro :Bassa-Vah :Pahawh-Hmong :Miao :Kana-Supplement :Duployan
    :Shorthand-Format-Controls :Byzantine-Musical-Symbols :Musical-Symbols
    :Ancient-Greek-Musical-Notation :Tai-Xuan-Jing-Symbols
    :Counting-Rod-Numerals :Mathematical-Alphanumeric-Symbols :Mende-Kikakui
    :Arabic-Mathematical-Alphabetic-Symbols :Mahjong-Tiles :Domino-Tiles
    :Playing-Cards :Enclosed-Alphanumeric-Supplement
    :Enclosed-Ideographic-Supplement :Miscellaneous-Symbols-and-Pictographs
    :Emoticons :Ornamental-Dingbats :Transport-and-Map-Symbols
    :Alchemical-Symbols :Geometric-Shapes-Extended :Supplemental-Arrows-C
    :CJK-Unified-Ideographs-Extension-B :CJK-Unified-Ideographs-Extension-C
    :CJK-Unified-Ideographs-Extension-D :CJK-Compatibility-Ideographs-Supplement
    :Tags :Variation-Selectors-Supplement :Supplementary-Private-Use-Area-A
    :Supplementary-Private-Use-Area-B))
 268
 269 (declaim (inline svref-or-null))
 270 (defun svref-or-null (vector index)
 271   (and (< index (length vector))
 272        (svref vector index)))
 273
 274 (defun general-category (character)
 275   #!+sb-doc
 276   "Returns the general category of CHARACTER as it appears in UnicodeData.txt"
 277   (svref-or-null *general-categories* (sb!impl::ucd-general-category character)))
 278
 279 (defun bidi-class (character)
 280   #!+sb-doc
 281   "Returns the bidirectional class of CHARACTER"
 282   (if (and (eql (general-category character) :Cn)
 283            (default-ignorable-p character))
 284       :Bn
 285       (svref-or-null
 286        *bidi-classes*
 287        (aref **character-misc-database** (1+ (misc-index character))))))
 288
 289 (declaim (inline combining-class))
 290 (defun combining-class (character)
 291   #!+sb-doc
 292   "Returns the canonical combining class (CCC) of CHARACTER"
 293   (aref **character-misc-database** (+ 2 (misc-index character))))
 294
 295 (defun decimal-value (character)
 296   #!+sb-doc
 297   "Returns the decimal digit value associated with CHARACTER or NIL if
 298 there is no such value.
 299
 300 The only characters in Unicode with a decimal digit value are those
 301 that are part of a range of characters that encode the digits 0-9.
 302 Because of this, `(decimal-digit c) <=> (digit-char-p c 10)` in
 303 #+sb-unicode builds"
 304   (sb!impl::ucd-decimal-digit character))
 305
 306 (defun digit-value (character)
 307   #!+sb-doc
 308   "Returns the Unicode digit value of CHARACTER or NIL if it doesn't exist.
 309
 310 Digit values are guaranteed to be integers between 0 and 9 inclusive.
 311 All characters with decimal digit values have the same digit value,
 312 but there are characters (such as digits of number systems without a 0 value)
 313 that have a digit value but no decimal digit value"
 314   (let ((%digit (clear-flag 6
 315                             (aref **character-misc-database**
 316                                   (+ 3 (misc-index character))))))
 317     (if (< %digit 10) %digit nil)))
 318
 319 (defun numeric-value (character)
 320   #!+sb-doc
 321   "Returns the numeric value of CHARACTER or NIL if there is no such value.
 322 Numeric value is the most general of the Unicode numeric properties.
 323 The only constraint on the numeric value is that it be a rational number."
 324   (or (double-vector-binary-search (char-code character)
 325                                    **special-numerics**)
 326       (digit-value character)))
 327
 328 (defun mirrored-p (character)
 329   #!+sb-doc
 330   "Returns T if CHARACTER needs to be mirrored in bidirectional text.
 331 Otherwise, returns NIL."
 332   (logbitp 5 (aref **character-misc-database**
 333                     (+ 5 (misc-index character)))))
 334
 335 (defun bidi-mirroring-glyph (character)
 336   #!+sb-doc
 337   "Returns the mirror image of CHARACTER if it exists.
 338 Otherwise, returns NIL."
 339   (when (mirrored-p character)
 340     (let ((ret (gethash (char-code character) **bidi-mirroring-glyphs**)))
 341       (when ret (code-char ret)))))
 342
 343 (defun east-asian-width (character)
 344   #!+sb-doc
 345   "Returns the East Asian Width property of CHARACTER as
 346 one of the keywords :N (Narrow), :A (Ambiguous), :H (Halfwidth),
 347 :W (Wide), :F (Fullwidth), or :NA (Not applicable)"
 348   (svref-or-null *east-asian-widths*
 349                  (ldb (byte 3 0)
 350                       (aref **character-misc-database**
 351                             (+ 5 (misc-index character))))))
 352
 353 (defun script (character)
 354   #!+sb-doc
 355   "Returns the Script property of CHARACTER as a keyword.
 356 If CHARACTER does not have a known script, returns :UNKNOWN"
 357   (svref-or-null *scripts*
 358                  (aref **character-misc-database** (+ 6 (misc-index character)))))
 359
 360 (defun char-block (character)
 361   #!+sb-doc
 362   "Returns the Unicode block in which CHARACTER resides as a keyword.
 363 If CHARACTER does not have a known block, returns :NO-BLOCK"
 364   (let* ((code (char-code character))
 365          (block-index (ordered-ranges-position code **block-ranges**)))
 366     (if block-index
 367         (aref *blocks* block-index) :no-block)))
 368
 369 (defun unicode-1-name (character)
 370   #!+sb-doc
 371   "Returns the name assigned to CHARACTER in Unicode 1.0 if it is distinct
 372 from the name currently assigned to CHARACTER. Otherwise, returns NIL.
 373 This property has been officially obsoleted by the Unicode standard, and
 374 is only included for backwards compatibility."
 375   (let* ((char-code (char-code character))
 376          (h-code (double-vector-binary-search char-code
 377                                               **unicode-1-char-name-database**)))
 378     (when h-code
 379       (huffman-decode h-code **unicode-character-name-huffman-tree**))))
 380
 381 (defun age (character)
 382   #!+sb-doc
 383   "Returns the version of Unicode in which CHARACTER was assigned as a pair
 384 of values, both integers, representing the major and minor version respectively.
 385 If CHARACTER is not assigned in Unicode, returns NIL for both values."
 386   (let* ((value (aref **character-misc-database** (+ 8 (misc-index character))))
 387          (major (ash value -3))
 388          (minor (ldb (byte 3 0) value)))
 389     (if (zerop value) (values nil nil) (values major minor))))
 390
 391 (defun hangul-syllable-type (character)
 392   #!+sb-doc
 393   "Returns the Hangul syllable type of CHARACTER.
 394 The syllable type can be one of :L, :V, :T, :LV, or :LVT.
 395 If the character is not a Hangul syllable or Jamo, returns NIL"
 396   (let ((cp (char-code character)))
 397     (cond
 398       ((or
 399         (and (<= #x1100 cp) (<= cp #x115f))
 400         (and (<= #xa960 cp) (<= cp #xa97c))) :L)
 401       ((or
 402         (and (<= #x1160 cp) (<= cp #x11a7))
 403         (and (<= #xd7B0 cp) (<= cp #xd7C6))) :V)
 404       ((or
 405         (and (<= #x11a8 cp) (<= cp #x11ff))
 406         (and (<= #xd7c8 cp) (<= cp #xd7fb))) :T)
 407       ((and (<= #xac00 cp) (<= cp #xd7a3))
 408        (if (= 0 (rem (- cp #xac00) 28)) :LV :LVT)))))
 409
 410 (defun line-break-class (character &key resolve)
 411   #!+sb-doc
 412   "Returns the line breaking class of CHARACTER, as specified in UAX #14.
 413 If :RESOLVE is NIL, returns the character class found in the property file.
 414 If :RESOLVE is non-NIL, centain line-breaking classes will be mapped to othec
 415 classes as specified in the applicable standards. Addinionally, if :RESOLVE
 416 is :EAST-ASIAN, Ambigious (class :AI) characters will be mapped to the
 417 Ideographic (:ID) class instead of Alphabetic (:AL)."
 418   (when (and resolve (listp character)) (setf character (car character)))
 419   (when (and resolve (not character)) (return-from line-break-class :nil))
 420   (let ((raw-class
 421          (svref-or-null *line-break-classes*
 422                         (aref **character-misc-database** (+ 7 (misc-index character)))))
 423         (syllable-type (hangul-syllable-type character)))
 424     (when syllable-type
 425       (setf raw-class
 426             (cdr (assoc syllable-type
 427                         '((:l . :JL) (:v . :JV) (:t . :JT)
 428                           (:lv . :H2) (:lvt . :H3))))))
 429     (when resolve
 430       (setf raw-class
 431             (case raw-class
 432               (:ai (if (eql resolve :east-asion) :ID :AL))
 433               ; If we see :CM when resolving, we have a CM that isn't subject
 434               ; to LB9, so we do LB10
 435               ((:xx :cm) :al)
 436               (:sa (if (member (general-category character) '(:Mn :Mc))
 437                        :CM :AL))
 438               (:cj :ns)
 439               (:sg (error "The character ~S is a surrogate, which should not
 440 appear in an SBCL string. The line-breaking behavior of surrogates is undefined."
 441                           character))
 442               (t raw-class))))
 443     raw-class))
 444
 445 (defun uppercase-p (character)
 446   #!+sb-doc
 447   "Returns T if CHARACTER has the Unicode property Uppercase and NIL otherwise"
 448   (or (eql (general-category character) :Lu) (proplist-p character :other-uppercase)))
 449
 450 (defun lowercase-p (character)
 451   #!+sb-doc
 452   "Returns T if CHARACTER has the Unicode property Lowercase and NIL otherwise"
 453   (or (eql (general-category character) :Ll) (proplist-p character :other-lowercase)))
 454
 455 (defun cased-p (character)
 456   #!+sb-doc
 457   "Returns T if CHARACTER has a (Unicode) case, and NIL otherwise"
 458   (or (uppercase-p character) (lowercase-p character)
 459       (eql (general-category character) :Lt)))
 460
 461 (defun case-ignorable-p (character)
 462   #!+sb-doc
 463   "Returns T if CHARACTER is Case Ignorable as defined in Unicode 6.3, Chapter
 464 3"
 465   (or (member (general-category character)
 466               '(:Mn :Me :Cf :Lm :Sk))
 467       (member (word-break-class character)
 468               '(:midletter :midnumlet :single-quote))))
 469
 470 (defun alphabetic-p (character)
 471   #!+sb-doc
 472   "Returns T if CHARACTER is Alphabetic according to the Unicode standard
 473 and NIL otherwise"
 474   (or (member (general-category character) '(:Lu :Ll :Lt :Lm :Lo :Nl))
 475       (proplist-p character :other-alphabetic)))
 476
(defun ideographic-p (character)
  #!+sb-doc
  "Returns true if CHARACTER has the Unicode property Ideographic,
which loosely corresponds to the set of \"Chinese characters\",
and NIL otherwise"
  ;; Straight lookup of the :IDEOGRAPHIC entry of the property table.
  (proplist-p character :ideographic))
 482
 483 (defun math-p (character)
 484   #!+sb-doc
 485   "Returns T if CHARACTER is a mathematical symbol according to Unicode and
 486 NIL otherwise"
 487   (or (eql (general-category character) :sm) (proplist-p character :other-math)))
 488
(defun whitespace-p (character)
  #!+sb-doc
  "Returns true if CHARACTER is whitespace according to Unicode
and NIL otherwise"
  ;; Straight lookup of the :WHITE-SPACE entry of the property table.
  (proplist-p character :white-space))
 494
 495 (defun hex-digit-p (character &key ascii)
 496   #!+sb-doc
 497   "Returns T if CHARACTER is a hexadecimal digit and NIL otherwise.
 498 If :ASCII is non-NIL, fullwidth equivalents of the Latin letters A through F
 499 are excluded."
 500   (proplist-p character (if ascii :ascii-hex-digit :hex-digit)))
 501
(defun soft-dotted-p (character)
  #!+sb-doc
  "Returns true if CHARACTER has a soft dot (such as the dots on i and j) which
disappears when accents are placed on top of it, and NIL otherwise"
  ;; Straight lookup of the :SOFT-DOTTED entry of the property table.
  (proplist-p character :soft-dotted))
 507
 508 (defun default-ignorable-p (character)
 509   #!+sb-doc
 510   "Returns T if CHARACTER is a Default_Ignorable_Code_Point"
 511   (and
 512    (or (proplist-p character :other-default-ignorable-code-point)
 513        (eql (general-category character) :cf)
 514        (proplist-p character :variation-selector))
 515    (not
 516     (or (whitespace-p character)
 517         (ordered-ranges-member
 518          (char-code character)
 519          #(#x0600 #x0604 #x06DD #x06DD #x070F #x070F #xFFF9 #xFFFB
 520            #x110BD #x110BD))))))
 521
 522 \f
 523 ;;; Implements UAX#15: Normalization Forms
 524 (declaim (inline char-decomposition-info))
 525 (defun char-decomposition-info (char)
 526   (let ((value (aref **character-misc-database**
 527                      (+ 4 (misc-index char)))))
 528     (values (clear-flag 7 value) (logbitp 7 value))))
 529
 530 (defun char-decomposition (char length callback)
 531   (declare (function callback))
 532   ;; Caller should have gotten length from char-decomposition-info
 533   (let* ((cp (char-code char))
 534          (cp-high (ash cp -8))
 535          (decompositions **character-decompositions**)
 536          (high-page (aref **character-high-pages** cp-high))
 537          (index (unless (logbitp 15 high-page) ;; Hangul syllable
 538                   (aref **character-low-pages**
 539                         (+ 1 (* 2 (+ (ldb (byte 8 0) cp) (ash high-page 8))))))))
 540     (cond ((= length 1)
 541            (funcall callback (code-char (aref decompositions index))))
 542           ((<= #xac00 cp #xd7a3)
 543            ;; see Unicode 6.2, section 3-12
 544            (let* ((sbase #xac00)
 545                   (lbase #x1100)
 546                   (vbase #x1161)
 547                   (tbase #x11a7)
 548                   (vcount 21)
 549                   (tcount 28)
 550                   (ncount (* vcount tcount))
 551                   (sindex (- cp sbase))
 552                   (lindex (floor sindex ncount))
 553                   (vindex (floor (mod sindex ncount) tcount))
 554                   (tindex (mod sindex tcount)))
 555              (funcall callback (code-char (+ lbase lindex)))
 556              (funcall callback (code-char (+ vbase vindex)))
 557              (when (> tindex 0)
 558                (funcall callback  (code-char (+ tbase tindex))))))
 559
 560           (t
 561            (loop for i below length
 562                  do
 563                  (funcall callback (code-char (aref decompositions (+ index i)))))))))
 564
 565 (defun decompose-char (char compatibility callback)
 566   (declare (function callback))
 567   (multiple-value-bind (info compat) (char-decomposition-info char)
 568     (if (and (plusp info)
 569              (or compatibility
 570                  (not compat)))
 571         (if compatibility
 572             (dx-flet ((callback (char)
 573                         (decompose-char char t callback)))
 574               (char-decomposition char info #'callback))
 575             (char-decomposition char info callback))
 576         (funcall callback char))))
 577
 578 (defun decompose-string (string compatibility filter)
 579   (let (chars
 580         (length 0)
 581         (previous-combining-class 0))
 582     (declare (type index length))
 583     (dx-flet ((callback (char)
 584                         (let ((combining-class (combining-class char)))
 585                           (incf length)
 586                           (cond ((< 0 combining-class previous-combining-class)
 587                                  ;; Ensure it's sorted
 588                                  (loop for cons on chars
 589                                        for next-char = (cadr cons)
 590                                        when (or (not next-char)
 591                                                 (<= 0 (combining-class next-char) combining-class))
 592                                        do (setf (cdr cons)
 593                                                 (cons char (cdr cons)))
 594                                           (return)))
 595                                 (t
 596                                  (push char chars)
 597                                  (setf previous-combining-class combining-class))))))
 598       (sb!kernel:with-array-data ((string string) (start) (end))
 599         (declare (ignore start))
 600         (let ((calback (if filter
 601                            (let ((filter (sb!kernel:%coerce-callable-to-fun filter)))
 602                              (lambda (char)
 603                                (when (funcall filter char)
 604                                  (callback char))))
 605                            #'callback)))
 606           (loop for i below end
 607                 for char = (schar string i)
 608                 do
 609                 (decompose-char char compatibility calback))))
 610       (let ((result (make-string length)))
 611         (loop for char in chars
 612               for i from (1- length) downto 0
 613               do (setf (schar result i) char))
 614         result))))
 615
 616 (defun composition-hangul-syllable-type (cp)
 617   (cond
 618     ((and (<= #x1100 cp) (<= cp #x1112)) :L)
 619     ((and (<= #x1161 cp) (<= cp #x1175)) :V)
 620     ((and (<= #x11a8 cp) (<= cp #x11c2)) :T)
 621     ((and (<= #xac00 cp) (<= cp #.(+ #xac00 11171)))
 622      (if (= 0 (rem (- cp #xac00) 28)) :LV :LVT))))
 623
 624 (defun primary-composition (char1 char2)
 625   (flet ((maybe (fn x) (when x (funcall fn x))))
 626     (let ((c1 (char-code char1))
 627           (c2 (char-code char2)))
 628       (maybe
 629        #'code-char
 630        (cond
 631          ((gethash (dpb c1 (byte 21 21) c2)
 632                    **character-primary-compositions**))
 633          ((and (eql (composition-hangul-syllable-type c1) :L)
 634                (eql (composition-hangul-syllable-type c2) :V))
 635           (let ((lindex (- c1 #x1100))
 636                 (vindex (- c2 #x1161)))
 637             (+ #xac00 (* lindex 588) (* vindex 28))))
 638          ((and (eql (composition-hangul-syllable-type c1) :LV)
 639                (eql (composition-hangul-syllable-type c2) :T))
 640           (+ c1 (- c2 #x11a7))))))))
 641
 642 ;;; This implements a sequence data structure, specialized for
 643 ;;; efficient deletion of characters at an index, along with tolerable
 644 ;;; random access.  The purpose is to support the canonical
 645 ;;; composition algorithm from Unicode, which involves replacing (not
 646 ;;; necessarily consecutive) pairs of code points with a single code
 647 ;;; point (e.g. [#\e #\combining_acute_accent] with
 648 ;;; #\latin_small_letter_e_with_acute).  The data structure is a list
 649 ;;; of three-element lists, each denoting a chunk of string data
 650 ;;; starting at the first index and ending at the second.
 651 ;;;
 652 ;;; Actually, the implementation isn't particularly efficient, and
 653 ;;; would probably benefit from being rewritten in terms of displaced
 654 ;;; arrays, which would substantially reduce copying.
 655 ;;;
 656 ;;; (also, generic sequences.  *sigh*.)
 657 (defun lref (lstring index)
 658   (dolist (l lstring)
 659     (when (and (<= (first l) index)
 660                (< index (second l)))
 661       (return (aref (third l) (- index (first l)))))))
 662
 663 (defun (setf lref) (newchar lstring index)
 664   (dolist (l lstring)
 665     (when (and (<= (first l) index)
 666                (< index (second l)))
 667       (return (setf (aref (third l) (- index (first l))) newchar)))))
 668
 669 (defun llength (lstring)
 670   (second (first (last lstring))))
 671
 672 (defun lstring (lstring)
 673   (let ((result (make-string (llength lstring))))
 674     (dolist (l lstring result)
 675       (replace result (third l) :start1 (first l) :end1 (second l)))))
 676
 677 (defun ldelete (lstring index)
 678   (do* ((ls lstring (cdr ls))
 679         (l (car ls) (car ls))
 680         so-fars)
 681        ((and (<= (first l) index)
 682              (< index (second l)))
 683         (append
 684          (nreverse so-fars)
 685          (cond
 686            ((= (first l) index)
 687             (list (list (first l) (1- (second l)) (subseq (third l) 1))))
 688            ((= index (1- (second l)))
 689             (list (list (first l) (1- (second l)) (subseq (third l) 0 (1- (length (third l)))))))
 690            (t
 691             (list
 692              (list (first l) index
 693                    (subseq (third l) 0 (- index (first l))))
 694              (list index (1- (second l))
 695                    (subseq (third l) (1+ (- index (first l))))))))
 696          (mapcar (lambda (x) (list (1- (first x)) (1- (second x)) (third x)))
 697                  (cdr ls))))
 698     (push l so-fars)))
 699
 700 (defun canonically-compose (string)
 701   (let* ((result (list (list 0 (length string) string)))
 702          (previous-starter-index (position 0 string :key #'combining-class))
 703          (i (and previous-starter-index (1+ previous-starter-index))))
 704     (when (or (not i) (= i (length string)))
 705       (return-from canonically-compose string))
 706     (tagbody
 707      again
 708        (when (and (>= (- i previous-starter-index) 2)
 709                   ;; test for Blocked (Unicode 3.11 para. D115)
 710                   ;;
 711                   ;; (assumes here that string has sorted combiners,
 712                   ;; so can look back just one step)
 713                   (>= (combining-class (lref result (1- i)))
 714                       (combining-class (lref result i))))
 715          (when (= (combining-class (lref result i)) 0)
 716            (setf previous-starter-index i))
 717          (incf i)
 718          (go next))
 719
 720        (let ((comp (primary-composition (lref result previous-starter-index)
 721                                         (lref result i))))
 722          (cond
 723            (comp
 724             (setf (lref result previous-starter-index) comp)
 725             (setf result (ldelete result i)))
 726            (t
 727             (when (= (combining-class (lref result i)) 0)
 728               (setf previous-starter-index i))
 729             (incf i))))
 730      next
 731        (unless (= i (llength result))
 732          (go again)))
 733     (if (= i (length string))
 734         string
 735         (lstring result))))
 736
 737 (defun normalize-string (string &optional (form :nfd)
 738                                           filter)
 739   #!+sb-doc
 740   "Normalize STRING to the Unicode normalization form form.
 741 Acceptable values for form are :NFD, :NFC, :NFKD, and :NFKC.
 742 If FILTER is a function it is called on each decomposed character and
 743 only characters for which it returns T are collected."
 744   (declare (type (member :nfd :nfkd :nfc :nfkc) form))
 745   #!-sb-unicode
 746   (declare (ignore filter))
 747   #!-sb-unicode
 748   (etypecase string
 749     ((array nil (*)) string)
 750     (string
 751      (ecase form
 752        ((:nfc :nfkc) string)
 753        ((:nfd :nfkd) (error "Cannot normalize to ~A form in #-SB-UNICODE builds" form)))))
 754   #!+sb-unicode
 755   (etypecase string
 756     (base-string string)
 757     ((array character (*))
 758      (ecase form
 759        ((:nfc)
 760         (canonically-compose (decompose-string string nil filter)))
 761        ((:nfd)
 762         (decompose-string string nil filter))
 763        ((:nfkc)
 764         (canonically-compose (decompose-string string t filter)))
 765        ((:nfkd)
 766         (decompose-string string t filter))))
 767     ((array nil (*)) string)))
 768
 769 (defun normalized-p (string &optional (form :nfd))
 770   #!+sb-doc
 771   "Tests if STRING is normalized to FORM"
 772   ;; FIXME: can be optimized
 773   (string= string (normalize-string string form)))
 774
 775 \f
 776 ;;; Unicode case algorithms
 777 ;; FIXME: Make these parts less redundant (macro?)
 778 (defparameter **special-titlecases**
 779   '#.(with-open-file (stream
 780                      (merge-pathnames
 781                       (make-pathname
 782                        :directory
 783                        '(:relative :up :up "output")
 784                        :name "titlecases" :type "lisp-expr")
 785                       sb!xc:*compile-file-truename*)
 786                      :direction :input
 787                      :element-type 'character)
 788         (read stream)))
 789
 790 (defparameter **special-casefolds**
 791   '#.(with-open-file (stream
 792                      (merge-pathnames
 793                       (make-pathname
 794                        :directory
 795                        '(:relative :up :up "output")
 796                        :name "foldcases" :type "lisp-expr")
 797                       sb!xc:*compile-file-truename*)
 798                      :direction :input
 799                      :element-type 'character)
 800         (read stream)))
 801
 802 (defun has-case-p (char)
 803   ;; Bit 6 is the Unicode case flag, as opposed to the Common Lisp one
 804   (logbitp 6 (aref **character-misc-database** (+ 5 (misc-index char)))))
 805
 806 (defun char-uppercase (char)
 807   (if (has-case-p char)
 808       (let ((cp (car (char-case-info char))))
 809         (if (atom cp) (list (code-char cp)) (mapcar #'code-char cp)))
 810       (list char)))
 811
 812 (defun char-lowercase (char)
 813   (if (has-case-p char)
 814       (let ((cp (cdr (char-case-info char))))
 815         (if (atom cp) (list (code-char cp)) (mapcar #'code-char cp)))
 816       (list char)))
 817
 818 (defun char-titlecase (char)
 819   (unless (has-case-p char) (return-from char-titlecase (list char)))
 820   (let* ((cp (char-code char))
 821          (value (assoc cp **special-titlecases**)))
 822     (if value
 823         (if (atom (cdr value))
 824             (list (code-char (cdr value)))
 825             (mapcar #'code-char (cdr value)))
 826         (char-uppercase char))))
 827
 828 (defun char-foldcase (char)
 829   (unless (has-case-p char) (return-from char-foldcase (list char)))
 830   (let* ((cp (char-code char))
 831          (value (assoc cp **special-casefolds**)))
 832     (if value
 833         (if (atom (cdr value))
 834             (list (code-char (cdr value)))
 835             (mapcar #'code-char (cdr value)))
 836         (char-lowercase char))))
 837
 838 (defun string-somethingcase (fn string special-fn)
 839   (let (result (len (length string)))
 840     (loop for index from 0 below len
 841        for char = (char string index)
 842        for cased = (or (funcall special-fn char index len)
 843                        (funcall fn char))
 844        do (loop for c in (remove :none cased) do (push c result)))
 845     (setf result (nreverse result))
 846     (coerce result 'string)))
 847
 848 (declaim (type function sb!unix::posix-getenv))
 849 (defun get-user-locale ()
 850   (let ((raw-locale
 851          #!+(or win32 unix) (or (sb!unix::posix-getenv "LC_ALL")
 852                                 (sb!unix::posix-getenv "LANG"))
 853          #!-(or win32 unix) nil))
 854     (when raw-locale
 855       (let ((lang-code (string-upcase
 856                         (subseq raw-locale 0 (position #\_ raw-locale)))))
 857         (when lang-code
 858           (intern lang-code "KEYWORD"))))))
 859
 860
 861 (defun uppercase (string &key locale)
 862   #!+sb-doc
 863   "Returns the full uppercase of STRING according to the Unicode standard.
 864 The result is not guaranteed to have the same length as the input. If :LOCALE
 865 is NIL, no language-specific case transformations are applied. If :LOCALE is a
 866 keyword representing a two-letter ISO country code, the case transforms of that
 867 locale are used. If :LOCALE is T, the user's current locale is used (Unix and
 868 Win32 only)."
 869   (when (eq locale t) (setf locale (get-user-locale)))
 870   (string-somethingcase
 871    #'char-uppercase string
 872    #!-sb-unicode (constantly nil)
 873    #!+sb-unicode ;; code-char with a constant > 255 breaks the build
 874    #'(lambda (char index len)
 875        (declare (ignore len))
 876        (cond
 877          ((and (eql locale :lt) (char= char (code-char #x0307))
 878                   (loop for i from (1- index) downto 0
 879                      for c = (char string i)
 880                      do (case (combining-class c)
 881                           (0 (return (soft-dotted-p c)))
 882                           (230 (return nil))
 883                           (t t))
 884                      finally (return nil)))
 885           '(:none))
 886          ((and (or (eql locale :tr) (eql locale :az))
 887                (char= char #\i))
 888           (list (code-char #x0130)))
 889          (t nil)))))
 890
(defun lowercase (string &key locale)
  #!+sb-doc
  "Returns the full lowercase of STRING according to the Unicode standard.
The result is not guaranteed to have the same length as the input.
:LOCALE has the same semantics as the :LOCALE argument to UPPERCASE."
  (when (eq locale t) (setf locale (get-user-locale)))
  ;; The lambda below is the SPECIAL-FN of STRING-SOMETHINGCASE: it
  ;; returns a list of replacement characters for context- or
  ;; locale-dependent cases, '(:NONE) to drop the character, or NIL to
  ;; fall back to CHAR-LOWERCASE.
  (string-somethingcase
   #'char-lowercase string
   #!-sb-unicode (constantly nil)
   #!+sb-unicode
   #'(lambda (char index len)
       (cond
         ;; U+03A3 becomes U+03C2 when, skipping case-ignorable
         ;; characters, it is preceded by a cased character and is not
         ;; followed by one.  This applies regardless of LOCALE.
         ((and (char= char (code-char #x03A3))
               (loop for i from (1- index) downto 0
                  for c = (char string i)
                  do (cond ((cased-p c) (return t))
                           ((case-ignorable-p c))
                           (t (return nil)))
                  finally (return nil))
               (loop for i from (1+ index) below len
                  for c = (char string i)
                  do (cond ((cased-p c) (return nil))
                           ((case-ignorable-p c))
                           (t (return t)))
                  finally (return t)))
          (list (code-char #x03C2)))
       ;; :LT -- the first table applies unconditionally; the second
       ;; only when a character of combining class 230 follows before
       ;; any character of combining class 0.  When neither matches,
       ;; MAPCAR over NIL yields NIL and the default mapping is used.
       ((eql locale :lt)
        (mapcar
         #'code-char
         (cdr (or
               (assoc (char-code char)
                      '((#x00CC . (#x0069 #x0307 #x0300))
                        (#x00CD . (#x0069 #x0307 #x0301))
                        (#x0128 . (#x0069 #x0307 #x0303))))
               (and (loop for i from (1+ index) below len
                       for c = (char string i)
                       do (case (combining-class c)
                            (230 (return t))
                            (0 (return nil))
                            (t t))
                       finally (return nil))
                    (assoc (char-code char)
                           '((#x0049 . (#x0069 #x0307))
                             (#x004A . (#x006A #x0307))
                             (#x012E . (#x012F #x0307)))))))))
       ;; :TR / :AZ
       ((or (eql locale :tr) (eql locale :az))
        (cond
          ;; U+0130 lowercases to plain i.
          ((char= char (code-char #x0130)) (list #\i))
          ;; U+0307 is dropped when the nearest preceding character of
          ;; combining class 0 is capital I and no class-230 character
          ;; is found first.
          ((and (char= char (code-char #x0307))
                (loop for i from (1- index) downto 0
                   for c = (char string i)
                   do (case (combining-class c)
                        (0 (return (char= c #\I)))
                        (230 (return nil))
                        (t t))
                   finally (return nil)))
           '(:none))
          ;; Capital I lowercases to U+0131 unless the first following
          ;; class-230 character (before any class-0 one) is U+0307.
          ((and (char= char #\I)
                (loop for i from (1+ index) below len
                   for c = (char string i)
                   do (case (combining-class c)
                        (0 (return t))
                        (230 (return (char/= c (code-char #x0307))))
                        (t t))
                   finally (return t)))
           (list (code-char #x0131)))
          (t nil)))
       (t nil)))))
 959
 960 (defun titlecase (string &key locale)
 961   #!+sb-doc
 962   "Returns the titlecase of STRING. The resulting string can
 963 be longer than the input.
 964 :LOCALE has the same semantics as the :LOCALE argument to UPPERCASE."
 965   (when (eq locale t) (setf locale (get-user-locale)))
 966   (let ((words (words string))
 967         (cased nil))
 968    (loop for word in words
 969       for first-cased = (or (position-if #'cased-p word) 0)
 970       for pre = (subseq word 0 first-cased)
 971       for initial = (char word first-cased)
 972       for rest = (subseq word (1+ first-cased))
 973       do (let ((up (char-titlecase initial)) (down (lowercase rest)))
 974            #!+sb-unicode
 975            (when (and (or (eql locale :tr) (eql locale :az))
 976                       (eql initial #\i))
 977              (setf up (list (code-char #x0130))))
 978            #!+sb-unicode
 979            (when (and (eql locale :lt)
 980                       (soft-dotted-p initial)
 981                       (eql (char down
 982                                  (position-if
 983                                   #'(lambda (c)
 984                                       (or (eql (combining-class c) 0)
 985                                           (eql (combining-class c) 230))) down))
 986                            (code-char #x0307)))
 987              (setf down (delete (code-char #x0307) down :count 1)))
 988            (push (concatenate 'string pre up down) cased)))
 989    (apply #'concatenate 'string (nreverse cased))))
 990
 991 (defun casefold (string)
 992   #!+sb-doc
 993   "Returns the full casefolding of STRING according to the Unicode standard.
 994 Casefolding remove case information in a way that allaws the results to be used
 995 for case-insensitive comparisons.
 996 The result is not guaranteed to have the same length as the input."
 997   (string-somethingcase #'char-foldcase string (constantly nil)))
 998
 999 \f
1000 ;;; Unicode break algorithms
1001 ;;; In all the breaking methods:
1002 ;;; (brk) establishes a break between `first` and `second`
1003 ;;; (nobrk) prevents a break between `first` and `second`
;;; Setting flag=T/state=:nobrk-next prevents a break between `second` and `third`
1005
;; Word breaking sets this to make their algorithms less tricky
(defvar *other-break-special-graphemes* nil)
(defun grapheme-break-class (char)
  #!+sb-doc
  "Returns the grapheme breaking class of CHAR, as specified in UAX #29.
Returns NIL when CHAR is NIL."
  (unless char
    (return-from grapheme-break-class nil))
  (let ((cp (char-code char))
        (gc (general-category char))
        ;; Sorted code points excluded from :SPACING-MARK below.
        (not-spacing-mark
         #(#x102B #x102C #x1038 #x1062 #x1063 #x1064 #x1067 #x1068 #x1069
           #x106A #x106B #x106C #x106D #x1083 #x1087 #x1088 #x1089 #x108A
           #x108B #x108C #x108F #x109A #x109B #x109C #x19B0 #x19B1 #x19B2
           #x19B3 #x19B4 #x19B8 #x19B9 #x19BB #x19BC #x19BD #x19BE #x19BF
           #x19C0 #x19C8 #x19C9 #x1A61 #x1A63 #x1A64 #xAA7B #xAA7D)))
    (case cp
      (10 :LF)
      (13 :CR)
      (t
       (cond
         ((or (member gc '(:Mn :Me))
              (proplist-p char :other-grapheme-extend)
              ;; While *OTHER-BREAK-SPECIAL-GRAPHEMES* is true, :Mc and
              ;; :Cf characters outside #x200B-#x200D also extend.
              (and *other-break-special-graphemes*
                   (member gc '(:Mc :Cf))
                   (not (<= #x200B cp #x200D))))
          :extend)
         ((or (member gc '(:Zl :Zp :Cc :Cs :Cf))
              ;; From Cn and Default_Ignorable_Code_Point
              (member cp '(#x2065 #xE0000))
              (<= #xFFF0 cp #xFFF8)
              (<= #xE0002 cp #xE001F)
              (<= #xE0080 cp #xE00FF)
              (<= #xE01F0 cp #xE0FFF))
          :control)
         ((<= #x1F1E6 cp #x1F1FF) :regional-indicator)
         ((and (or (eql gc :Mc)
                   (member cp '(#x0E33 #x0EB3)))
               (not (binary-search cp not-spacing-mark)))
          :spacing-mark)
         (t (hangul-syllable-type char)))))))
1040
(defun graphemes (string)
  #!+sb-doc
  "Breaks STRING into graphemes according to the default
grapheme breaking rules specified in UAX #29, returning a list of strings."
  (let* ((chars (coerce string 'list))
         (clusters '())
         ;; The cluster under construction, most recent character first.
         (cluster (list (car chars))))
    (flet ((break-between-p (c1 c2)
             ;; Decide from the classes of two adjacent characters
             ;; whether a cluster boundary falls between them.  The
             ;; clauses are tried in order.
             (cond
               ((and (eql c1 :cr) (eql c2 :lf)) nil)
               ((or (member c1 '(:control :cr :lf))
                    (member c2 '(:control :cr :lf)))
                t)
               ;; Hangul syllable sequences stay together.
               ((and (eql c1 :l) (member c2 '(:l :v :lv :lvt))) nil)
               ((and (member c1 '(:v :lv)) (member c2 '(:v :t))) nil)
               ((and (member c1 '(:lvt :t)) (eql c2 :t)) nil)
               ((and (eql c1 :regional-indicator)
                     (eql c2 :regional-indicator))
                nil)
               ((or (member c2 '(:extend :spacing-mark))
                    (eql c1 :prepend))
                nil)
               (t t))))
      ;; Walk over adjacent pairs; NEXT is NIL after the last character.
      (loop for (current next) on chars
            do (cond
                 ((break-between-p (grapheme-break-class current)
                                   (grapheme-break-class next))
                  (push (nreverse cluster) clusters)
                  (setf cluster (list next)))
                 (t
                  (push next cluster)))))
    (nreverse (mapcar #'(lambda (l) (coerce l 'string)) clusters))))
1066
(defun word-break-class (char)
  #!+sb-doc
  "Returns the word breaking class of CHAR, as specified in UAX #29.
CHAR may also be a list of characters, in which case its first element
is classified. Returns NIL when there is no character or no class applies."
  ;; Words use graphemes as characters to deal with the ignore rule
  (let ((char (if (listp char) (car char) char)))
    (unless char
      (return-from word-break-class nil))
    (let ((cp (char-code char))
          (gc (general-category char))
          ;; NEWLINES and ALSO-KATAKANA are consulted with
          ;; ORDERED-RANGES-MEMBER, the other tables with BINARY-SEARCH.
          (newlines #(#xB #xC #x0085 #x0085 #x2028 #x2029))
          (also-katakana
           #(#x3031 #x3035 #x309B #x309C
             #x30A0 #x30A0 #x30FC #x30FC
             #xFF70 #xFF70))
          (midnumlet #(#x002E #x2018 #x2019 #x2024 #xFE52 #xFF07 #xFF0E))
          (midletter
           #(#x003A #x00B7 #x002D7 #x0387 #x05F4 #x2027 #xFE13 #xFE55 #xFF1A))
          (midnum
           ;; Grepping of Line_Break = IS adjusted per UAX #29
           #(#x002C #x003B #x037E #x0589 #x060C #x060D #x066C #x07F8 #x2044
             #xFE10 #xFE14 #xFE50 #xFE54 #xFF0C #xFF1B)))
      (case cp
        (10 :LF)
        (13 :CR)
        (#x27 :single-quote)
        (#x22 :double-quote)
        (t
         (cond
           ((ordered-ranges-member cp newlines) :newline)
           ((or (eql (grapheme-break-class char) :extend)
                (eql gc :mc))
            :extend)
           ((<= #x1F1E6 cp #x1F1FF) :regional-indicator)
           ((and (eql gc :Cf) (not (<= #x200B cp #x200D))) :format)
           ((or (eql (script char) :katakana)
                (ordered-ranges-member cp also-katakana))
            :katakana)
           ((and (eql (script char) :Hebrew) (eql gc :lo)) :hebrew-letter)
           ((and (or (alphabetic-p char) (= cp #x05F3))
                 (not (or (ideographic-p char)
                          (eql (line-break-class char) :sa)
                          (eql (script char) :hiragana))))
            :aletter)
           ((binary-search cp midnumlet) :midnumlet)
           ((binary-search cp midletter) :midletter)
           ((binary-search cp midnum) :midnum)
           ((or (and (eql gc :Nd) (not (<= #xFF10 cp #xFF19))) ;Fullwidth digits
                (eql cp #x066B))
            :numeric)
           ((eql gc :Pc) :extendnumlet)
           (t nil)))))))
1111
(defmacro flatpush (thing list)
  ;; Push THING onto the place LIST.  If THING evaluates to a list, its
  ;; elements are pushed one at a time (so they end up in reverse
  ;; order at the front of LIST, and NIL pushes nothing); any other
  ;; object is pushed as a single element.  THING is evaluated once.
  (let ((value (gensym "THING"))
        (element (gensym "ELEMENT")))
    `(let ((,value ,thing))
       (cond
         ((listp ,value)
          (loop for ,element in ,value
                do (push ,element ,list)))
         (t
          (push ,value ,list))))))
1119
(defun words (string)
  #!+sb-doc
  "Breaks STRING into words acording to the default
word breaking rules specified in UAX #29. Returns a list of strings"
  ;; CHARS is the list of graphemes of STRING (computed with
  ;; *OTHER-BREAK-SPECIAL-GRAPHEMES* bound to T); a single-character
  ;; grapheme is represented by the character itself, a longer one by
  ;; a list of its characters.  WORD accumulates the current word in
  ;; reverse, WORDS the finished ones.
  (let ((chars (mapcar
                 #'(lambda (s)
                     (let ((l (coerce s 'list)))
                       (if (cdr l) l (car l))))
                 (let ((*other-break-special-graphemes* t)) (graphemes string))))
         words word flag)
    (flatpush (car chars) word)
    ;; FIRST and SECOND are adjacent graphemes; TAIL is the sublist
    ;; whose CAR is SECOND, so (CADR TAIL) is the grapheme after it.
    ;; Each iteration decides whether a word boundary falls between
    ;; FIRST and SECOND.
    (do ((first (car chars) second)
         (tail (cdr chars) (cdr tail))
         (second (cadr chars) (cadr tail)))
        ((not first) (nreverse (mapcar #'(lambda (l) (coerce l 'string)) words)))
      (flet ((brk () (push (nreverse word) words) (setf word nil) (flatpush second word))
             (nobrk () (flatpush second word)))
        (let ((c1 (word-break-class first))
              (c2 (word-break-class second))
              (c3 (when (and tail (cdr tail)) (word-break-class (cadr tail)))))
          ;; The clauses are order-dependent: the first match wins.
          (cond
            ;; FLAG was set by a three-element rule on the previous
            ;; iteration: the boundary after its middle element is
            ;; suppressed as well.
            (flag (nobrk) (setf flag nil))
            ;; CR+LF are bound together by the grapheme clustering
            ((or (eql c1 :newline) (eql c1 :cr) (eql c1 :lf)
                 (eql c2 :newline) (eql c2 :cr) (eql c2 :lf)) (brk))
            ((or (eql c2 :format) (eql c2 :extend)) (nobrk))
            ;; Letter followed by letter.
            ((and (or (eql c1 :aletter) (eql c1 :hebrew-letter))
                  (or (eql c2 :aletter) (eql c2 :hebrew-letter))) (nobrk))
            ;; Letter, mid-letter punctuation, letter.
            ((and (or (eql c1 :aletter) (eql c1 :hebrew-letter))
                  (member c2 '(:midletter :midnumlet :single-quote))
                  (or (eql c3 :aletter) (eql c3 :hebrew-letter)))
             (nobrk) (setf flag t)) ; Handle the multiple breaks from this rule
            ((and (eql c1 :hebrew-letter) (eql c2 :double-quote)
                  (eql c3 :hebrew-letter))
             (nobrk) (setf flag t))
            ((and (eql c1 :hebrew-letter) (eql c2 :single-quote)) (nobrk))
            ;; Digits adjacent to digits or letters.
            ((or (and (eql c1 :numeric) (member c2 '(:numeric :aletter :hebrew-letter)))
                 (and (eql c2 :numeric) (member c1 '(:numeric :aletter :hebrew-letter))))
             (nobrk))
            ;; Digit, mid-number punctuation, digit.
            ((and (eql c1 :numeric)
                  (member c2 '(:midnum :midnumlet :single-quote))
                  (eql c3 :numeric))
             (nobrk) (setf flag t))
            ((and (eql c1 :katakana) (eql c2 :katakana)) (nobrk))
            ;; :EXTENDNUMLET joins letters, katakana, digits and itself
            ;; on either side.
            ((or (and (member c1
                              '(:aletter :hebrew-letter :katakana
                                :numeric :extendnumlet)) (eql c2 :extendnumlet))
                 (and (member c2
                              '(:aletter :hebrew-letter :katakana
                                :numeric :extendnumlet)) (eql c1 :extendnumlet)))
             (nobrk))
            ((and (eql c1 :regional-indicator) (eql c2 :regional-indicator)) (nobrk))
            ;; Anything else is a boundary.
            (t (brk))))))))
1173
(defun sentence-break-class (char)
  #!+sb-doc
  "Returns the sentence breaking class of CHAR, as specified in UAX #29.
CHAR may also be a list of characters, in which case its first element
is classified. Returns NIL when there is no character or no class applies."
  (let ((char (if (listp char) (car char) char)))
    (unless char
      (return-from sentence-break-class nil))
    (let ((cp (char-code char))
          (gc (general-category char))
          ;; Sorted code point tables, consulted with BINARY-SEARCH.
          (aterms #(#x002E #x2024 #xFE52 #xFF0E))
          (scontinues
           #(#x002C #x002D #x003A #x055D #x060C #x060D #x07F8 #x1802 #x1808
             #x2013 #x2014 #x3001 #xFE10 #xFE11 #xFE13 #xFE31 #xFE32 #xFE50
             #xFE51 #xFE55 #xFE58 #xFE63 #xFF0C #xFF0D #xFF1A #xFF64)))
      (case cp
        (10 :LF)
        (13 :CR)
        (t
         (cond
           ((or (eql (grapheme-break-class char) :extend)
                (eql gc :mc))
            :extend)
           ((or (eql cp #x0085) (<= #x2028 cp #x2029)) :sep)
           ((and (eql gc :Cf) (not (<= #x200C cp #x200D))) :format)
           ((whitespace-p char) :sp)
           ((lowercase-p char) :lower)
           ((or (uppercase-p char) (eql gc :Lt)) :upper)
           ((or (alphabetic-p char) (member cp '(#x00A0 #x05F3))) :oletter)
           ((or (and (eql gc :Nd) (not (<= #xFF10 cp #xFF19))) ;Fullwidth digits
                (<= #x066B cp #x066C))
            :numeric)
           ((binary-search cp aterms) :aterm)
           ((binary-search cp scontinues) :scontinue)
           ((proplist-p char :sterm) :sterm)
           ((and (or (member gc '(:Po :Ps :Pe :Pf :Pi))
                     (eql (line-break-class char) :qu))
                 (not (eql cp #x05F3)))
            :close)
           (t nil)))))))
1206
(defun sentence-prebreak (string)
  #!+sb-doc
  "Pre-combines some sequences of characters to make the sentence-break
algorithm simpler.
Specifically,
- Combines any character with the following extend or format characters
- Combines CR + LF into '(CR LF)
- Combines any run of :close*:sp* into one character
Returns a list whose elements are single characters or lists of characters."
  (let ((pending '())                   ; cluster being built, reversed
        (output '())                    ; finished clusters, reversed
        (last-seen nil)
        (sp-run nil))
    (labels ((flush ()
               ;; Emit PENDING: as a list when it holds several
               ;; characters, as a bare character when it holds one.
               (cond ((cdr pending) (push (nreverse pending) output))
                     (pending (push (car pending) output)))
               (setf pending nil))
             (emit-alone (x)
               (flush)
               (push x output))
             (extend-pending (x)
               (push x pending)))
      (map nil
           (lambda (ch)
             (let ((type (sentence-break-class ch)))
               (cond
                 ((and (eql last-seen :cr) (eql type :lf))
                  (extend-pending ch)
                  (flush)
                  (setf last-seen nil))
                 ((eql last-seen :cr)
                  (emit-alone ch)
                  (setf last-seen nil))
                 ;; A CR is added to the pending cluster without
                 ;; flushing, so that a following LF can join it.
                 ((eql type :cr)
                  (extend-pending ch)
                  (setf last-seen :cr))
                 ((member type '(:lf :sep))
                  (emit-alone ch)
                  (setf last-seen nil))
                 ((and last-seen (member type '(:extend :format)))
                  (extend-pending ch))
                 ((eql type :close)
                  (unless (eql last-seen :close)
                    (flush))
                  (extend-pending ch)
                  (setf last-seen :close
                        sp-run nil))
                 ((eql type :sp)
                  ;; A space continues the cluster when it follows
                  ;; another space, or a :close run that has not yet
                  ;; been followed by a space run.
                  (unless (or (and (not sp-run) (eql last-seen :close))
                              (eql last-seen :sp))
                    (flush)
                    (setf sp-run t))
                  (extend-pending ch)
                  (setf last-seen :sp))
                 (t
                  (flush)
                  (extend-pending ch)
                  (setf last-seen type
                        sp-run nil)))))
           string)
      (flush)
      (nreverse output))))
1242
(defun sentences (string)
  #!+sb-doc
  "Breaks STRING into sentences acording to the default
sentence breaking rules specified in UAX #29"
  ;; CHARS is the pre-clustered input from SENTENCE-PREBREAK, whose
  ;; elements are characters or lists of characters.  SENTENCE
  ;; accumulates the current sentence in reverse, SENTENCES the
  ;; finished ones.  STATE carries a decision over to the next
  ;; iteration: :BRK-NEXT forces a break there, :NOBRK-NEXT forbids one.
  (let ((special-handling '(:close :sp :sep :cr :lf :scontinue :sterm :aterm))
        (chars (sentence-prebreak string))
        sentence sentences state)
    (flatpush (car chars) sentence)
    ;; FIRST, SECOND and THIRD are three consecutive elements; TAIL is
    ;; the sublist whose CAR is SECOND.  Each iteration decides whether
    ;; a sentence boundary falls between FIRST and SECOND.
    (do ((first (car chars) second)
         (tail (cdr chars) (cdr tail))
         (second (cadr chars) (cadr tail))
         (third (caddr chars) (caddr tail)))
        ((not first)
         (progn
           ; Shake off last sentence
           (when sentence (push (nreverse sentence) sentences))
           (nreverse (mapcar #'(lambda (l) (coerce l 'string)) sentences))))
      (flet ((brk () (push (nreverse sentence) sentences)
                  (setf sentence nil) (flatpush second sentence))
             (nobrk () (flatpush second sentence)))
      (let ((c1 (sentence-break-class first))
            (c2 (sentence-break-class second))
            (c3 (sentence-break-class third)))
        ;; The clauses are order-dependent: the first match wins.
        (cond
          ;; A decision deferred from the previous iteration.
          ((eql state :brk-next) (brk) (setf state nil))
          ((eql state :nobrk-next) (nobrk) (setf state nil))
          ;; Always break after a separator or line ending.
          ((member c1 '(:sep :cr :lf)) (brk))
          ((and (eql c1 :aterm) (eql c2 :numeric)) (nobrk))
          ((and (eql c1 :upper) (eql c2 :aterm)
                (eql c3 :upper)) (nobrk) (setf state :nobrk-next))
          ;; A terminator followed, possibly after one :close/:sp
          ;; element, by a continuation or another terminator.
          ((or (and (member c1 '(:sterm :aterm)) (member c2 '(:close :sp))
                    (member c3 '(:scontinue :sterm :aterm)))
               (and (member c1 '(:sterm :aterm))
                    (member c2 '(:scontinue :sterm :aterm))))
           (nobrk) (when (member c2 '(:close :sp)) (setf state :nobrk-next)))
          ((and (member c1 '(:sterm :aterm)) (member c2 '(:close :sp))
                (member c3 '(:sep :cr :lf)))
           (nobrk) (setf state :nobrk-next)) ;; Let the linebreak call (brk)
          ((and (member c1 '(:sterm :aterm)) (member c2 '(:sep :cr :lf)))
           (nobrk)) ; Doesn't trigger rule 8
          ((eql c1 :sterm) ; Not ambiguous anymore, rule 8a already handled
           (if (member c2 '(:close :sp))
               (progn (nobrk) (setf state :brk-next))
               (brk)))
          ((and (eql c2 :sterm) third (not (member c3 special-handling)))
           (nobrk) (setf state :brk-next)) ; STerm followed by nothing important
          ((or (eql c1 :aterm)
               (and (eql c2 :aterm) third
                    (not (member c3 special-handling)) (not (eql c3 :numeric))))
           ; Finally handle rule 8
           ;; Look ahead through the remaining elements: finding a
           ;; :LOWER one before any of :OLETTER, :UPPER, :SEP, :CR,
           ;; :LF, :STERM or :ATERM means no break.
           (if (loop for c in
                    (if (and third (not (or (member c3 special-handling)
                                            (eql c3 :numeric))))
                        (cdr tail) tail)
                  for type = (sentence-break-class c) do
                    (when (member type '(:oletter :upper :sep :cr :lf
                                         :sterm :aterm))
                      (return nil))
                    (when (eql type :lower) (return t)) finally (return nil))
               ; Ambiguous case
               (progn (nobrk) (setf state :nobrk-next))
               ; Otherwise
               (if (member c2 '(:close :sp :aterm))
                   (progn (nobrk) (setf state :brk-next))
                   (brk))))
          (t (nobrk))))))))
1309
;;; Groups the characters of STRING into clusters ahead of line-break
;;; annotation.  A character whose LINE-BREAK-CLASS is :CM is attached to
;;; the cluster in progress, unless the most recent non-:CM class seen is
;;; one of NIL, :BK, :CR, :LF, :NL, :SP or :ZW, in which case it starts a
;;; cluster of its own.  Returns a list in string order whose elements are
;;; either a bare character (one-element cluster) or a list of characters.
(defun line-prebreak (string)
  (let ((pending '())                   ; cluster in progress, newest first
        (groups '())                    ; finished clusters, newest first
        (base-class nil))               ; class of the last non-:CM character
    (flet ((flush ()
             ;; Single characters are stored bare, longer clusters as lists.
             (push (if (rest pending)
                       (nreverse pending)
                       (first pending))
                   groups)
             (setf pending '())))
      (dolist (ch (coerce string 'list))
        (let ((class (line-break-class ch)))
          (when (and pending
                     (or (not (eql class :cm))
                         (member base-class '(nil :BK :CR :LF :NL :SP :ZW))))
            (flush))
          (unless (eql class :cm)
            (setf base-class class))
          (push ch pending)))
      ;; The final flush is unconditional, so an empty STRING yields (NIL).
      (flush))
    (nreverse groups)))
1332
;;; Annotates STRING with line-break opportunities using the pair rules of
;;; UAX #14.  The result is a flat list that alternates break annotations
;;; and characters, beginning with an annotation:
;;;   (:cant c1 <ann> c2 <ann> c3 ...)
;;; Each annotation is :CANT (break prohibited), :CAN (break allowed) or
;;; :MUST (mandatory break) and describes the position just before the
;;; character that follows it.  Clusters produced by LINE-PREBREAK are
;;; flattened back into individual characters joined by :CANT.
;;; NOTE(review): the end-of-text rule tests for class :NIL, so this
;;; presumably relies on LINE-BREAK-CLASS returning :NIL for a NIL
;;; character when :RESOLVE is true -- confirm against its definition.
(defun line-break-annotate (string)
  (let ((chars (line-prebreak string))
        ;; FIRST/SECOND are the pair under examination and T1/T2 their
        ;; resolved classes; T0 is the class of the element before FIRST.
        first second t0 t1 t2 tail (ret (list :cant))
        state after-spaces)
    (macrolet ((cmpush (thing)
                 ;; Push THING onto RET.  A list (a cluster) is pushed
                 ;; element by element with :CANT between the elements.
                 ;; THING is evaluated exactly once.
                 (let ((gthing (gensym)))
                   `(let ((,gthing ,thing))
                      (if (listp ,gthing)
                          (loop for (c next) on ,gthing do
                               (push c ret)
                               (when next (push :cant ret)))
                          (push ,gthing ret)))))
               (between (a b action)
                 ;; When T1 matches A and T2 matches B (:ANY matches
                 ;; everything), record ACTION before SECOND and advance.
                 (let ((atest (if (eql a :any) t
                                  (if (listp a)
                                      `(member t1 ,a)
                                      `(eql t1 ,a))))
                       (btest (if (eql b :any) t
                                  (if (listp b)
                                      `(member t2 ,b)
                                      `(eql t2 ,b)))))
                 `(when (and ,atest ,btest)
                    (cmpush ,action)
                    (cmpush second)
                    (go tail))))
               (after-spaces (a b action)
                 ;; Like BETWEEN, but B is matched against the first
                 ;; element of TAIL that is not :SP.  If spaces intervene,
                 ;; ACTION is deferred (stored in the AFTER-SPACES variable)
                 ;; until the spaces have been consumed.
                 (let ((atest (if (eql a :any) t
                                  (if (listp a)
                                      `(member t1 ,a)
                                      `(eql t1 ,a))))
                       (btest (if (eql b :any) t
                                  (if (listp b)
                                      `(member type ,b)
                                      `(eql type ,b)))))
                   `(when
                        (and ,atest
                             (loop for c in tail
                                for type = (line-break-class c :resolve t)
                                do
                                  (when (not (eql type :sp))
                                    (return ,btest))))
                      (if (eql t2 :sp)
                         (progn (cmpush :cant)
                                (cmpush second)
                                (setf state :eat-spaces)
                                (setf after-spaces ,action)
                                (go tail))
                         (progn (cmpush ,action)
                                (cmpush second)
                                (go tail)))))))

      (cmpush (car chars))
      (setf first (car chars))
      (setf tail (cdr chars))
      (setf second (car tail))
      (tagbody
         top
         (when (not first) (go end))
         (setf t1 (line-break-class first :resolve t))
         (setf t2 (line-break-class second :resolve t))
         (between :any :nil :must)
         (when (and (eql state :eat-spaces) (eql t2 :sp))
            (cmpush :cant) (cmpush second) (go tail))
         (between :bk :any :must)
         (between :cr :lf :cant)
         (between '(:cr :lf :nl) :any :must)
         (between :any '(:zw :bk :cr :lf :nl) :cant)
         ;; A deferred action from an AFTER-SPACES rule takes effect here.
         (when after-spaces (cmpush after-spaces) (cmpush second)
               (setf state nil after-spaces nil) (go tail))
         (after-spaces :zw :any :can)
         (between :any :wj :cant)
         (between :wj :any :cant)
         (between :gl :any :cant)
         (between '(:ZW :WJ :SY :SG :SA :RI :QU :PR :PO :OP :NU :NS :NL
                    :LF :IS :IN :ID :HL :GL :EX :CR :CP :CM :CL :CJ :CB
                    :BK :BB :B2 :AL :AI :JL :JV :JT :H2 :H3 :XX)
                  :gl :cant)
         (between :any '(:cl :cp :ex :is :sy) :cant)
         (after-spaces :op :any :cant)
         (after-spaces :qu :op :cant)
         (after-spaces '(:cl :cp) :ns :cant)
         (after-spaces :b2 :b2 :cant)
         (between :any :sp :cant) ;; Goes here to deal with after-spaces
         (between :sp :any :can)
         (between :any :qu :cant)
         (between :qu :any :cant)
         (between :any :cb :can)
         (between :cb :any :can)
         (between :any '(:ba :hy :ns) :cant)
         (between :bb :any :cant)
         ;; LB21a: HL (HY | BA) x -- no break after a hyphen that follows a
         ;; Hebrew letter.  This has to look one element further back than
         ;; the pair rules, hence T0.  (Testing T2 for :HY at this point
         ;; could never succeed: the rule just above already consumes every
         ;; pair whose second element is :HY or :BA.)
         (when (and (eql t0 :hl) (member t1 '(:hy :ba)))
           (cmpush :cant) (cmpush second) (go tail))
         (between '(:al :hl :id :in :nu) :in :cant)
         (between :id :po :cant)
         (between '(:al :hl) :nu :cant)
         (between '(:nu :po) '(:al :hl) :cant)
         (between :pr '(:id :al :hl) :cant)
         (between '(:cl :cp :nu) '(:po :pr) :cant)
         (between :nu '(:po :pr :nu) :cant)
         (between '(:po :pr) :op :cant)
         (between '(:po :pr :hy :is :sy) :nu :cant)
         (between :jl '(:jl :jv :h2 :h3) :cant)
         (between '(:jv :h2) '(:jv :jt) :cant)
         (between '(:jt :h3) :jt :cant)
         (between '(:jl :jv :jt :h2 :h3) '(:in :po) :cant)
         (between :pr '(:jl :jv :jt :h2 :h3) :cant)
         (between '(:al :hl :is) '(:al :hl) :cant)
         (between '(:al :hl :nu) :op :cant)
         (between :cp '(:al :hl :nu) :cant)
         (between :ri :ri :cant)
         (between :any :any :can)
         tail
         ;; Every rule exits through here; remember the class of the
         ;; element we are leaving behind before sliding the window.
         (setf t0 t1)
         (setf first second)
         (setf tail (cdr tail))
         (setf second (car tail))
         (go top)
         end)
      ;; LB3 satisfied by (:any :nil) -> :must
      (setf ret (nreverse ret))
      ret)))
1454
;;; Destructively splits LIST after its first N conses.
;;; Returns two values: the first N elements (sharing structure with LIST)
;;; and the remaining tail.  When N is zero or negative nothing is cut and
;;; the values are NIL and LIST; when N is at least the length of LIST the
;;; values are LIST and NIL.
(defun break-list-at (list n)
  (if (<= n 0)
      (values nil list)
      (let ((last-kept (nthcdr (1- n) list)))
        (if (null last-kept)
            ;; LIST has fewer than N conses: there is nothing to cut off.
            (values list nil)
            (let ((remainder (cdr last-kept)))
              (setf (cdr last-kept) nil)
              (values list remainder))))))
1460
(defun lines (string &key (margin *print-right-margin*))
  #!+sb-doc
  "Breaks STRING into lines that are no wider than :MARGIN according to the
line breaking rules outlined in UAX #14. Combining marks will always be kept
together with their base characters, and spaces (but not other types of
whitespace) will be removed from the end of lines. If :MARGIN is unspecified,
it defaults to *PRINT-RIGHT-MARGIN*, or to 80 characters when that is NIL."
  (when (string= string "") (return-from lines (list "")))
  (unless margin (setf margin 80))
  (let ((current '())        ; line being filled, most recent character first
        (finished '())       ; completed lines (reversed char lists), newest first
        (width 0)            ; space-taking characters on the current line
        (since-break nil))   ; elements pushed since the last break opportunity
    (flet ((emit-line ()
             (unless since-break
               ;; No break opportunity on this line: carry over the last
               ;; space-taking character together with its combining marks.
               (setf since-break
                     (1+ (loop for c in current
                               while (eql (line-break-class c) :cm)
                               count t))))
             (multiple-value-bind (carried done)
                 (break-list-at current since-break)
               ;; DONE is reversed, so its head is the end of the line.
               (loop while (eql (line-break-class (car done)) :sp)
                     do (pop done))
               (push (nreverse done) finished)
               (setf current carried
                     width (length carried)
                     since-break nil))))
      (loop for (break-type ch) on (line-break-annotate string) by #'cddr
            while break-type
            do (push ch current)
               (ecase break-type
                 (:cant
                  (unless (eql (line-break-class ch) :CM)
                    (incf width))
                  (when since-break (incf since-break)))
                 ((:can :must)
                  (setf since-break 1)
                  (incf width)))
               (when (or (eql break-type :must)
                         (> width margin))
                 (emit-line))))
    (mapcar (lambda (line) (coerce line 'string))
            (nreverse finished))))
1509
1510 \f
1511 ;;; Collation
;;; Read at read time from output/other-collation-info.lisp-expr, located
;;; relative to the file being compiled.  VARIABLE-P uses it as the upper
;;; bound of the variable primary-weight range.
(defconstant +maximum-variable-primary-element+
  #.(let ((info-file (merge-pathnames
                      (make-pathname :directory '(:relative :up :up "output")
                                     :name "other-collation-info"
                                     :type "lisp-expr")
                      sb!xc:*compile-file-truename*)))
      (with-open-file (stream info-file
                              :direction :input
                              :element-type 'character)
        (read stream))))
1523
;;; Expands a packed collation key into a list of three-element lists.
;;; Each 32-bit element of KEY holds three fields: bits 16-31, bits 5-15
;;; and bits 0-4, returned in that order.
(defun unpack-collation-key (key)
  (declare (type (simple-array (unsigned-byte 32) (*)) key))
  (map 'list
       (lambda (packed)
         (list (ldb (byte 16 16) packed)
               (ldb (byte 11 5) packed)
               (ldb (byte 5 0) packed)))
       key))
1531
(declaim (inline variable-p))
;;; True when X lies in the range 1 .. +MAXIMUM-VARIABLE-PRIMARY-ELEMENT+
;;; (inclusive); SORT-KEY applies it to primary weights.
(defun variable-p (x)
  (and (>= x 1)
       (<= x +maximum-variable-primary-element+)))
1535
;;; Looks up the collation elements for the one to three characters of
;;; STRING between START and END.
;;; Returns a list of three-element lists as produced by
;;; UNPACK-COLLATION-KEY, or NIL when the span is not 1-3 characters long
;;; or when a multi-character span has no entry in
;;; **CHARACTER-COLLATIONS**.  A single character without a table entry
;;; gets a pair of implicit-weight elements (UTS #10, "Implicit Weights").
(defun collation-key (string start end)
  (let (char1
        (char2 (code-char 0))
        (char3 (code-char 0)))
    (case (- end start)
      (1 (setf char1 (char string start)))
      (2 (setf char1 (char string start)
               char2 (char string (+ start 1))))
      (3 (setf char1 (char string start)
               char2 (char string (+ start 1))
               char3 (char string (+ start 2))))
      (t
       ;; There are never more than three characters in a contraction, right?
       (return-from collation-key nil)))
    (let ((packed-key (gethash (pack-3-codepoints
                                (char-code char1)
                                (char-code char2)
                                (char-code char3))
                               **character-collations**)))
      (if packed-key
          (unpack-collation-key packed-key)
          ;; Implicit weights are only derived for a single character;
          ;; CHAR2 and CHAR3 still hold their NUL placeholders then.
          (when (char= (code-char 0) char2 char3)
            (let* ((cp (char-code char1))
                   (base
                     (cond ((not (proplist-p char1 :unified-ideograph))
                            #xFBC0)
                           ((or (<= #x4E00 cp #x9FFF)
                                (<= #xF900 cp #xFAFF))
                            #xFB40)
                           (t
                            #xFB80)))
                   ;; AAAA = base + (CP >> 15)
                   (a (+ base (ash cp -15)))
                   ;; BBBB = (CP & #x7FFF) | #x8000.  The mask must be 15
                   ;; bits wide so that BBBB stays a 16-bit weight; the
                   ;; higher bits of CP are already carried by AAAA.
                   (b (logior #.(ash 1 15) (logand cp #x7FFF))))
              (list (list a #x20 #x2) (list b 0 0))))))))
1570
;;; Builds the sort key of STRING as a vector of weights:
;;;   primary weights, 0, secondary weights, 0, tertiary weights, 0,
;;;   fourth-level weights.
;;; STRING is first normalized to NFD.  Collation elements are then
;;; gathered by longest match (up to three characters) through
;;; COLLATION-KEY, extended by following non-blocked combining characters
;;; where that yields a longer match.  Elements whose primary weight
;;; satisfies VARIABLE-P contribute only to the fourth level.
(defun sort-key (string)
  (let* ((str (normalize-string string :nfd))
         (i 0) (len (length str)) max-match new-i
         sort-key
         after-variable)
    (loop while (< i len)
          do
          ;; Longest contiguous match starting at I (1 to 3 characters).
          (loop for offset from 1 to 3
                for index = (+ i offset)
                while (<= index len)
                do
                (let ((key (collation-key str i index)))
                  (when key
                    (setf max-match key
                          new-i index))))
          ;; Try to extend the match with combining characters that follow
          ;; it, stopping at the first character of combining class 0.
          (loop for index from new-i below len
                for char = (char str index)
                for combining-class = (combining-class char)
                until (eql combining-class 0)
                unless (and (>= (- index new-i) 1)
                            ;; Combiners are sorted, we only have to look back
                            ;; one step (see canonically-compose)
                            (>= (combining-class (char str (1- index)))
                                combining-class))
                do
                ;; Swap the candidate next to the current match, test the
                ;; longer span, and swap back if it has no collation key.
                (rotatef (char str new-i) (char str index))
                (let ((key (collation-key str i (1+ new-i))))
                  (if key
                      (setf max-match key
                            new-i (1+ new-i))
                      (rotatef (char str new-i) (char str index)))))
          (loop for key in max-match do (push key sort-key))
          (setf i new-i))
    (macrolet ((push-non-zero (obj place)
                 `(when (/= ,obj 0)
                    (push ,obj ,place))))
      (let (primary secondary tertiary quaternary)
        (loop for (k1 k2 k3) in (nreverse sort-key)
              do
              (cond
                ;; Completely ignorable element.
                ((= k1 k2 k3 0))
                ;; Variable element: only a fourth-level weight.
                ((variable-p k1)
                 (setf after-variable t)
                 (push k1 quaternary))
                ;; Ordinary element with a primary weight.
                ((/= k1 0)
                 (setf after-variable nil)
                 (push k1 primary)
                 (push-non-zero k2 secondary)
                 (push-non-zero k3 tertiary)
                 (push #xFFFF quaternary))
                ;; Primary-ignorable element: dropped when it directly
                ;; follows a variable element.
                ((/= k3 0)
                 (unless after-variable
                   (push-non-zero k2 secondary)
                   (push k3 tertiary)
                   (push #xFFFF quaternary)))))
        (concatenate 'vector
                     (nreverse primary) #(0) (nreverse secondary) #(0)
                     (nreverse tertiary) #(0) (nreverse quaternary))))))
1630
;;; Lexicographic "less than" on two vectors of real numbers.  The first
;;; position where the elements differ decides; when one vector is a
;;; prefix of the other, the shorter one sorts first.
(defun vector< (vector1 vector2)
  (let ((len1 (length vector1))
        (len2 (length vector2))
        (diff (mismatch vector1 vector2 :test #'=)))
    (if (and diff (< diff len1) (< diff len2))
        ;; Both vectors have an element at the differing position.
        (< (aref vector1 diff) (aref vector2 diff))
        ;; No differing element within the common prefix.
        (< len1 len2))))
1639
(defun unicode= (string1 string2 &key (start1 0) end1 (start2 0) end2 (strict t))
  #!+sb-doc
  "Determines whether STRING1 and STRING2 are canonically equivalent according
to Unicode. The START and END arguments behave like the arguments to STRING=.
If :STRICT is NIL, UNICODE= tests compatibility equivalence instead."
  (let ((form (if strict :nfd :nfkd)))
    (flet ((normalized (string start end)
             ;; Normalize only the requested subsequence.
             (normalize-string (subseq string start end) form)))
      (string= (normalized string1 start1 end1)
               (normalized string2 start2 end2)))))
1648
(defun unicode-equal (string1 string2 &key (start1 0) end1 (start2 0) end2 (strict t))
  #!+sb-doc
  "Determines whether STRING1 and STRING2 are canonically equivalent after
casefolding (that is, ignoring case differences) according to Unicode. The
START and END arguments behave like the arguments to STRING=. If :STRICT is
NIL, UNICODE-EQUAL tests compatibility equivalence instead."
  (let* ((form (if strict :nfd :nfkd))
         (normal1 (normalize-string (subseq string1 start1 end1) form))
         (normal2 (normalize-string (subseq string2 start2 end2) form)))
    (flet ((folded (string)
             ;; The casefolded text is normalized again before comparing.
             (normalize-string (casefold string) form)))
      (string= (folded normal1) (folded normal2)))))
1660
(defun unicode< (string1 string2 &key (start1 0) end1 (start2 0) end2)
  #!+sb-doc
  "Determines whether STRING1 sorts before STRING2 using the Unicode Collation
Algorithm. The function uses an untailored Default Unicode Collation Element Table
to produce the sort keys. The function uses the Shifted method for dealing
with variable-weight characters, as described in UTS #10"
  (let* ((sub1 (subseq string1 start1 end1))
         (sub2 (subseq string2 start2 end2))
         (key1 (sort-key sub1))
         (key2 (sort-key sub2)))
    (cond ((not (equalp key1 key2))
           (vector< key1 key2))
          (t
           ;; Identical sort keys: fall back to comparing the NFD forms.
           (string< (normalize-string sub1 :nfd)
                    (normalize-string sub2 :nfd))))))
1673
(defun unicode<= (string1 string2 &key (start1 0) end1 (start2 0) end2)
  #!+sb-doc
  "Tests if STRING1 and STRING2 are either UNICODE< or UNICODE="
  (flet ((holds-p (predicate)
           ;; Apply PREDICATE to the same pair of bounded strings.
           (funcall predicate string1 string2
                    :start1 start1 :end1 end1
                    :start2 start2 :end2 end2)))
    (or (holds-p #'unicode=)
        (holds-p #'unicode<))))
1682
(defun unicode> (string1 string2 &key (start1 0) end1 (start2 0) end2)
  #!+sb-doc
  "Tests if STRING2 is UNICODE< STRING1."
  ;; Swap both the strings and their bounds, then reuse UNICODE<.
  (unicode< string2 string1
            :start1 start2
            :end1 end2
            :start2 start1
            :end2 end1))
1688
(defun unicode>= (string1 string2 &key (start1 0) end1 (start2 0) end2)
  #!+sb-doc
  "Tests if STRING1 and STRING2 are either UNICODE= or UNICODE>"
  (flet ((holds-p (predicate)
           ;; Apply PREDICATE to the same pair of bounded strings.
           (funcall predicate string1 string2
                    :start1 start1 :end1 end1
                    :start2 start2 :end2 end2)))
    (or (holds-p #'unicode=)
        (holds-p #'unicode>))))
1697
1698 \f
1699 ;;; Confusable detection
1700
;;; Rewrites STRING using the **CONFUSABLES** table.  At each position the
;;; longest substring of one to five characters that has a table entry is
;;; replaced by that entry; a character with no matching substring is
;;; copied through unchanged.  Returns the concatenated result as a string.
(defun canonically-deconfuse (string)
  (let ((length (length string))
        (pieces '())                    ; output fragments, newest first
        (pos 0))
    (loop while (< pos length)
          do (let ((replacement nil)
                   (next-pos (1+ pos)))
               ;; Later (longer) matches overwrite earlier ones.
               (loop for end from (1+ pos) to (min (+ pos 5) length)
                     for candidate = (gethash (subseq string pos end)
                                              **confusables**)
                     when candidate
                       do (setf replacement candidate
                                next-pos end))
               (push (or replacement (subseq string pos (1+ pos))) pieces)
               (setf pos next-pos)))
    (apply #'concatenate 'string (nreverse pieces))))
1716
(defun confusable-p (string1 string2 &key (start1 0) end1 (start2 0) end2)
  #!+sb-doc
  "Determines whether STRING1 and STRING2 could be visually confusable
according to the IDNA confusableSummary.txt table"
  (let* ((form #!+sb-unicode :nfd #!-sb-unicode :nfc)
         (normal1 (normalize-string (subseq string1 start1 end1) form))
         (normal2 (normalize-string (subseq string2 start2 end2) form)))
    (flet ((skeleton (string)
             ;; Map confusable sequences, then normalize the result again.
             (normalize-string (canonically-deconfuse string) form)))
      (string= (skeleton normal1) (skeleton normal2)))))