src/code/target-unicode.lisp

   1 ;;;; Unicode functions
   2
   3 ;;;; This software is part of the SBCL system. See the README file for
   4 ;;;; more information.
   5 ;;;;
   6 ;;;; This software is derived from the CMU CL system, which was
   7 ;;;; written at Carnegie Mellon University and released into the
   8 ;;;; public domain. The software is in the public domain and is
   9 ;;;; provided with absolutely no warranty. See the COPYING and CREDITS
  10 ;;;; files for more information.
  11
  12 (in-package "SB!UNICODE")
  13
  14 (declaim (type simple-vector **special-numerics**))
  15 (sb!impl::defglobal **special-numerics**
  16   #.(with-open-file (stream
  17                      (merge-pathnames
  18                       (make-pathname
  19                        :directory
  20                        '(:relative :up :up "output")
  21                        :name "numerics" :type "lisp-expr")
  22                       sb!xc:*compile-file-truename*)
  23                      :direction :input
  24                      :element-type 'character)
  25       (read stream)))
  26
  27
  28 (declaim (type (simple-array (unsigned-byte 32) (*)) **block-ranges**))
  29 (sb!impl::defglobal **block-ranges**
  30   #.(sb!int:!coerce-to-specialized
  31      (with-open-file (stream
  32                       (merge-pathnames
  33                        (make-pathname
  34                         :directory
  35                         '(:relative :up :up "output")
  36                         :name "blocks" :type "lisp-expr")
  37                        sb!xc:*compile-file-truename*)
  38                       :direction :input
  39                       :element-type 'character)
  40        (read stream))
  41      '(unsigned-byte 32)))
  42
  43 (macrolet ((unicode-property-init ()
  44              (let ((proplist-dump
  45                     (with-open-file (stream
  46                                      (merge-pathnames
  47                                       (make-pathname
  48                                        :directory
  49                                        '(:relative :up :up "output")
  50                                        :name "misc-properties" :type "lisp-expr")
  51                                       sb!xc:*compile-file-truename*)
  52                                      :direction :input
  53                                      :element-type 'character)
  54                       (read stream)))
  55                    (confusable-sets
  56                     (with-open-file (stream
  57                                      (merge-pathnames
  58                                       (make-pathname
  59                                        :directory
  60                                        '(:relative :up :up "output")
  61                                        :name "confusables" :type "lisp-expr")
  62                                       sb!xc:*compile-file-truename*)
  63                                      :direction :input
  64                                      :element-type 'character)
  65                       (read stream)))
  66                    (bidi-mirroring-list
  67                     (with-open-file (stream
  68                                      (merge-pathnames
  69                                       (make-pathname
  70                                        :directory
  71                                        '(:relative :up :up "output")
  72                                        :name "bidi-mirrors" :type "lisp-expr")
  73                                       sb!xc:*compile-file-truename*)
  74                                      :direction :input
  75                                      :element-type 'character)
  76                       (read stream))))
  77                `(progn
  78                   (sb!impl::defglobal **proplist-properties** ',proplist-dump)
  79                   (sb!impl::defglobal **confusables** ',confusable-sets)
  80                   (sb!impl::defglobal **bidi-mirroring-glyphs** ',bidi-mirroring-list)
  81                   (defun !unicode-properties-cold-init ()
  82                     (let ((hash (make-hash-table)) (list ',proplist-dump))
  83                       (do ((k (car list) (car list)) (v (cadr list) (cadr list)))
  84                           ((not list) hash)
  85                         (setf (gethash k hash) v)
  86                         (setf list (cddr list)))
  87                       (setf **proplist-properties** hash))
  88                     (let ((hash (make-hash-table :test #'equal)))
  89                       (loop for set in ',confusable-sets
  90                          for items = (mapcar #'(lambda (item)
  91                                                  (map 'simple-string
  92                                                       #'code-char item))
  93                                              #!+sb-unicode set
  94                                              #!-sb-unicode
  95                                              (remove-if-not
  96                                               #'(lambda (item)
  97                                                   (every
  98                                                    #'(lambda (x)
  99                                                        (< x sb!xc:char-code-limit))
 100                                                    item)) set))
 101                          do (dolist (i items)
 102                               (setf (gethash i hash) (first items))))
 103                       (setf **confusables** hash))
 104                     (let ((hash (make-hash-table)) (list ',bidi-mirroring-list))
 105                       (loop for (k v) in list do
 106                            (setf (gethash k hash) v))
 107                       (setf **bidi-mirroring-glyphs** hash)))))))
 108   (unicode-property-init))
 109
 110 ;;; Unicode property access
 111 (defun ordered-ranges-member (item vector)
 112   (declare (type simple-vector vector)
 113            (type fixnum item)
 114            (optimize speed))
 115   (labels ((recurse (start end)
 116              (declare (type index start end)
 117                       (optimize (safety 0)))
 118              (when (< start end)
 119                (let* ((i (+ start (truncate (the index (- end start)) 2)))
 120                       (index (* 2 i))
 121                       (elt1 (svref vector index))
 122                       (elt2 (svref vector (1+ index))))
 123                  (declare (type index i)
 124                           (fixnum elt1 elt2))
 125                  (cond ((< item elt1)
 126                         (recurse start i))
 127                        ((> item elt2)
 128                         (recurse (+ 1 i) end))
 129                        (t
 130                         item))))))
 131     (recurse 0 (truncate (length vector) 2))))
 132
 133 ;; Returns which range `item` was found in or NIL
 134 ;; First range = 0, second range = 1 ...
 135 (defun ordered-ranges-position (item vector)
 136   (declare (type (simple-array (unsigned-byte 32) (*)) vector)
 137            (type fixnum item))
 138   (labels ((recurse (start end)
 139              (declare (type index start end))
 140              (when (< start end)
 141                (let* ((i (+ start (truncate (- end start) 2)))
 142                       (index (* 2 i))
 143                       (elt1 (aref vector index))
 144                       (elt2 (aref vector (1+ index))))
 145                  (declare (type index i))
 146                  (cond ((< item elt1)
 147                         (recurse start i))
 148                        ((> item elt2)
 149                         (recurse (+ 1 i) end))
 150                        (t
 151                         i))))))
 152     (recurse 0 (truncate (length vector) 2))))
 153
 154 (defun proplist-p (character property)
 155   #!+sb-doc
 156   "Returns T if CHARACTER has the specified PROPERTY.
 157 PROPERTY is a keyword representing one of the properties from PropList.txt,
 158 with underscores replaced by dashes."
 159   (ordered-ranges-member (char-code character)
 160                          (gethash property **proplist-properties**)))
 161
 162 ;; WARNING: These have to be manually kept in sync with the values in ucd.lisp
 163 (declaim (type simple-vector *general-categories* *bidi-classes* *east-asian-widths*
 164                *scripts* *line-break-classes* *blocks*))
 165 (sb!impl::defglobal *general-categories*
 166   #(:Lu :Ll :Lt :Lm :Lo :Cc :Cf :Co :Cs :Cn :Mc :Me :Mn :Nd
 167     :Nl :No :Pc :Pd :Pe :Pf :Pi :Po :Ps :Sc :Sk :Sm :So :Zl
 168     :Zp :Zs))
 169
 170 (sb!impl::defglobal *bidi-classes*
 171   #(:BN :AL :AN :B :CS :EN :ES :ET :L :LRE :LRO :NSM :ON
 172     :PDF :R :RLE :RLO :S :WS :LRI :RLI :FSI :PDI))
 173
 174 (sb!impl::defglobal *east-asian-widths*
 175   #(:N :A :H :W :F :Na))
 176
 177 (sb!impl::defglobal *scripts*
 178   #(:Unknown :Common :Latin :Greek :Cyrillic :Armenian :Hebrew :Arabic :Syriac
 179     :Thaana :Devanagari :Bengali :Gurmukhi :Gujarati :Oriya :Tamil :Telugu
 180     :Kannada :Malayalam :Sinhala :Thai :Lao :Tibetan :Myanmar :Georgian :Hangul
 181     :Ethiopic :Cherokee :Canadian-Aboriginal :Ogham :Runic :Khmer :Mongolian
 182     :Hiragana :Katakana :Bopomofo :Han :Yi :Old-Italic :Gothic :Deseret
 183     :Inherited :Tagalog :Hanunoo :Buhid :Tagbanwa :Limbu :Tai-Le :Linear-B
 184     :Ugaritic :Shavian :Osmanya :Cypriot :Braille :Buginese :Coptic :New-Tai-Lue
 185     :Glagolitic :Tifinagh :Syloti-Nagri :Old-Persian :Kharoshthi :Balinese
 186     :Cuneiform :Phoenician :Phags-Pa :Nko :Sundanese :Lepcha :Ol-Chiki :Vai
 187     :Saurashtra :Kayah-Li :Rejang :Lycian :Carian :Lydian :Cham :Tai-Tham
 188     :Tai-Viet :Avestan :Egyptian-Hieroglyphs :Samaritan :Lisu :Bamum :Javanese
 189     :Meetei-Mayek :Imperial-Aramaic :Old-South-Arabian :Inscriptional-Parthian
 190     :Inscriptional-Pahlavi :Old-Turkic :Kaithi :Batak :Brahmi :Mandaic :Chakma
 191     :Meroitic-Cursive :Meroitic-Hieroglyphs :Miao :Sharada :Sora-Sompeng
 192     :Takri :Bassa-Vah :Mahajani :Pahawh-Hmong :Caucasian-Albanian :Manichaean
 193     :Palmyrene :Duployan :Mende-Kikakui :Pau-Cin-Hau :Elbasan :Modi
 194     :Psalter-Pahlavi :Grantha :Mro :Siddham :Khojki :Nabataean :Tirhuta
 195     :Khudawadi :Old-North-Arabian :Warang-Citi :Linear-A :Old-Permic))
 196
 197 (sb!impl::defglobal *line-break-classes*
 198     #(:XX :AI :AL :B2 :BA :BB :BK :CB :CJ :CL :CM :CP :CR :EX :GL
 199       :HL :HY :ID :IN :IS :LF :NL :NS :NU :OP :PO :PR :QU :RI :SA
 200       :SG :SP :SY :WJ :ZW))
 201
 202 (sb!impl::defglobal *blocks*
 203   #(:Basic-Latin :Latin-1-Supplement :Latin-Extended-A :Latin-Extended-B
 204     :IPA-Extensions :Spacing-Modifier-Letters :Combining-Diacritical-Marks
 205     :Greek-and-Coptic :Cyrillic :Cyrillic-Supplement :Armenian :Hebrew :Arabic
 206     :Syriac :Arabic-Supplement :Thaana :NKo :Samaritan :Mandaic
 207     :Arabic-Extended-A :Devanagari :Bengali :Gurmukhi :Gujarati :Oriya :Tamil
 208     :Telugu :Kannada :Malayalam :Sinhala :Thai :Lao :Tibetan :Myanmar :Georgian
 209     :Hangul-Jamo :Ethiopic :Ethiopic-Supplement :Cherokee
 210     :Unified-Canadian-Aboriginal-Syllabics :Ogham :Runic :Tagalog :Hanunoo
 211     :Buhid :Tagbanwa :Khmer :Mongolian
 212     :Unified-Canadian-Aboriginal-Syllabics-Extended :Limbu :Tai-Le :New-Tai-Lue
 213     :Khmer-Symbols :Buginese :Tai-Tham :Combining-Diacritical-Marks-Extended
 214     :Balinese :Sundanese :Batak :Lepcha :Ol-Chiki :Sundanese-Supplement
 215     :Vedic-Extensions :Phonetic-Extensions :Phonetic-Extensions-Supplement
 216     :Combining-Diacritical-Marks-Supplement :Latin-Extended-Additional
 217     :Greek-Extended :General-Punctuation :Superscripts-and-Subscripts
 218     :Currency-Symbols :Combining-Diacritical-Marks-for-Symbols
 219     :Letterlike-Symbols :Number-Forms :Arrows :Mathematical-Operators
 220     :Miscellaneous-Technical :Control-Pictures :Optical-Character-Recognition
 221     :Enclosed-Alphanumerics :Box-Drawing :Block-Elements :Geometric-Shapes
 222     :Miscellaneous-Symbols :Dingbats :Miscellaneous-Mathematical-Symbols-A
 223     :Supplemental-Arrows-A :Braille-Patterns :Supplemental-Arrows-B
 224     :Miscellaneous-Mathematical-Symbols-B :Supplemental-Mathematical-Operators
 225     :Miscellaneous-Symbols-and-Arrows :Glagolitic :Latin-Extended-C :Coptic
 226     :Georgian-Supplement :Tifinagh :Ethiopic-Extended :Cyrillic-Extended-A
 227     :Supplemental-Punctuation :CJK-Radicals-Supplement :Kangxi-Radicals
 228     :Ideographic-Description-Characters :CJK-Symbols-and-Punctuation :Hiragana
 229     :Katakana :Bopomofo :Hangul-Compatibility-Jamo :Kanbun :Bopomofo-Extended
 230     :CJK-Strokes :Katakana-Phonetic-Extensions :Enclosed-CJK-Letters-and-Months
 231     :CJK-Compatibility :CJK-Unified-Ideographs-Extension-A
 232     :Yijing-Hexagram-Symbols :CJK-Unified-Ideographs :Yi-Syllables :Yi-Radicals
 233     :Lisu :Vai :Cyrillic-Extended-B :Bamum :Modifier-Tone-Letters
 234     :Latin-Extended-D :Syloti-Nagri :Common-Indic-Number-Forms :Phags-pa
 235     :Saurashtra :Devanagari-Extended :Kayah-Li :Rejang :Hangul-Jamo-Extended-A
 236     :Javanese :Myanmar-Extended-B :Cham :Myanmar-Extended-A :Tai-Viet
 237     :Meetei-Mayek-Extensions :Ethiopic-Extended-A :Latin-Extended-E
 238     :Meetei-Mayek :Hangul-Syllables :Hangul-Jamo-Extended-B :High-Surrogates
 239     :High-Private-Use-Surrogates :Low-Surrogates :Private-Use-Area
 240     :CJK-Compatibility-Ideographs :Alphabetic-Presentation-Forms
 241     :Arabic-Presentation-Forms-A :Variation-Selectors :Vertical-Forms
 242     :Combining-Half-Marks :CJK-Compatibility-Forms :Small-Form-Variants
 243     :Arabic-Presentation-Forms-B :Halfwidth-and-Fullwidth-Forms :Specials
 244     :Linear-B-Syllabary :Linear-B-Ideograms :Aegean-Numbers
 245     :Ancient-Greek-Numbers :Ancient-Symbols :Phaistos-Disc :Lycian :Carian
 246     :Coptic-Epact-Numbers :Old-Italic :Gothic :Old-Permic :Ugaritic :Old-Persian
 247     :Deseret :Shavian :Osmanya :Elbasan :Caucasian-Albanian :Linear-A
 248     :Cypriot-Syllabary :Imperial-Aramaic :Palmyrene :Nabataean :Phoenician
 249     :Lydian :Meroitic-Hieroglyphs :Meroitic-Cursive :Kharoshthi
 250     :Old-South-Arabian :Old-North-Arabian :Manichaean :Avestan
 251     :Inscriptional-Parthian :Inscriptional-Pahlavi :Psalter-Pahlavi :Old-Turkic
 252     :Rumi-Numeral-Symbols :Brahmi :Kaithi :Sora-Sompeng :Chakma :Mahajani
 253     :Sharada :Sinhala-Archaic-Numbers :Khojki :Khudawadi :Grantha :Tirhuta
 254     :Siddham :Modi :Takri :Warang-Citi :Pau-Cin-Hau :Cuneiform
 255     :Cuneiform-Numbers-and-Punctuation :Egyptian-Hieroglyphs :Bamum-Supplement
 256     :Mro :Bassa-Vah :Pahawh-Hmong :Miao :Kana-Supplement :Duployan
 257     :Shorthand-Format-Controls :Byzantine-Musical-Symbols :Musical-Symbols
 258     :Ancient-Greek-Musical-Notation :Tai-Xuan-Jing-Symbols
 259     :Counting-Rod-Numerals :Mathematical-Alphanumeric-Symbols :Mende-Kikakui
 260     :Arabic-Mathematical-Alphabetic-Symbols :Mahjong-Tiles :Domino-Tiles
 261     :Playing-Cards :Enclosed-Alphanumeric-Supplement
 262     :Enclosed-Ideographic-Supplement :Miscellaneous-Symbols-and-Pictographs
 263     :Emoticons :Ornamental-Dingbats :Transport-and-Map-Symbols
 264     :Alchemical-Symbols :Geometric-Shapes-Extended :Supplemental-Arrows-C
 265     :CJK-Unified-Ideographs-Extension-B :CJK-Unified-Ideographs-Extension-C
 266     :CJK-Unified-Ideographs-Extension-D :CJK-Compatibility-Ideographs-Supplement
 267     :Tags :Variation-Selectors-Supplement :Supplementary-Private-Use-Area-A
 268     :Supplementary-Private-Use-Area-B))
 269
 270 (declaim (inline svref-or-null))
 271 (defun svref-or-null (vector index)
 272   (and (< index (length vector))
 273        (svref vector index)))
 274
 275 (defun general-category (character)
 276   #!+sb-doc
 277   "Returns the general category of CHARACTER as it appears in UnicodeData.txt"
 278   (svref-or-null *general-categories* (sb!impl::ucd-general-category character)))
 279
 280 (defun bidi-class (character)
 281   #!+sb-doc
 282   "Returns the bidirectional class of CHARACTER"
 283   (if (and (eql (general-category character) :Cn)
 284            (default-ignorable-p character))
 285       :Bn
 286       (svref-or-null
 287        *bidi-classes*
 288        (aref **character-misc-database** (1+ (misc-index character))))))
 289
 290 (declaim (inline combining-class))
 291 (defun combining-class (character)
 292   #!+sb-doc
 293   "Returns the canonical combining class (CCC) of CHARACTER"
 294   (aref **character-misc-database** (+ 2 (misc-index character))))
 295
 296 (defun decimal-value (character)
 297   #!+sb-doc
 298   "Returns the decimal digit value associated with CHARACTER or NIL if
 299 there is no such value.
 300
 301 The only characters in Unicode with a decimal digit value are those
 302 that are part of a range of characters that encode the digits 0-9.
 303 Because of this, `(decimal-digit c) <=> (digit-char-p c 10)` in
 304 #+sb-unicode builds"
 305   (sb!impl::ucd-decimal-digit character))
 306
 307 (defun digit-value (character)
 308   #!+sb-doc
 309   "Returns the Unicode digit value of CHARACTER or NIL if it doesn't exist.
 310
 311 Digit values are guaranteed to be integers between 0 and 9 inclusive.
 312 All characters with decimal digit values have the same digit value,
 313 but there are characters (such as digits of number systems without a 0 value)
 314 that have a digit value but no decimal digit value"
 315   (let ((%digit (clear-flag 6
 316                             (aref **character-misc-database**
 317                                   (+ 3 (misc-index character))))))
 318     (if (< %digit 10) %digit nil)))
 319
 320 (defun numeric-value (character)
 321   #!+sb-doc
 322   "Returns the numeric value of CHARACTER or NIL if there is no such value.
 323 Numeric value is the most general of the Unicode numeric properties.
 324 The only constraint on the numeric value is that it be a rational number."
 325   (or (double-vector-binary-search (char-code character)
 326                                    **special-numerics**)
 327       (digit-value character)))
 328
 329 (defun mirrored-p (character)
 330   #!+sb-doc
 331   "Returns T if CHARACTER needs to be mirrored in bidirectional text.
 332 Otherwise, returns NIL."
 333   (logbitp 5 (aref **character-misc-database**
 334                     (+ 5 (misc-index character)))))
 335
 336 (defun bidi-mirroring-glyph (character)
 337   #!+sb-doc
 338   "Returns the mirror image of CHARACTER if it exists.
 339 Otherwise, returns NIL."
 340   (when (mirrored-p character)
 341     (let ((ret (gethash (char-code character) **bidi-mirroring-glyphs**)))
 342       (when ret (code-char ret)))))
 343
 344 (defun east-asian-width (character)
 345   #!+sb-doc
 346   "Returns the East Asian Width property of CHARACTER as
 347 one of the keywords :N (Narrow), :A (Ambiguous), :H (Halfwidth),
 348 :W (Wide), :F (Fullwidth), or :NA (Not applicable)"
 349   (svref-or-null *east-asian-widths*
 350                  (ldb (byte 3 0)
 351                       (aref **character-misc-database**
 352                             (+ 5 (misc-index character))))))
 353
 354 (defun script (character)
 355   #!+sb-doc
 356   "Returns the Script property of CHARACTER as a keyword.
 357 If CHARACTER does not have a known script, returns :UNKNOWN"
 358   (svref-or-null *scripts*
 359                  (aref **character-misc-database** (+ 6 (misc-index character)))))
 360
 361 (defun char-block (character)
 362   #!+sb-doc
 363   "Returns the Unicode block in which CHARACTER resides as a keyword.
 364 If CHARACTER does not have a known block, returns :NO-BLOCK"
 365   (let* ((code (char-code character))
 366          (block-index (ordered-ranges-position code **block-ranges**)))
 367     (if block-index
 368         (aref *blocks* block-index) :no-block)))
 369
 370 (defun unicode-1-name (character)
 371   #!+sb-doc
 372   "Returns the name assigned to CHARACTER in Unicode 1.0 if it is distinct
 373 from the name currently assigned to CHARACTER. Otherwise, returns NIL.
 374 This property has been officially obsoleted by the Unicode standard, and
 375 is only included for backwards compatibility."
 376   (let* ((char-code (char-code character))
 377          (h-code (double-vector-binary-search char-code
 378                                               **unicode-1-char-name-database**)))
 379     (when h-code
 380       (huffman-decode h-code **unicode-character-name-huffman-tree**))))
 381
 382 (defun age (character)
 383   #!+sb-doc
 384   "Returns the version of Unicode in which CHARACTER was assigned as a pair
 385 of values, both integers, representing the major and minor version respectively.
 386 If CHARACTER is not assigned in Unicode, returns NIL for both values."
 387   (let* ((value (aref **character-misc-database** (+ 8 (misc-index character))))
 388          (major (ash value -3))
 389          (minor (ldb (byte 3 0) value)))
 390     (if (zerop value) (values nil nil) (values major minor))))
 391
 392 (defun hangul-syllable-type (character)
 393   #!+sb-doc
 394   "Returns the Hangul syllable type of CHARACTER.
 395 The syllable type can be one of :L, :V, :T, :LV, or :LVT.
 396 If the character is not a Hangul syllable or Jamo, returns NIL"
 397   (let ((cp (char-code character)))
 398     (cond
 399       ((or
 400         (and (<= #x1100 cp) (<= cp #x115f))
 401         (and (<= #xa960 cp) (<= cp #xa97c))) :L)
 402       ((or
 403         (and (<= #x1160 cp) (<= cp #x11a7))
 404         (and (<= #xd7B0 cp) (<= cp #xd7C6))) :V)
 405       ((or
 406         (and (<= #x11a8 cp) (<= cp #x11ff))
 407         (and (<= #xd7c8 cp) (<= cp #xd7fb))) :T)
 408       ((and (<= #xac00 cp) (<= cp #xd7a3))
 409        (if (= 0 (rem (- cp #xac00) 28)) :LV :LVT)))))
 410
 411 (defun line-break-class (character &key resolve)
 412   #!+sb-doc
 413   "Returns the line breaking class of CHARACTER, as specified in UAX #14.
 414 If :RESOLVE is NIL, returns the character class found in the property file.
 415 If :RESOLVE is non-NIL, centain line-breaking classes will be mapped to othec
 416 classes as specified in the applicable standards. Addinionally, if :RESOLVE
 417 is :EAST-ASIAN, Ambigious (class :AI) characters will be mapped to the
 418 Ideographic (:ID) class instead of Alphabetic (:AL)."
 419   (when (and resolve (listp character)) (setf character (car character)))
 420   (when (and resolve (not character)) (return-from line-break-class :nil))
 421   (let ((raw-class
 422          (svref-or-null *line-break-classes*
 423                         (aref **character-misc-database** (+ 7 (misc-index character)))))
 424         (syllable-type (hangul-syllable-type character)))
 425     (when syllable-type
 426       (setf raw-class
 427             (cdr (assoc syllable-type
 428                         '((:l . :JL) (:v . :JV) (:t . :JT)
 429                           (:lv . :H2) (:lvt . :H3))))))
 430     (when resolve
 431       (setf raw-class
 432             (case raw-class
 433               (:ai (if (eql resolve :east-asion) :ID :AL))
 434               ; If we see :CM when resolving, we have a CM that isn't subject
 435               ; to LB9, so we do LB10
 436               ((:xx :cm) :al)
 437               (:sa (if (member (general-category character) '(:Mn :Mc))
 438                        :CM :AL))
 439               (:cj :ns)
 440               (:sg (error "The character ~S is a surrogate, which should not
 441 appear in an SBCL string. The line-breaking behavior of surrogates is undefined."
 442                           character))
 443               (t raw-class))))
 444     raw-class))
 445
 446 (defun uppercase-p (character)
 447   #!+sb-doc
 448   "Returns T if CHARACTER has the Unicode property Uppercase and NIL otherwise"
 449   (or (eql (general-category character) :Lu) (proplist-p character :other-uppercase)))
 450
 451 (defun lowercase-p (character)
 452   #!+sb-doc
 453   "Returns T if CHARACTER has the Unicode property Lowercase and NIL otherwise"
 454   (or (eql (general-category character) :Ll) (proplist-p character :other-lowercase)))
 455
 456 (defun cased-p (character)
 457   #!+sb-doc
 458   "Returns T if CHARACTER has a (Unicode) case, and NIL otherwise"
 459   (or (uppercase-p character) (lowercase-p character)
 460       (eql (general-category character) :Lt)))
 461
 462 (defun case-ignorable-p (character)
 463   #!+sb-doc
 464   "Returns T if CHARACTER is Case Ignorable as defined in Unicode 6.3, Chapter
 465 3"
 466   (or (member (general-category character)
 467               '(:Mn :Me :Cf :Lm :Sk))
 468       (member (word-break-class character)
 469               '(:midletter :midnumlet :single-quote))))
 470
 471 (defun alphabetic-p (character)
 472   #!+sb-doc
 473   "Returns T if CHARACTER is Alphabetic according to the Unicode standard
 474 and NIL otherwise"
 475   (or (member (general-category character) '(:Lu :Ll :Lt :Lm :Lo :Nl))
 476       (proplist-p character :other-alphabetic)))
 477
 478 (defun ideographic-p (character)
 479   #!+sb-doc
 480   "Returns T if CHARACTER has the Unicode property Ideographic,
 481 which loosely corresponds to the set of \"Chinese characters\""
 482   (proplist-p character :ideographic))
 483
 484 (defun math-p (character)
 485   #!+sb-doc
 486   "Returns T if CHARACTER is a mathematical symbol according to Unicode and
 487 NIL otherwise"
 488   (or (eql (general-category character) :sm) (proplist-p character :other-math)))
 489
 490 (defun whitespace-p (character)
 491   #!+sb-doc
 492   "Returns T if CHARACTER is whitespace according to Unicode
 493 and NIL otherwise"
 494   (proplist-p character :white-space))
 495
 496 (defun hex-digit-p (character &key ascii)
 497   #!+sb-doc
 498   "Returns T if CHARACTER is a hexadecimal digit and NIL otherwise.
 499 If :ASCII is non-NIL, fullwidth equivalents of the Latin letters A through F
 500 are excluded."
 501   (proplist-p character (if ascii :ascii-hex-digit :hex-digit)))
 502
 503 (defun soft-dotted-p (character)
 504   #!+sb-doc
 505   "Returns T if CHARACTER has a soft dot (such as the dots on i and j) which
 506 disappears when accents are placed on top of it. and NIL otherwise"
 507   (proplist-p character :soft-dotted))
 508
 509 (defun default-ignorable-p (character)
 510   #!+sb-doc
 511   "Returns T if CHARACTER is a Default_Ignorable_Code_Point"
 512   (and
 513    (or (proplist-p character :other-default-ignorable-code-point)
 514        (eql (general-category character) :cf)
 515        (proplist-p character :variation-selector))
 516    (not
 517     (or (whitespace-p character)
 518         (ordered-ranges-member
 519          (char-code character)
 520          #(#x0600 #x0604 #x06DD #x06DD #x070F #x070F #xFFF9 #xFFFB
 521            #x110BD #x110BD))))))
 522
 523 \f
 524 ;;; Implements UAX#15: Normalization Forms
 525 (declaim (inline char-decomposition-info))
 526 (defun char-decomposition-info (char)
 527   (let ((value (aref **character-misc-database**
 528                      (+ 4 (misc-index char)))))
 529     (values (clear-flag 7 value) (logbitp 7 value))))
 530
 531 (defun char-decomposition (char length callback)
 532   (declare (function callback))
 533   ;; Caller should have gotten length from char-decomposition-info
 534   (let* ((cp (char-code char))
 535          (cp-high (ash cp -8))
 536          (decompositions **character-decompositions**)
 537          (high-page (aref **character-high-pages** cp-high))
 538          (index (unless (logbitp 15 high-page) ;; Hangul syllable
 539                   (aref **character-low-pages**
 540                         (+ 1 (* 2 (+ (ldb (byte 8 0) cp) (ash high-page 8))))))))
 541     (cond ((= length 1)
 542            (funcall callback (code-char (aref decompositions index))))
 543           ((<= #xac00 cp #xd7a3)
 544            ;; see Unicode 6.2, section 3-12
 545            (let* ((sbase #xac00)
 546                   (lbase #x1100)
 547                   (vbase #x1161)
 548                   (tbase #x11a7)
 549                   (vcount 21)
 550                   (tcount 28)
 551                   (ncount (* vcount tcount))
 552                   (sindex (- cp sbase))
 553                   (lindex (floor sindex ncount))
 554                   (vindex (floor (mod sindex ncount) tcount))
 555                   (tindex (mod sindex tcount)))
 556              (funcall callback (code-char (+ lbase lindex)))
 557              (funcall callback (code-char (+ vbase vindex)))
 558              (when (> tindex 0)
 559                (funcall callback  (code-char (+ tbase tindex))))))
 560
 561           (t
 562            (loop for i below length
 563                  do
 564                  (funcall callback (code-char (aref decompositions (+ index i)))))))))
 565
 566 (defun decompose-char (char compatibility callback)
 567   (declare (function callback))
 568   (multiple-value-bind (info compat) (char-decomposition-info char)
 569     (if (and (plusp info)
 570              (or compatibility
 571                  (not compat)))
 572         (if compatibility
 573             (dx-flet ((callback (char)
 574                         (decompose-char char t callback)))
 575               (char-decomposition char info #'callback))
 576             (char-decomposition char info callback))
 577         (funcall callback char))))
 578
 579 (defun decompose-string (string compatibility filter)
 580   (let (chars
 581         (length 0)
 582         (previous-combining-class 0))
 583     (declare (type index length))
 584     (dx-flet ((callback (char)
 585                         (let ((combining-class (combining-class char)))
 586                           (incf length)
 587                           (cond ((< 0 combining-class previous-combining-class)
 588                                  ;; Ensure it's sorted
 589                                  (loop for cons on chars
 590                                        for next-char = (cadr cons)
 591                                        when (or (not next-char)
 592                                                 (<= 0 (combining-class next-char) combining-class))
 593                                        do (setf (cdr cons)
 594                                                 (cons char (cdr cons)))
 595                                           (return)))
 596                                 (t
 597                                  (push char chars)
 598                                  (setf previous-combining-class combining-class))))))
 599       (sb!kernel:with-array-data ((string string) (start) (end))
 600         (declare (ignore start))
 601         (let ((calback (if filter
 602                            (let ((filter (sb!kernel:%coerce-callable-to-fun filter)))
 603                              (lambda (char)
 604                                (when (funcall filter char)
 605                                  (callback char))))
 606                            #'callback)))
 607           (loop for i below end
 608                 for char = (schar string i)
 609                 do
 610                 (decompose-char char compatibility calback))))
 611       (let ((result (make-string length)))
 612         (loop for char in chars
 613               for i from (1- length) downto 0
 614               do (setf (schar result i) char))
 615         result))))
 616
 617 (defun composition-hangul-syllable-type (cp)
 618   (cond
 619     ((and (<= #x1100 cp) (<= cp #x1112)) :L)
 620     ((and (<= #x1161 cp) (<= cp #x1175)) :V)
 621     ((and (<= #x11a8 cp) (<= cp #x11c2)) :T)
 622     ((and (<= #xac00 cp) (<= cp #.(+ #xac00 11171)))
 623      (if (= 0 (rem (- cp #xac00) 28)) :LV :LVT))))
 624
 625 (defun primary-composition (char1 char2)
 626   (flet ((maybe (fn x) (when x (funcall fn x))))
 627     (let ((c1 (char-code char1))
 628           (c2 (char-code char2)))
 629       (maybe
 630        #'code-char
 631        (cond
 632          ((gethash (dpb c1 (byte 21 21) c2)
 633                    **character-primary-compositions**))
 634          ((and (eql (composition-hangul-syllable-type c1) :L)
 635                (eql (composition-hangul-syllable-type c2) :V))
 636           (let ((lindex (- c1 #x1100))
 637                 (vindex (- c2 #x1161)))
 638             (+ #xac00 (* lindex 588) (* vindex 28))))
 639          ((and (eql (composition-hangul-syllable-type c1) :LV)
 640                (eql (composition-hangul-syllable-type c2) :T))
 641           (+ c1 (- c2 #x11a7))))))))
 642
 643 ;;; This implements a sequence data structure, specialized for
 644 ;;; efficient deletion of characters at an index, along with tolerable
 645 ;;; random access.  The purpose is to support the canonical
 646 ;;; composition algorithm from Unicode, which involves replacing (not
 647 ;;; necessarily consecutive) pairs of code points with a single code
 648 ;;; point (e.g. [#\e #\combining_acute_accent] with
 649 ;;; #\latin_small_letter_e_with_acute).  The data structure is a list
 650 ;;; of three-element lists, each denoting a chunk of string data
 651 ;;; starting at the first index and ending at the second.
 652 ;;;
 653 ;;; Actually, the implementation isn't particularly efficient, and
 654 ;;; would probably benefit from being rewritten in terms of displaced
 655 ;;; arrays, which would substantially reduce copying.
 656 ;;;
 657 ;;; (also, generic sequences.  *sigh*.)
 658 (defun lref (lstring index)
 659   (dolist (l lstring)
 660     (when (and (<= (first l) index)
 661                (< index (second l)))
 662       (return (aref (third l) (- index (first l)))))))
 663
 664 (defun (setf lref) (newchar lstring index)
 665   (dolist (l lstring)
 666     (when (and (<= (first l) index)
 667                (< index (second l)))
 668       (return (setf (aref (third l) (- index (first l))) newchar)))))
 669
 670 (defun llength (lstring)
 671   (second (first (last lstring))))
 672
 673 (defun lstring (lstring)
 674   (let ((result (make-string (llength lstring))))
 675     (dolist (l lstring result)
 676       (replace result (third l) :start1 (first l) :end1 (second l)))))
 677
 678 (defun ldelete (lstring index)
 679   (do* ((ls lstring (cdr ls))
 680         (l (car ls) (car ls))
 681         so-fars)
 682        ((and (<= (first l) index)
 683              (< index (second l)))
 684         (append
 685          (nreverse so-fars)
 686          (cond
 687            ((= (first l) index)
 688             (list (list (first l) (1- (second l)) (subseq (third l) 1))))
 689            ((= index (1- (second l)))
 690             (list (list (first l) (1- (second l)) (subseq (third l) 0 (1- (length (third l)))))))
 691            (t
 692             (list
 693              (list (first l) index
 694                    (subseq (third l) 0 (- index (first l))))
 695              (list index (1- (second l))
 696                    (subseq (third l) (1+ (- index (first l))))))))
 697          (mapcar (lambda (x) (list (1- (first x)) (1- (second x)) (third x)))
 698                  (cdr ls))))
 699     (push l so-fars)))
 700
 701 (defun canonically-compose (string)
 702   (let* ((result (list (list 0 (length string) string)))
 703          (previous-starter-index (position 0 string :key #'combining-class))
 704          (i (and previous-starter-index (1+ previous-starter-index))))
 705     (when (or (not i) (= i (length string)))
 706       (return-from canonically-compose string))
 707     (tagbody
 708      again
 709        (when (and (>= (- i previous-starter-index) 2)
 710                   ;; test for Blocked (Unicode 3.11 para. D115)
 711                   ;;
 712                   ;; (assumes here that string has sorted combiners,
 713                   ;; so can look back just one step)
 714                   (>= (combining-class (lref result (1- i)))
 715                       (combining-class (lref result i))))
 716          (when (= (combining-class (lref result i)) 0)
 717            (setf previous-starter-index i))
 718          (incf i)
 719          (go next))
 720
 721        (let ((comp (primary-composition (lref result previous-starter-index)
 722                                         (lref result i))))
 723          (cond
 724            (comp
 725             (setf (lref result previous-starter-index) comp)
 726             (setf result (ldelete result i)))
 727            (t
 728             (when (= (combining-class (lref result i)) 0)
 729               (setf previous-starter-index i))
 730             (incf i))))
 731      next
 732        (unless (= i (llength result))
 733          (go again)))
 734     (if (= i (length string))
 735         string
 736         (lstring result))))
 737
 738 (defun normalize-string (string &optional (form :nfd)
 739                                           filter)
 740   #!+sb-doc
 741   "Normalize STRING to the Unicode normalization form form.
 742 Acceptable values for form are :NFD, :NFC, :NFKD, and :NFKC.
 743 If FILTER is a function it is called on each decomposed character and
 744 only characters for which it returns T are collected."
 745   (declare (type (member :nfd :nfkd :nfc :nfkc) form))
 746   #!-sb-unicode
 747   (declare (ignore filter))
 748   #!-sb-unicode
 749   (etypecase string
 750     ((array nil (*)) string)
 751     (string
 752      (ecase form
 753        ((:nfc :nfkc) string)
 754        ((:nfd :nfkd) (error "Cannot normalize to ~A form in #-SB-UNICODE builds" form)))))
 755   #!+sb-unicode
 756   (etypecase string
 757     (base-string string)
 758     ((array character (*))
 759      (ecase form
 760        ((:nfc)
 761         (canonically-compose (decompose-string string nil filter)))
 762        ((:nfd)
 763         (decompose-string string nil filter))
 764        ((:nfkc)
 765         (canonically-compose (decompose-string string t filter)))
 766        ((:nfkd)
 767         (decompose-string string t filter))))
 768     ((array nil (*)) string)))
 769
 770 (defun normalized-p (string &optional (form :nfd))
 771   #!+sb-doc
 772   "Tests if STRING is normalized to FORM"
 773   ;; FIXME: can be optimized
 774   (string= string (normalize-string string form)))
 775
 776 \f
 777 ;;; Unicode case algorithms
 778 ;; FIXME: Make these parts less redundant (macro?)
 779 (defparameter **special-titlecases**
 780   '#.(with-open-file (stream
 781                      (merge-pathnames
 782                       (make-pathname
 783                        :directory
 784                        '(:relative :up :up "output")
 785                        :name "titlecases" :type "lisp-expr")
 786                       sb!xc:*compile-file-truename*)
 787                      :direction :input
 788                      :element-type 'character)
 789         (read stream)))
 790
 791 (defparameter **special-casefolds**
 792   '#.(with-open-file (stream
 793                      (merge-pathnames
 794                       (make-pathname
 795                        :directory
 796                        '(:relative :up :up "output")
 797                        :name "foldcases" :type "lisp-expr")
 798                       sb!xc:*compile-file-truename*)
 799                      :direction :input
 800                      :element-type 'character)
 801         (read stream)))
 802
 803 (defun has-case-p (char)
 804   ;; Bit 6 is the Unicode case flag, as opposed to the Common Lisp one
 805   (logbitp 6 (aref **character-misc-database** (+ 5 (misc-index char)))))
 806
 807 (defun char-uppercase (char)
 808   (if (has-case-p char)
 809       (let ((cp (car (char-case-info char))))
 810         (if (atom cp) (list (code-char cp)) (mapcar #'code-char cp)))
 811       (list char)))
 812
 813 (defun char-lowercase (char)
 814   (if (has-case-p char)
 815       (let ((cp (cdr (char-case-info char))))
 816         (if (atom cp) (list (code-char cp)) (mapcar #'code-char cp)))
 817       (list char)))
 818
 819 (defun char-titlecase (char)
 820   (unless (has-case-p char) (return-from char-titlecase (list char)))
 821   (let* ((cp (char-code char))
 822          (value (assoc cp **special-titlecases**)))
 823     (if value
 824         (if (atom (cdr value))
 825             (list (code-char (cdr value)))
 826             (mapcar #'code-char (cdr value)))
 827         (char-uppercase char))))
 828
 829 (defun char-foldcase (char)
 830   (unless (has-case-p char) (return-from char-foldcase (list char)))
 831   (let* ((cp (char-code char))
 832          (value (assoc cp **special-casefolds**)))
 833     (if value
 834         (if (atom (cdr value))
 835             (list (code-char (cdr value)))
 836             (mapcar #'code-char (cdr value)))
 837         (char-lowercase char))))
 838
 839 (defun string-somethingcase (fn string special-fn)
 840   (let (result (len (length string)))
 841     (loop for index from 0 below len
 842        for char = (char string index)
 843        for cased = (or (funcall special-fn char index len)
 844                        (funcall fn char))
 845        do (loop for c in (remove :none cased) do (push c result)))
 846     (setf result (nreverse result))
 847     (coerce result 'string)))
 848
 849 (declaim (type function sb!unix::posix-getenv))
 850 (defun get-user-locale ()
 851   (let ((raw-locale
 852          #!+(or win32 unix) (or (sb!unix::posix-getenv "LC_ALL")
 853                                 (sb!unix::posix-getenv "LANG"))
 854          #!-(or win32 unix) nil))
 855     (when raw-locale
 856       (let ((lang-code (string-upcase
 857                         (subseq raw-locale 0 (position #\_ raw-locale)))))
 858         (when lang-code
 859           (intern lang-code "KEYWORD"))))))
 860
 861
 862 (defun uppercase (string &key locale)
 863   #!+sb-doc
 864   "Returns the full uppercase of STRING according to the Unicode standard.
 865 The result is not guaranteed to have the same length as the input. If :LOCALE
 866 is NIL, no language-specific case transformations are applied. If :LOCALE is a
 867 keyword representing a two-letter ISO country code, the case transforms of that
 868 locale are used. If :LOCALE is T, the user's current locale is used (Unix and
 869 Win32 only)."
 870   (when (eq locale t) (setf locale (get-user-locale)))
 871   (string-somethingcase
 872    #'char-uppercase string
 873    #!-sb-unicode (constantly nil)
 874    #!+sb-unicode ;; code-char with a constant > 255 breaks the build
 875    #'(lambda (char index len)
 876        (declare (ignore len))
 877        (cond
 878          ((and (eql locale :lt) (char= char (code-char #x0307))
 879                   (loop for i from (1- index) downto 0
 880                      for c = (char string i)
 881                      do (case (combining-class c)
 882                           (0 (return (soft-dotted-p c)))
 883                           (230 (return nil))
 884                           (t t))
 885                      finally (return nil)))
 886           '(:none))
 887          ((and (or (eql locale :tr) (eql locale :az))
 888                (char= char #\i))
 889           (list (code-char #x0130)))
 890          (t nil)))))
 891
 892 (defun lowercase (string &key locale)
 893   #!+sb-doc
 894   "Returns the full lowercase of STRING according to the Unicode standard.
 895 The result is not guaranteed to have the same length as the input.
 896 :LOCALE has the same semantics as the :LOCALE argument to UPPERCASE."
 897   (when (eq locale t) (setf locale (get-user-locale)))
 898   (string-somethingcase
 899    #'char-lowercase string
 900    #!-sb-unicode (constantly nil)
 901    #!+sb-unicode
 902    #'(lambda (char index len)
 903        (cond
 904          ((and (char= char (code-char #x03A3))
 905                (loop for i from (1- index) downto 0
 906                   for c = (char string i)
 907                   do (cond ((cased-p c) (return t))
 908                            ((case-ignorable-p c))
 909                            (t (return nil)))
 910                   finally (return nil))
 911                (loop for i from (1+ index) below len
 912                   for c = (char string i)
 913                   do (cond ((cased-p c) (return nil))
 914                            ((case-ignorable-p c))
 915                            (t (return t)))
 916                   finally (return t)))
 917           (list (code-char #x03C2)))
 918        ((eql locale :lt)
 919         (mapcar
 920          #'code-char
 921          (cdr (or
 922                (assoc (char-code char)
 923                       '((#x00CC . (#x0069 #x0307 #x0300))
 924                         (#x00CD . (#x0069 #x0307 #x0301))
 925                         (#x0128 . (#x0069 #x0307 #x0303))))
 926                (and (loop for i from (1+ index) below len
 927                        for c = (char string i)
 928                        do (case (combining-class c)
 929                             (230 (return t))
 930                             (0 (return nil))
 931                             (t t))
 932                        finally (return nil))
 933                     (assoc (char-code char)
 934                            '((#x0049 . (#x0069 #x0307))
 935                              (#x004A . (#x006A #x0307))
 936                              (#x012E . (#x012F #x0307)))))))))
 937        ((or (eql locale :tr) (eql locale :az))
 938         (cond
 939           ((char= char (code-char #x0130)) (list #\i))
 940           ((and (char= char (code-char #x0307))
 941                 (loop for i from (1- index) downto 0
 942                    for c = (char string i)
 943                    do (case (combining-class c)
 944                         (0 (return (char= c #\I)))
 945                         (230 (return nil))
 946                         (t t))
 947                    finally (return nil)))
 948            '(:none))
 949           ((and (char= char #\I)
 950                 (loop for i from (1+ index) below len
 951                    for c = (char string i)
 952                    do (case (combining-class c)
 953                         (0 (return t))
 954                         (230 (return (char/= c (code-char #x0307))))
 955                         (t t))
 956                    finally (return t)))
 957            (list (code-char #x0131)))
 958           (t nil)))
 959        (t nil)))))
 960
 961 (defun titlecase (string &key locale)
 962   #!+sb-doc
 963   "Returns the titlecase of STRING. The resulting string can
 964 be longer than the input.
 965 :LOCALE has the same semantics as the :LOCALE argument to UPPERCASE."
 966   (when (eq locale t) (setf locale (get-user-locale)))
 967   (let ((words (words string))
 968         (cased nil))
 969    (loop for word in words
 970       for first-cased = (or (position-if #'cased-p word) 0)
 971       for pre = (subseq word 0 first-cased)
 972       for initial = (char word first-cased)
 973       for rest = (subseq word (1+ first-cased))
 974       do (let ((up (char-titlecase initial)) (down (lowercase rest)))
 975            #!+sb-unicode
 976            (when (and (or (eql locale :tr) (eql locale :az))
 977                       (eql initial #\i))
 978              (setf up (list (code-char #x0130))))
 979            #!+sb-unicode
 980            (when (and (eql locale :lt)
 981                       (soft-dotted-p initial)
 982                       (eql (char down
 983                                  (position-if
 984                                   #'(lambda (c)
 985                                       (or (eql (combining-class c) 0)
 986                                           (eql (combining-class c) 230))) down))
 987                            (code-char #x0307)))
 988              (setf down (delete (code-char #x0307) down :count 1)))
 989            (push (concatenate 'string pre up down) cased)))
 990    (apply #'concatenate 'string (nreverse cased))))
 991
 992 (defun casefold (string)
 993   #!+sb-doc
 994   "Returns the full casefolding of STRING according to the Unicode standard.
 995 Casefolding removes case information in a way that allows the results to be used
 996 for case-insensitive comparisons.
 997 The result is not guaranteed to have the same length as the input."
 998   (string-somethingcase #'char-foldcase string (constantly nil)))
 999
1000 \f
1001 ;;; Unicode break algorithms
1002 ;;; In all the breaking methods:
1003 ;;; (brk) establishes a break between `first` and `second`
1004 ;;; (nobrk) prevents a break between `first` and `second`
;;; Setting flag=T/state=:nobrk-next prevents a break between `second` and `third`
1006
1007 ;; Word breaking sets this to make their algorithms less tricky
(defvar *other-break-special-graphemes* nil)
(defun grapheme-break-class (char)
  #!+sb-doc
  "Returns the grapheme breaking class of CHAR, as specified in UAX #29.
Returns NIL when CHAR is NIL."
  (when char
    (let ((cp (char-code char))
          (gc (general-category char))
          ;; Code points excluded from the :SPACING-MARK class below.
          (not-spacing-mark
           #(#x102B #x102C #x1038 #x1062 #x1063 #x1064 #x1067 #x1068 #x1069
             #x106A #x106B #x106C #x106D #x1083 #x1087 #x1088 #x1089 #x108A
             #x108B #x108C #x108F #x109A #x109B #x109C #x19B0 #x19B1 #x19B2
             #x19B3 #x19B4 #x19B8 #x19B9 #x19BB #x19BC #x19BD #x19BE #x19BF
             #x19C0 #x19C8 #x19C9 #x1A61 #x1A63 #x1A64 #xAA7B #xAA7D)))
      (cond
        ((= cp 10) :LF)
        ((= cp 13) :CR)
        ((or (member gc '(:Mn :Me))
             (proplist-p char :other-grapheme-extend)
             ;; While *OTHER-BREAK-SPECIAL-GRAPHEMES* is true, :Mc and
             ;; :Cf characters outside #x200B..#x200D also extend.
             (and *other-break-special-graphemes*
                  (member gc '(:Mc :Cf))
                  (not (<= #x200B cp #x200D))))
         :extend)
        ((or (member gc '(:Zl :Zp :Cc :Cs :Cf))
             ;; From Cn and Default_Ignorable_Code_Point
             (eql cp #x2065)
             (eql cp #xE0000)
             (<= #xFFF0 cp #xFFF8)
             (<= #xE0002 cp #xE001F)
             (<= #xE0080 cp #xE00FF)
             (<= #xE01F0 cp #xE0FFF))
         :control)
        ((<= #x1F1E6 cp #x1F1FF)
         :regional-indicator)
        ((and (or (eql gc :Mc)
                  (eql cp #x0E33)
                  (eql cp #x0EB3))
              (not (binary-search cp not-spacing-mark)))
         :spacing-mark)
        (t
         (hangul-syllable-type char))))))
1041
;;; Defines the two boundary mappers.  Each calls FUNCTION with STRING and
;;; the START and END indices of every grapheme cluster in turn and
;;; returns NIL; FUNCTION is not called for an empty string.  The extended
;;; variant additionally keeps a :SPACING-MARK attached to what precedes
;;; it and a :PREPEND attached to what follows it.
(macrolet ((define-boundary-mapper (name extendedp)
             `(defun ,name (function string)
                (do ((length (length string))
                     (start 0)
                     (end 1 (1+ end))
                     (before nil)
                     (after (and (> (length string) 0)
                                 (grapheme-break-class (char string 0)))))
                    ((>= end length)
                     ;; Report the final cluster (END overshoots only for
                     ;; the empty string).
                     (if (= end length)
                         (progn (funcall function string start end) nil)))
                  (flet ((emit ()
                           (funcall function string start end)
                           (setf start end)))
                    (declare (truly-dynamic-extent #'emit))
                    ;; BEFORE/AFTER are the classes on either side of the
                    ;; candidate boundary at END.
                    (shiftf before after
                            (grapheme-break-class (char string end)))
                    (cond
                      ;; Never split CR LF.
                      ((and (eql before :cr) (eql after :lf)))
                      ;; Otherwise always break around controls and
                      ;; line endings.
                      ((or (member before '(:control :cr :lf))
                           (member after '(:control :cr :lf)))
                       (emit))
                      ;; Break unless one of the joining rules applies.
                      ((not (or
                             ;; Hangul syllable sequences
                             (and (eql before :l)
                                  (member after '(:l :v :lv :lvt)))
                             (and (or (eql before :v) (eql before :lv))
                                  (or (eql after :v) (eql after :t)))
                             (and (eql after :t)
                                  (or (eql before :lvt) (eql before :t)))
                             ;; Regional indicator pairs
                             (and (eql before :regional-indicator)
                                  (eql after :regional-indicator))
                             (eql after :extend)
                             ,@(when extendedp
                                 '((eql after :spacing-mark)
                                   (eql before :prepend)))))
                       (emit)))))))))
  (define-boundary-mapper map-legacy-grapheme-boundaries nil)
  (define-boundary-mapper map-grapheme-boundaries t))
1070
;;; Defines the grapheme mappers on top of the boundary mappers.  FUNCTION
;;; receives each grapheme cluster as a string displaced into STRING.
;;; One adjustable array is re-displaced for every cluster, so FUNCTION
;;; has to copy the argument if it wants to keep it.
(macrolet ((define-grapheme-mapper (name boundary-mapper)
             `(defun ,name (function string)
                (let ((window (make-array 0
                                          :element-type (array-element-type string)
                                          :adjustable t
                                          :displaced-to string)))
                  (flet ((visit (string start end)
                           (declare (type string string))
                           (funcall function
                                    (adjust-array window (- end start)
                                                  :displaced-to string
                                                  :displaced-index-offset start))))
                    (declare (truly-dynamic-extent #'visit))
                    (,boundary-mapper #'visit string))))))
  (define-grapheme-mapper map-legacy-graphemes map-legacy-grapheme-boundaries)
  (define-grapheme-mapper map-graphemes map-grapheme-boundaries))
1081
(defun graphemes (string)
  #!+sb-doc
  "Breaks STRING into graphemes according to the default
grapheme breaking rules specified in UAX #29, returning a list of strings."
  (let ((pieces '()))
    (flet ((keep (grapheme)
             ;; MAP-GRAPHEMES reuses one displaced array, so take a copy.
             (push (subseq grapheme 0) pieces)))
      (map-graphemes #'keep string))
    (nreverse pieces)))
1089
(defun word-break-class (char)
  #!+sb-doc
  "Returns the word breaking class of CHAR, as specified in UAX #29.
CHAR may also be a list of characters, in which case its first element is
classified. Returns NIL for NIL and for characters with no specific class."
  ;; Words use graphemes as characters to deal with the ignore rule
  (let ((char (if (listp char) (car char) char)))
    (when char
      (let ((cp (char-code char))
            (gc (general-category char))
            ;; NEWLINES and ALSO-KATAKANA hold pairs of range bounds for
            ;; ORDERED-RANGES-MEMBER; the MID* vectors are sorted code
            ;; points for BINARY-SEARCH.
            (newlines #(#xB #xC #x0085 #x0085 #x2028 #x2029))
            (also-katakana
             #(#x3031 #x3035 #x309B #x309C
               #x30A0 #x30A0 #x30FC #x30FC
               #xFF70 #xFF70))
            (midnumlet #(#x002E #x2018 #x2019 #x2024 #xFE52 #xFF07 #xFF0E))
            (midletter
             #(#x003A #x00B7 #x002D7 #x0387 #x05F4 #x2027 #xFE13 #xFE55 #xFF1A))
            (midnum
             ;; Grepping of Line_Break = IS adjusted per UAX #29
             #(#x002C #x003B #x037E #x0589 #x060C #x060D #x066C #x07F8 #x2044
               #xFE10 #xFE14 #xFE50 #xFE54 #xFF0C #xFF1B)))
        (cond
          ((= cp 10) :LF)
          ((= cp 13) :CR)
          ((= cp #x27) :single-quote)
          ((= cp #x22) :double-quote)
          ((ordered-ranges-member cp newlines) :newline)
          ((or (eql (grapheme-break-class char) :extend)
               (eql gc :mc))
           :extend)
          ((<= #x1F1E6 cp #x1F1FF) :regional-indicator)
          ((and (eql gc :Cf)
                (not (<= #x200B cp #x200D)))
           :format)
          ((or (eql (script char) :katakana)
               (ordered-ranges-member cp also-katakana))
           :katakana)
          ((and (eql (script char) :Hebrew)
                (eql gc :lo))
           :hebrew-letter)
          ((and (or (alphabetic-p char) (= cp #x05F3))
                (not (or (ideographic-p char)
                         (eql (line-break-class char) :sa)
                         (eql (script char) :hiragana))))
           :aletter)
          ((binary-search cp midnumlet) :midnumlet)
          ((binary-search cp midletter) :midletter)
          ((binary-search cp midnum) :midnum)
          ((or (and (eql gc :Nd)
                    (not (<= #xFF10 cp #xFF19))) ;Fullwidth digits
               (eql cp #x066B))
           :numeric)
          ((eql gc :Pc) :extendnumlet)
          (t nil))))))
1134
;;; Push THING onto the place LIST.  If THING is a list, its elements are
;;; pushed one at a time in order (so NIL pushes nothing); otherwise THING
;;; itself is pushed.  THING is evaluated once.
(defmacro flatpush (thing list)
  (let ((value (gensym "THING"))
        (element (gensym "ELEMENT")))
    `(let ((,value ,thing))
       (cond ((listp ,value)
              (dolist (,element ,value)
                (push ,element ,list)))
             (t
              (push ,value ,list))))))
1142
(defun words (string)
  #!+sb-doc
  "Breaks STRING into words according to the default
word breaking rules specified in UAX #29. Returns a list of strings"
  ;; UNITS holds one entry per grapheme of STRING: a bare character for a
  ;; single-character grapheme, a list of characters otherwise.
  (let ((units (mapcar
                (lambda (grapheme)
                  (let ((chars (coerce grapheme 'list)))
                    (if (cdr chars) chars (car chars))))
                (let ((*other-break-special-graphemes* t))
                  (graphemes string))))
        (finished '())
        (word '())
        (flag nil))
    (flatpush (car units) word)
    ;; TAIL always starts at UPCOMING, so (CADR TAIL) is the unit after it.
    (do ((current (car units) upcoming)
         (tail (cdr units) (cdr tail))
         (upcoming (cadr units) (cadr tail)))
        ((not current)
         (nreverse (mapcar (lambda (chars) (coerce chars 'string)) finished)))
      (flet ((brk ()
               ;; Close the current word and start the next with UPCOMING.
               (push (nreverse word) finished)
               (setf word nil)
               (flatpush upcoming word))
             (nobrk ()
               (flatpush upcoming word))
             (letter-p (class)
               (or (eql class :aletter) (eql class :hebrew-letter)))
             (line-end-p (class)
               (or (eql class :newline) (eql class :cr) (eql class :lf)))
             (numlet-joinable-p (class)
               (member class '(:aletter :hebrew-letter :katakana
                               :numeric :extendnumlet))))
        (let ((c1 (word-break-class current))
              (c2 (word-break-class upcoming))
              (c3 (when (cdr tail)
                    (word-break-class (cadr tail)))))
          (cond
            ;; A three-unit rule fired on the previous step.
            (flag (nobrk) (setf flag nil))
            ;; CR+LF are bound together by the grapheme clustering
            ((or (line-end-p c1) (line-end-p c2)) (brk))
            ((or (eql c2 :format) (eql c2 :extend)) (nobrk))
            ((and (letter-p c1) (letter-p c2)) (nobrk))
            ((and (letter-p c1)
                  (member c2 '(:midletter :midnumlet :single-quote))
                  (letter-p c3))
             ;; Handle the multiple breaks from this rule
             (nobrk) (setf flag t))
            ((and (eql c1 :hebrew-letter)
                  (eql c2 :double-quote)
                  (eql c3 :hebrew-letter))
             (nobrk) (setf flag t))
            ((and (eql c1 :hebrew-letter) (eql c2 :single-quote)) (nobrk))
            ((or (and (eql c1 :numeric)
                      (member c2 '(:numeric :aletter :hebrew-letter)))
                 (and (eql c2 :numeric)
                      (member c1 '(:numeric :aletter :hebrew-letter))))
             (nobrk))
            ((and (eql c1 :numeric)
                  (member c2 '(:midnum :midnumlet :single-quote))
                  (eql c3 :numeric))
             (nobrk) (setf flag t))
            ((and (eql c1 :katakana) (eql c2 :katakana)) (nobrk))
            ((or (and (numlet-joinable-p c1) (eql c2 :extendnumlet))
                 (and (numlet-joinable-p c2) (eql c1 :extendnumlet)))
             (nobrk))
            ((and (eql c1 :regional-indicator)
                  (eql c2 :regional-indicator))
             (nobrk))
            (t (brk))))))))
1196
(defun sentence-break-class (char)
  #!+sb-doc
  "Returns the sentence breaking class of CHAR, as specified in UAX #29.
CHAR may also be a list of characters, in which case its first element is
classified. Returns NIL for NIL and for characters with no specific class."
  (let ((char (if (listp char) (car char) char)))
    (when char
      (let ((cp (char-code char))
            (gc (general-category char))
            ;; Sorted code point vectors for BINARY-SEARCH.
            (aterms #(#x002E #x2024 #xFE52 #xFF0E))
            (scontinues
             #(#x002C #x002D #x003A #x055D #x060C #x060D #x07F8 #x1802 #x1808
               #x2013 #x2014 #x3001 #xFE10 #xFE11 #xFE13 #xFE31 #xFE32 #xFE50
               #xFE51 #xFE55 #xFE58 #xFE63 #xFF0C #xFF0D #xFF1A #xFF64)))
        (cond
          ((= cp 10) :LF)
          ((= cp 13) :CR)
          ((or (eql (grapheme-break-class char) :extend)
               (eql gc :mc))
           :extend)
          ((or (eql cp #x0085)
               (<= #x2028 cp #x2029))
           :sep)
          ((and (eql gc :Cf)
                (not (<= #x200C cp #x200D)))
           :format)
          ((whitespace-p char) :sp)
          ((lowercase-p char) :lower)
          ((or (uppercase-p char) (eql gc :Lt)) :upper)
          ((or (alphabetic-p char)
               (eql cp #x00A0)
               (eql cp #x05F3))
           :oletter)
          ((or (and (eql gc :Nd)
                    (not (<= #xFF10 cp #xFF19))) ;Fullwidth digits
               (<= #x066B cp #x066C))
           :numeric)
          ((binary-search cp aterms) :aterm)
          ((binary-search cp scontinues) :scontinue)
          ((proplist-p char :sterm) :sterm)
          ((and (or (member gc '(:Po :Ps :Pe :Pf :Pi))
                    (eql (line-break-class char) :qu))
                (not (eql cp #x05F3)))
           :close)
          (t nil))))))
1229
(defun sentence-prebreak (string)
  #!+sb-doc
  "Pre-combines some sequences of characters to make the sentence-break
algorithm simpler.
Returns a list whose elements are either single characters or lists of
characters (clusters). Specifically,
- Combines any classified character with the following extend or format
  characters
- Combines CR + LF into '(CR LF); a CR that is not followed by LF stands
  alone
- Combines any run of :close* :sp* into one cluster"
  (let ((chars (coerce string 'list))
        cluster clusters last-seen sp-run)
    (labels ((flush ()
               ;; Emit the pending cluster: a list when it holds several
               ;; characters, the bare character when it holds one.
               (if (cdr cluster)
                   (push (nreverse cluster) clusters)
                   (if cluster (push (car cluster) clusters)))
               (setf cluster nil))
             (brk (x)
               (flush) (push x clusters))
             (nobrk (x) (push x cluster)))
      (loop for ch in chars
            for type = (sentence-break-class ch)
            do
               ;; A pending CR that is not completed by LF is a unit of
               ;; its own; emit it and treat CH like any other character
               ;; (so that it can start a cluster or pair with a later
               ;; LF).
               (when (and (eql last-seen :cr) (not (eql type :lf)))
                 (flush)
                 (setf last-seen nil))
               (cond
                 ;; Only reachable with TYPE = :LF: complete the CR LF pair.
                 ((eql last-seen :cr)
                  (nobrk ch) (flush) (setf last-seen nil))
                 ;; Emit whatever precedes the CR first, so that the line
                 ;; ending is never glued to the previous character.
                 ((eql type :cr)
                  (flush) (nobrk ch) (setf last-seen :cr))
                 ((eql type :lf) (brk ch) (setf last-seen nil))
                 ((eql type :sep) (brk ch) (setf last-seen nil))
                 ((and last-seen (or (eql type :extend) (eql type :format)))
                  (nobrk ch))
                 ((eql type :close)
                  (unless (eql last-seen :close) (flush))
                  (nobrk ch) (setf last-seen :close sp-run nil))
                 ((eql type :sp)
                  (unless (or (and (not sp-run) (eql last-seen :close))
                              (eql last-seen :sp))
                    (flush) (setf sp-run t))
                  (nobrk ch) (setf last-seen :sp))
                 (t
                  (flush) (nobrk ch) (setf last-seen type sp-run nil))))
      (flush)
      (nreverse clusters))))
1265
(defun sentences (string)
  #!+sb-doc
  "Breaks STRING into sentences according to the default
sentence breaking rules specified in UAX #29. Returns a list of strings."
  (let ((special-handling '(:close :sp :sep :cr :lf :scontinue :sterm :aterm))
        (units (sentence-prebreak string))
        (sentence '())
        (finished '())
        (state nil))
    (flatpush (car units) sentence)
    ;; Slide a three-unit window over UNITS; TAIL always starts at
    ;; UPCOMING.  Each step decides whether UPCOMING continues the current
    ;; sentence or starts a new one.
    (do ((current (car units) upcoming)
         (tail (cdr units) (cdr tail))
         (upcoming (cadr units) (cadr tail))
         (following (caddr units) (caddr tail)))
        ((not current)
         ;; Shake off last sentence
         (when sentence (push (nreverse sentence) finished))
         (nreverse (mapcar (lambda (chars) (coerce chars 'string)) finished)))
      (flet ((brk ()
               (push (nreverse sentence) finished)
               (setf sentence nil)
               (flatpush upcoming sentence))
             (nobrk ()
               (flatpush upcoming sentence))
             (terminator-p (class) (member class '(:sterm :aterm)))
             (trailer-p (class) (member class '(:close :sp)))
             (separator-p (class) (member class '(:sep :cr :lf))))
        (let ((c1 (sentence-break-class current))
              (c2 (sentence-break-class upcoming))
              (c3 (sentence-break-class following)))
          (cond
            ;; Decisions deferred from the previous step.
            ((eql state :brk-next) (brk) (setf state nil))
            ((eql state :nobrk-next) (nobrk) (setf state nil))
            ((separator-p c1) (brk))
            ((and (eql c1 :aterm) (eql c2 :numeric)) (nobrk))
            ((and (eql c1 :upper) (eql c2 :aterm) (eql c3 :upper))
             (nobrk) (setf state :nobrk-next))
            ((and (terminator-p c1)
                  (or (and (trailer-p c2)
                           (member c3 '(:scontinue :sterm :aterm)))
                      (member c2 '(:scontinue :sterm :aterm))))
             (nobrk)
             (when (trailer-p c2) (setf state :nobrk-next)))
            ((and (terminator-p c1) (trailer-p c2) (separator-p c3))
             ;; Let the linebreak call (brk)
             (nobrk) (setf state :nobrk-next))
            ((and (terminator-p c1) (separator-p c2))
             ;; Doesn't trigger rule 8
             (nobrk))
            ((eql c1 :sterm)
             ;; Not ambiguous anymore, rule 8a already handled
             (cond ((trailer-p c2)
                    (nobrk) (setf state :brk-next))
                   (t (brk))))
            ((and (eql c2 :sterm)
                  following
                  (not (member c3 special-handling)))
             ;; STerm followed by nothing important
             (nobrk) (setf state :brk-next))
            ((or (eql c1 :aterm)
                 (and (eql c2 :aterm)
                      following
                      (not (member c3 special-handling))
                      (not (eql c3 :numeric))))
             ;; Finally handle rule 8: look ahead for a lowercase unit
             ;; that comes before any letter, separator or terminator.
             (if (loop for unit in
                          (if (and following
                                   (not (or (member c3 special-handling)
                                            (eql c3 :numeric))))
                              (cdr tail)
                              tail)
                       for type = (sentence-break-class unit)
                       do (when (member type '(:oletter :upper :sep :cr :lf
                                               :sterm :aterm))
                            (return nil))
                          (when (eql type :lower) (return t))
                       finally (return nil))
                 ;; Ambiguous case
                 (progn (nobrk) (setf state :nobrk-next))
                 ;; Otherwise
                 (if (member c2 '(:close :sp :aterm))
                     (progn (nobrk) (setf state :brk-next))
                     (brk))))
            (t (nobrk))))))))
1332
(defun line-prebreak (string)
  ;; Group the characters of STRING into clusters for the line breaker.
  ;; A character of class :CM is attached to the cluster in progress unless
  ;; the most recent non-:CM class is one of NIL :BK :CR :LF :NL :SP :ZW, in
  ;; which case it starts a cluster of its own.  The result is a list whose
  ;; elements are either a single character or, for clusters with more than
  ;; one member, a list of characters in their original order.
  (let ((pending nil)           ; cluster being built, most recent char first
        (groups nil)            ; finished clusters, most recent first
        (last-base-class nil))  ; class of the latest non-:CM character
    (flet ((emit-pending ()
             ;; One-element clusters are stored as the bare character.
             (push (if (cdr pending)
                       (nreverse pending)
                       (car pending))
                   groups)
             (setf pending nil)))
      (map nil
           (lambda (char)
             (let* ((class (line-break-class char))
                    (mark-p (eql class :cm)))
               (when (and pending
                          (or (not mark-p)
                              (member last-base-class
                                      '(nil :BK :CR :LF :NL :SP :ZW))))
                 (emit-pending))
               (unless mark-p
                 (setf last-base-class class))
               (push char pending)))
           string)
      ;; Flush whatever is left (for empty input this stores a single NIL).
      (emit-pending)
      (nreverse groups))))
1355
(defun line-break-annotate (string)
  ;; Annotate STRING with line-breaking opportunities.  The result is a flat
  ;; list that alternates a break annotation (:CANT, :CAN or :MUST) with the
  ;; character that follows it, starting with :CANT before the first
  ;; character.  The rules are tried in textual order inside the TAGBODY and
  ;; the first one that matches decides the annotation between FIRST and
  ;; SECOND, so their order is significant.
  ;;
  ;; FIRST/SECOND are the current pair of pre-broken elements, T1/T2 their
  ;; resolved line-break classes, TAIL the remaining input starting at SECOND,
  ;; RET the result accumulated in reverse.  STATE and the variable
  ;; AFTER-SPACES carry a pending decision across a run of spaces.
  (let ((chars (line-prebreak string))
        first second t1 t2 tail (ret (list :cant))
        state after-spaces)
    (macrolet ((cmpush (thing)
                 ;; Push THING onto RET.  When THING is a list (a cluster from
                 ;; LINE-PREBREAK), push its members one by one with :CANT
                 ;; between them; a NIL THING therefore pushes nothing.
                 ;; NOTE(review): the non-list branch expands THING a second
                 ;; time instead of reusing the gensym; all call sites pass
                 ;; variables or keywords.
                 (let ((gthing (gensym)))
                   `(let ((,gthing ,thing))
                      (if (listp ,gthing)
                          (loop for (c next) on ,gthing do
                               (push c ret)
                               (when next (push :cant ret)))
                          (push ,thing ret)))))
               (between (a b action)
                 ;; When T1 matches A and T2 matches B, emit ACTION followed by
                 ;; SECOND and continue with the next pair.  A and B are :ANY
                 ;; (always matches), a single class, or a (quoted) list of
                 ;; classes.
                 (let ((atest (if (eql a :any) t
                                  (if (listp a)
                                      `(member t1 ,a)
                                      `(eql t1 ,a))))
                       (btest (if (eql b :any) t
                                  (if (listp b)
                                      `(member t2 ,b)
                                      `(eql t2 ,b)))))
                 `(when (and ,atest ,btest)
                    (cmpush ,action)
                    (cmpush second)
                    (go tail))))
               (after-spaces (a b action)
                 ;; Like BETWEEN, but B is tested against the first element of
                 ;; TAIL whose class is not :SP, i.e. intervening spaces are
                 ;; skipped.  If SECOND itself is a space, emit :CANT now,
                 ;; switch to the :EAT-SPACES state and remember ACTION in the
                 ;; variable AFTER-SPACES so it is applied once the spaces end.
                 (let ((atest (if (eql a :any) t
                                  (if (listp a)
                                      `(member t1 ,a)
                                      `(eql t1 ,a))))
                       (btest (if (eql b :any) t
                                  (if (listp b)
                                      `(member type ,b)
                                      `(eql type ,b)))))
                   `(when
                        (and ,atest
                             (loop for c in tail
                                for type = (line-break-class c :resolve t)
                                do
                                  (when (not (eql type :sp))
                                    (return ,btest))))
                      (if (eql t2 :sp)
                         (progn (cmpush :cant)
                                (cmpush second)
                                (setf state :eat-spaces)
                                (setf after-spaces ,action)
                                (go tail))
                         (progn (cmpush ,action)
                                (cmpush second)
                                (go tail)))))))

      ;; The first element is emitted up front (after the initial :CANT).
      (cmpush (car chars))
      (setf first (car chars))
      (setf tail (cdr chars))
      (setf second (car tail))
      (tagbody
         top
         (when (not first) (go end))
         (setf t1 (line-break-class first :resolve t))
         (setf t2 (line-break-class second :resolve t))
         ;; SECOND is NIL once the input is exhausted; presumably
         ;; LINE-BREAK-CLASS reports that as :NIL -- confirm.
         (between :any :nil :must)
         ;; While eating spaces, every further space is glued on with :CANT.
         (when (and (eql state :eat-spaces) (eql t2 :sp))
            (cmpush :cant) (cmpush second) (go tail))
         ;; Hard line breaks.
         (between :bk :any :must)
         (between :cr :lf :cant)
         (between '(:cr :lf :nl) :any :must)
         (between :any '(:zw :bk :cr :lf :nl) :cant)
         ;; A decision remembered earlier is applied here and then cleared.
         (when after-spaces (cmpush after-spaces) (cmpush second)
               (setf state nil after-spaces nil) (go tail))
         (after-spaces :zw :any :can)
         (between :any :wj :cant)
         (between :wj :any :cant)
         (between :gl :any :cant)
         (between '(:ZW :WJ :SY :SG :SA :RI :QU :PR :PO :OP :NU :NS :NL
                    :LF :IS :IN :ID :HL :GL :EX :CR :CP :CM :CL :CJ :CB
                    :BK :BB :B2 :AL :AI :JL :JV :JT :H2 :H3 :XX)
                  :gl :cant)
         (between :any '(:cl :cp :ex :is :sy) :cant)
         (after-spaces :op :any :cant)
         (after-spaces :qu :op :cant)
         (after-spaces '(:cl :cp) :ns :cant)
         (after-spaces :b2 :b2 :cant)
         (between :any :sp :cant) ;; Goes here to deal with after-spaces
         (between :sp :any :can)
         (between :any :qu :cant)
         (between :qu :any :cant)
         (between :any :cb :can)
         (between :cb :any :can)
         (between :any '(:ba :hy :ns) :cant)
         (between :bb :any :cant)
         ;; :HL followed by :HY: no break before the :HY, and :CAN is stored in
         ;; AFTER-SPACES for the following position.
         (when (and (eql t1 :hl) (eql t2 :hy))
           (cmpush :cant) (cmpush second)
           (setf after-spaces :can) (go tail))
         (between '(:al :hl :id :in :nu) :in :cant)
         (between :id :po :cant)
         (between '(:al :hl) :nu :cant)
         (between '(:nu :po) '(:al :hl) :cant)
         (between :pr '(:id :al :hl) :cant)
         (between '(:cl :cp :nu) '(:po :pr) :cant)
         (between :nu '(:po :pr :nu) :cant)
         (between '(:po :pr) :op :cant)
         (between '(:po :pr :hy :is :sy) :nu :cant)
         (between :jl '(:jl :jv :h2 :h3) :cant)
         (between '(:jv :h2) '(:jv :jt) :cant)
         (between '(:jt :h3) :jt :cant)
         (between '(:jl :jv :jt :h2 :h3) '(:in :po) :cant)
         (between :pr '(:jl :jv :jt :h2 :h3) :cant)
         (between '(:al :hl :is) '(:al :hl) :cant)
         (between '(:al :hl :nu) :op :cant)
         (between :cp '(:al :hl :nu) :cant)
         (between :ri :ri :cant)
         ;; Anything not covered above may break.
         (between :any :any :can)
         tail
         ;; Advance the window by one element.
         (setf first second)
         (setf tail (cdr tail))
         (setf second (car tail))
         (go top)
         end)
      ;; LB3 satisfied by (:any :nil) -> :must
      (setf ret (nreverse ret))
      ret)))
1477
(defun break-list-at (list n)
  ;; Destructively split LIST after its first N conses.
  ;; Returns two values: the list holding the first N elements (sharing
  ;; structure with LIST, whose Nth cons is cut) and the remaining tail.
  ;; Boundary cases, which previously tried to modify the CDR of NIL:
  ;;   - N <= 0 or an empty LIST: nothing is cut, returns NIL and LIST;
  ;;   - N >= (length LIST): nothing is cut, returns LIST and NIL.
  (if (or (null list) (not (plusp n)))
      (values nil list)
      (let ((last-kept (nthcdr (1- n) list)))
        (if (null last-kept)
            ;; LIST has fewer than N elements: all of it is the head.
            (values list nil)
            (let ((rest (cdr last-kept)))
              (setf (cdr last-kept) nil)
              (values list rest))))))
1483
(defun lines (string &key (margin *print-right-margin*))
  #!+sb-doc
  "Breaks STRING into lines that are no wider than :MARGIN according to the
line breaking rules outlined in UAX #14. Combining marks will always be kept
together with their base characters, and spaces (but not other types of
whitespace) will be removed from the end of lines. If :MARGIN is unspecified,
it defaults to *PRINT-RIGHT-MARGIN*, or to 80 characters if that is NIL.
Returns a list of strings; an empty STRING yields a list holding one empty
string."
  (when (string= string "") (return-from lines (list "")))
  (unless margin (setf margin 80))
  ;; CHARS alternates break annotations with characters (see
  ;; LINE-BREAK-ANNOTATE); each iteration consumes one (BREAK-TYPE CHAR) pair.
  ;; LINE holds the current line in reverse order, FILLED its width so far,
  ;; and LAST-BREAK-DISTANCE the number of elements at the front of LINE that
  ;; come after the most recent break opportunity (NIL when there is none).
  (do* ((chars (line-break-annotate string))
        line lines (filled 0) last-break-distance
        (break-type (car chars) (car tail))
        (char (cadr chars) (cadr tail))
        (tail (cddr chars) (cddr tail)))
       ((not break-type)
        ;; NOTE(review): nothing flushes LINE here; this relies on the
        ;; annotation ending in a :MUST that has already emitted the last
        ;; line -- confirm against LINE-BREAK-ANNOTATE.
        (mapcar #'(lambda (s) (coerce s 'string)) (nreverse lines)))
    (ecase break-type
      (:cant
       (push char line)
       ;; Combining marks do not add to the width.
       (unless (eql (line-break-class char) :CM)
         (incf filled))
       (when last-break-distance (incf last-break-distance)))
      (:can
       (push char line)
       (setf last-break-distance 1)
       (incf filled))
      (:must
       (push char line)
       (setf last-break-distance 1)
       (incf filled)
       (go break)))
    ;; Only break when the line has grown past the margin.
    (if (> filled margin)
        (go break)
        (go next))
   break
    (when (not last-break-distance)
      ;; If we don't have any line breaks, remove the last thing we added that
      ;; takes up space, and all its combining marks
      (setf last-break-distance
            (1+ (loop for c in line while (eql (line-break-class c) :cm) summing 1))))
    ;; LINE is reversed, so its first LAST-BREAK-DISTANCE elements are the
    ;; characters that move to the next line; the rest is the finished line.
    (multiple-value-bind (next-line this-line) (break-list-at line last-break-distance)
      ;; Drop spaces from the end of the finished line.
      (loop while (eql (line-break-class (car this-line)) :sp)
         do (setf this-line (cdr this-line)))
      (push (nreverse this-line) lines)
      (setf line next-line)
      ;; NOTE(review): this counts every carried-over element, including
      ;; combining marks, which the :CANT branch above does not count.
      (setf filled (length line))
      (setf last-break-distance nil))
   next))
1532
1533 \f
1534 ;;; Collation
;;; Upper bound used by VARIABLE-P for primary weights that count as
;;; variable.  The value is not written here: it is read at read time (#.)
;;; from the file "other-collation-info.lisp-expr" in the "output" directory
;;; located two levels above the directory of the file being compiled.
(defconstant +maximum-variable-primary-element+
  #.(with-open-file (stream
                     (merge-pathnames
                      (make-pathname
                       :directory
                       '(:relative :up :up "output")
                       :name "other-collation-info" :type "lisp-expr")
                      sb!xc:*compile-file-truename*)
                     :direction :input
                     :element-type 'character)
      (read stream)))
1546
(defun unpack-collation-key (key)
  ;; Expand a packed collation key into a list of three-element lists, one per
  ;; 32-bit word of KEY.  Each word is split into its top 16 bits, the next
  ;; 11 bits and the low 5 bits, in that order.
  (declare (type (simple-array (unsigned-byte 32) (*)) key))
  (map 'list
       (lambda (packed)
         (let ((high (ldb (byte 16 16) packed))
               (middle (ldb (byte 11 5) packed))
               (low (ldb (byte 5 0) packed)))
           (list high middle low)))
       key))
1554
(declaim (inline variable-p))
(defun variable-p (x)
  ;; True when the weight X lies in the closed range
  ;; [1, +MAXIMUM-VARIABLE-PRIMARY-ELEMENT+].
  (and (<= 1 x)
       (<= x +maximum-variable-primary-element+)))
1558
(defun collation-key (string start end)
  ;; Return the collation elements for the one to three characters of STRING
  ;; between START and END, as a list of (primary secondary tertiary) lists,
  ;; or NIL when there is no entry for that sequence.
  ;;   - Spans of any other length return NIL.
  ;;   - The characters are packed with PACK-3-CODEPOINTS (missing positions
  ;;     are filled with code point 0) and looked up in
  ;;     **CHARACTER-COLLATIONS**.
  ;;   - A single character without a table entry gets implicit weights
  ;;     derived from its code point; longer sequences without an entry
  ;;     return NIL.
  (let (char1
        (char2 (code-char 0))
        (char3 (code-char 0)))
    (case (- end start)
      (1 (setf char1 (char string start)))
      (2 (setf char1 (char string start)
               char2 (char string (+ start 1))))
      (3 (setf char1 (char string start)
               char2 (char string (+ start 1))
               char3 (char string (+ start 2))))
      (t
       ;; There are never more than three characters in a contraction, right?
       (return-from collation-key nil)))
    (let ((packed-key (gethash (pack-3-codepoints
                                (char-code char1)
                                (char-code char2)
                                (char-code char3))
                               **character-collations**)))
      (if packed-key
          (unpack-collation-key packed-key)
          ;; Only a lone character (both filler slots still code point 0)
          ;; falls back to implicit weights.
          (when (char= (code-char 0) char2 char3)
            (let* ((cp (char-code char1))
                   ;; The base depends on whether the character is a unified
                   ;; ideograph and, if so, on the code point range.
                   (base
                     (cond ((not (proplist-p char1 :unified-ideograph))
                            #xFBC0)
                           ((or (<= #x4E00 cp #x9FFF)
                                (<= #xF900 cp #xFAFF))
                            #xFB40)
                           (t
                            #xFB80)))
                   ;; UTS #10 implicit weights:
                   ;;   AAAA = base + (CP >> 15)
                   ;;   BBBB = (CP & 0x7FFF) | 0x8000
                   ;; The mask must keep only the low 15 bits; a wider mask
                   ;; lets bits 16-18 of CP leak into the second element and
                   ;; yields a weight that no longer fits in 16 bits.
                   (a (+ base (ash cp -15)))
                   (b (logior #.(ash 1 15) (logand cp #x7FFF))))
              (list (list a #x20 #x2) (list b 0 0))))))))
1593
(defun sort-key (string)
  ;; Build the sort key of STRING: a vector made of the primary, secondary,
  ;; tertiary and fourth-level weights, in that order, with a 0 between
  ;; consecutive levels.  STRING is first normalized to NFD.
  (let* ((str (normalize-string string :nfd))
         (i 0) (len (length str)) max-match new-i
         sort-key
         after-variable)
    ;; Phase 1: walk STR and collect collation elements (reversed) in SORT-KEY.
    (loop while (< i len)
          do
          ;; Try spans of 1, 2 and 3 characters starting at I and keep the
          ;; longest one that has a key; NEW-I is the index just past it.
          (loop for offset from 1 to 3
                for index = (+ i offset)
                while (<= index len)
                do
                (let ((key (collation-key str i index)))
                  (when key
                    (setf max-match key
                          new-i index))))
          ;; Then look at the following characters up to the next one with
          ;; combining class 0.  A candidate is skipped when it is not the
          ;; first one after the match and the character just before it has a
          ;; combining class >= its own.  Otherwise it is swapped next to the
          ;; match; if the extended span has a key the match grows, else the
          ;; swap is undone.
          ;; NOTE(review): PREVIOUS-COMBINING-CLASS is never read.
          (loop for index from new-i below len
                for char = (char str index)
                for previous-combining-class = combining-class
                for combining-class = (combining-class char)
                until (eql combining-class 0)
                unless (and (>= (- index new-i) 1)
                            ;; Combiners are sorted, we only have to look back
                            ;; one step (see canonically-compose)
                            (>= (combining-class (char str (1- index)))
                                combining-class))
                do
                (rotatef (char str new-i) (char str index))
                (let ((key (collation-key str i (1+ new-i))))
                  (if key
                      (setf max-match key
                            new-i (1+ new-i))
                      (rotatef (char str new-i) (char str index)))))
          (loop for key in max-match do (push key sort-key))
          (setf i new-i))
    ;; Phase 2: distribute the collected elements over the four levels.
    (macrolet ((push-non-zero (obj place)
                 `(when (/= ,obj 0)
                    (push ,obj ,place))))
      (let (primary secondary tertiary quatenary)
        (loop for (k1 k2 k3) in (nreverse sort-key)
              do
              (cond
                ;; Completely ignorable element: contributes nothing.
                ((= k1 k2 k3 0))
                ;; Variable element: only its primary weight is kept, and it
                ;; goes to the fourth level.
                ((variable-p k1)
                 (setf after-variable t)
                 (push k1 quatenary))
                ;; Ordinary element with a non-zero primary weight.
                ((/= k1 0)
                 (setf after-variable nil)
                 (push k1 primary)
                 (push-non-zero k2 secondary)
                 (push-non-zero k3 tertiary)
                 (push #xFFFF quatenary))
                ;; Zero primary, non-zero tertiary: dropped when it follows a
                ;; variable element, otherwise kept at the lower levels.
                ((/= k3 0)
                 (unless after-variable
                   (push-non-zero k2 secondary)
                   (push k3 tertiary)
                   (push #xFFFF quatenary)))))
        (concatenate 'vector
                     (nreverse primary) #(0) (nreverse secondary) #(0)
                     (nreverse tertiary) #(0) (nreverse quatenary))))))
1653
(defun vector< (vector1 vector2)
  ;; Lexicographic "less than" on two vectors of numbers: the first position
  ;; where the elements differ decides.  When one vector is a prefix of the
  ;; other (or both are equal), the strictly shorter one is the smaller.
  (let ((length1 (length vector1))
        (length2 (length vector2)))
    (dotimes (position (min length1 length2) (< length1 length2))
      (let ((left (aref vector1 position))
            (right (aref vector2 position)))
        (cond ((< left right) (return t))
              ((> left right) (return nil)))))))
1662
(defun unicode= (string1 string2 &key (start1 0) end1 (start2 0) end2 (strict t))
  #!+sb-doc
  "Determines whether STRING1 and STRING2 are canonically equivalent according
to Unicode. The START and END arguments behave like the arguments to STRING=.
If :STRICT is NIL, UNICODE= tests compatibility equivalence instead."
  ;; Both substrings are brought to the same decomposed form (NFD when
  ;; STRICT, NFKD otherwise) and then compared with STRING=.
  (let ((form (if strict :nfd :nfkd)))
    (flet ((normalized (string start end)
             (normalize-string (subseq string start end) form)))
      (string= (normalized string1 start1 end1)
               (normalized string2 start2 end2)))))
1671
(defun unicode-equal (string1 string2 &key (start1 0) end1 (start2 0) end2 (strict t))
  #!+sb-doc
  "Determines whether STRING1 and STRING2 are canonically equivalent after
casefolding (that is, ignoring case differences) according to Unicode. The
START and END arguments behave like the arguments to STRING=. If :STRICT is
NIL, UNICODE-EQUAL tests compatibility equivalence instead."
  ;; Normalize, casefold, and normalize again (casefolding is applied to the
  ;; normalized text and its result is normalized once more) before comparing.
  (let ((form (if strict :nfd :nfkd)))
    (flet ((norm (string)
             (normalize-string string form)))
      (let* ((base1 (norm (subseq string1 start1 end1)))
             (base2 (norm (subseq string2 start2 end2)))
             (folded1 (norm (casefold base1)))
             (folded2 (norm (casefold base2))))
        (string= folded1 folded2)))))
1683
(defun unicode< (string1 string2 &key (start1 0) end1 (start2 0) end2)
  #!+sb-doc
  "Determines whether STRING1 sorts before STRING2 using the Unicode Collation
Algorithm. The function uses an untailored Default Unicode Collation Element
Table to produce the sort keys, and the Shifted method for dealing with
variable-weight characters, as described in UTS #10"
  (let* ((left (subseq string1 start1 end1))
         (right (subseq string2 start2 end2))
         (left-key (sort-key left))
         (right-key (sort-key right)))
    (cond ((not (equalp left-key right-key))
           (vector< left-key right-key))
          (t
           ;; Identical sort keys: fall back to comparing the NFD forms
           ;; with STRING<.
           (string< (normalize-string left :nfd)
                    (normalize-string right :nfd))))))
1696
(defun unicode<= (string1 string2 &key (start1 0) end1 (start2 0) end2)
  #!+sb-doc
  "Tests whether STRING1 is either UNICODE= or UNICODE< to STRING2. The START
and END arguments are passed on unchanged to both tests."
  (flet ((holds-p (predicate)
           (funcall predicate string1 string2
                    :start1 start1 :end1 end1
                    :start2 start2 :end2 end2)))
    ;; Equality is tried first; the ordering test only runs when it fails.
    (or (holds-p #'unicode=)
        (holds-p #'unicode<))))
1705
(defun unicode> (string1 string2 &key (start1 0) end1 (start2 0) end2)
  #!+sb-doc
  "Tests whether STRING2 is UNICODE< STRING1, i.e. whether STRING1 sorts after
STRING2."
  ;; Same comparison as UNICODE< with the two strings, and therefore also
  ;; their START/END bounds, exchanged.
  (unicode< string2 string1
            :start2 start1 :end2 end1
            :start1 start2 :end1 end2))
1711
(defun unicode>= (string1 string2 &key (start1 0) end1 (start2 0) end2)
  #!+sb-doc
  "Tests whether STRING1 is either UNICODE= or UNICODE> to STRING2. The START
and END arguments are passed on unchanged to both tests."
  (flet ((holds-p (predicate)
           (funcall predicate string1 string2
                    :start1 start1 :end1 end1
                    :start2 start2 :end2 end2)))
    ;; Equality is tried first; the ordering test only runs when it fails.
    (or (holds-p #'unicode=)
        (holds-p #'unicode>))))
1720
1721 \f
1722 ;;; Confusable detection
1723
(defun canonically-deconfuse (string)
  ;; Rewrite STRING by replacing substrings found in **CONFUSABLES** with the
  ;; value stored for them.  At every position the longest matching substring
  ;; of up to five characters is used; a position without any match
  ;; contributes its single character unchanged.
  (let ((limit (length string))
        (pieces '())
        (position 0))
    (loop
      (unless (< position limit)
        (return))
      (let ((replacement nil)
            (next-position nil))
        ;; Later (longer) hits overwrite earlier ones, so the longest wins.
        (loop for end from (1+ position) to (min limit (+ position 5))
              for hit = (gethash (subseq string position end) **confusables**)
              when hit
                do (setf replacement hit
                         next-position end))
        (if replacement
            (progn
              (push replacement pieces)
              (setf position next-position))
            (progn
              (push (subseq string position (1+ position)) pieces)
              (incf position)))))
    (apply #'concatenate 'string (nreverse pieces))))
1739
(defun confusable-p (string1 string2 &key (start1 0) end1 (start2 0) end2)
  #!+sb-doc
  "Determines whether STRING1 and STRING2 could be visually confusable
according to the IDNA confusableSummary.txt table. The START and END arguments
select the part of each string that is compared."
  ;; Each substring is normalized, passed through CANONICALLY-DECONFUSE and
  ;; normalized again; the strings are confusable when the results are
  ;; STRING=.
  (let ((form #!+sb-unicode :nfd #!-sb-unicode :nfc))
    (flet ((norm (string)
             (normalize-string string form)))
      (let* ((base1 (norm (subseq string1 start1 end1)))
             (base2 (norm (subseq string2 start2 end2)))
             (skeleton1 (norm (canonically-deconfuse base1)))
             (skeleton2 (norm (canonically-deconfuse base2))))
        (string= skeleton1 skeleton2)))))