3 ;;;; This software is part of the SBCL system. See the README file for
6 ;;;; This software is derived from the CMU CL system, which was
7 ;;;; written at Carnegie Mellon University and released into the
8 ;;;; public domain. The software is in the public domain and is
9 ;;;; provided with absolutely no warranty. See the COPYING and CREDITS
10 ;;;; files for more information.
12 (in-package "SB!UNICODE")
14 (declaim (type simple-vector
**special-numerics
**))
15 (sb!impl
::defglobal
**special-numerics
**
16 #.
(with-open-file (stream
20 '(:relative
:up
:up
"output")
21 :name
"numerics" :type
"lisp-expr")
22 sb
!xc
:*compile-file-truename
*)
24 :element-type
'character
)
28 (declaim (type (simple-array (unsigned-byte 32) (*)) **block-ranges
**))
29 (sb!impl
::defglobal
**block-ranges
**
30 #.
(sb!int
:!coerce-to-specialized
31 (with-open-file (stream
35 '(:relative
:up
:up
"output")
36 :name
"blocks" :type
"lisp-expr")
37 sb
!xc
:*compile-file-truename
*)
39 :element-type
'character
)
43 (macrolet ((unicode-property-init ()
45 (with-open-file (stream
49 '(:relative
:up
:up
"output")
50 :name
"misc-properties" :type
"lisp-expr")
51 sb
!xc
:*compile-file-truename
*)
53 :element-type
'character
)
56 (with-open-file (stream
60 '(:relative
:up
:up
"output")
61 :name
"confusables" :type
"lisp-expr")
62 sb
!xc
:*compile-file-truename
*)
64 :element-type
'character
)
67 (with-open-file (stream
71 '(:relative
:up
:up
"output")
72 :name
"bidi-mirrors" :type
"lisp-expr")
73 sb
!xc
:*compile-file-truename
*)
75 :element-type
'character
)
78 (sb!impl
::defglobal
**proplist-properties
** ',proplist-dump
)
79 (sb!impl
::defglobal
**confusables
** ',confusable-sets
)
80 (sb!impl
::defglobal
**bidi-mirroring-glyphs
** ',bidi-mirroring-list
)
81 (defun !unicode-properties-cold-init
()
82 (let ((hash (make-hash-table)) (list ',proplist-dump
))
83 (do ((k (car list
) (car list
)) (v (cadr list
) (cadr list
)))
85 (setf (gethash k hash
) v
)
86 (setf list
(cddr list
)))
87 (setf **proplist-properties
** hash
))
88 (let ((hash (make-hash-table :test
#'equal
)))
89 (loop for set in
',confusable-sets
90 for items
= (mapcar #'(lambda (item)
99 (< x sb
!xc
:char-code-limit
))
102 (setf (gethash (logically-readonlyize (possibly-base-stringize i
))
104 (logically-readonlyize
105 (possibly-base-stringize (first items
))))))
106 (setf **confusables
** hash
))
107 (let ((hash (make-hash-table)) (list ',bidi-mirroring-list
))
108 (loop for
(k v
) in list do
109 (setf (gethash k hash
) v
))
110 (setf **bidi-mirroring-glyphs
** hash
)))))))
111 (unicode-property-init))
113 ;;; Unicode property access
114 (defun ordered-ranges-member (item vector
)
115 (declare (type simple-vector vector
)
118 (labels ((recurse (start end
)
119 (declare (type index start end
)
120 (optimize (safety 0)))
122 (let* ((i (+ start
(truncate (the index
(- end start
)) 2)))
124 (elt1 (svref vector index
))
125 (elt2 (svref vector
(1+ index
))))
126 (declare (type index i
)
131 (recurse (+ 1 i
) end
))
134 (recurse 0 (truncate (length vector
) 2))))
136 ;; Returns which range `item` was found in or NIL
137 ;; First range = 0, second range = 1 ...
138 (defun ordered-ranges-position (item vector
)
139 (declare (type (simple-array (unsigned-byte 32) (*)) vector
)
141 (labels ((recurse (start end
)
142 (declare (type index start end
))
144 (let* ((i (+ start
(truncate (- end start
) 2)))
146 (elt1 (aref vector index
))
147 (elt2 (aref vector
(1+ index
))))
148 (declare (type index i
))
152 (recurse (+ 1 i
) end
))
155 (recurse 0 (truncate (length vector
) 2))))
157 (defun proplist-p (character property
)
158 "Returns T if CHARACTER has the specified PROPERTY.
159 PROPERTY is a keyword representing one of the properties from PropList.txt,
160 with underscores replaced by dashes."
161 (ordered-ranges-member (char-code character
)
162 (gethash property
**proplist-properties
**)))
164 ;; WARNING: These have to be manually kept in sync with the values in ucd.lisp
165 (declaim (type simple-vector
*general-categories
* *bidi-classes
* *east-asian-widths
*
166 *scripts
* *line-break-classes
* *blocks
*))
167 (sb!impl
::defglobal
*general-categories
*
168 #(:Lu
:Ll
:Lt
:Lm
:Lo
:Cc
:Cf
:Co
:Cs
:Cn
:Mc
:Me
:Mn
:Nd
169 :Nl
:No
:Pc
:Pd
:Pe
:Pf
:Pi
:Po
:Ps
:Sc
:Sk
:Sm
:So
:Zl
172 (sb!impl
::defglobal
*bidi-classes
*
173 #(:BN
:AL
:AN
:B
:CS
:EN
:ES
:ET
:L
:LRE
:LRO
:NSM
:ON
174 :PDF
:R
:RLE
:RLO
:S
:WS
:LRI
:RLI
:FSI
:PDI
))
176 (sb!impl
::defglobal
*east-asian-widths
*
177 #(:N
:A
:H
:W
:F
:Na
))
179 (sb!impl
::defglobal
*scripts
*
180 #(:Unknown
:Common
:Latin
:Greek
:Cyrillic
:Armenian
:Hebrew
:Arabic
:Syriac
181 :Thaana
:Devanagari
:Bengali
:Gurmukhi
:Gujarati
:Oriya
:Tamil
:Telugu
182 :Kannada
:Malayalam
:Sinhala
:Thai
:Lao
:Tibetan
:Myanmar
:Georgian
:Hangul
183 :Ethiopic
:Cherokee
:Canadian-Aboriginal
:Ogham
:Runic
:Khmer
:Mongolian
184 :Hiragana
:Katakana
:Bopomofo
:Han
:Yi
:Old-Italic
:Gothic
:Deseret
185 :Inherited
:Tagalog
:Hanunoo
:Buhid
:Tagbanwa
:Limbu
:Tai-Le
:Linear-B
186 :Ugaritic
:Shavian
:Osmanya
:Cypriot
:Braille
:Buginese
:Coptic
:New-Tai-Lue
187 :Glagolitic
:Tifinagh
:Syloti-Nagri
:Old-Persian
:Kharoshthi
:Balinese
188 :Cuneiform
:Phoenician
:Phags-Pa
:Nko
:Sundanese
:Lepcha
:Ol-Chiki
:Vai
189 :Saurashtra
:Kayah-Li
:Rejang
:Lycian
:Carian
:Lydian
:Cham
:Tai-Tham
190 :Tai-Viet
:Avestan
:Egyptian-Hieroglyphs
:Samaritan
:Lisu
:Bamum
:Javanese
191 :Meetei-Mayek
:Imperial-Aramaic
:Old-South-Arabian
:Inscriptional-Parthian
192 :Inscriptional-Pahlavi
:Old-Turkic
:Kaithi
:Batak
:Brahmi
:Mandaic
:Chakma
193 :Meroitic-Cursive
:Meroitic-Hieroglyphs
:Miao
:Sharada
:Sora-Sompeng
194 :Takri
:Bassa-Vah
:Mahajani
:Pahawh-Hmong
:Caucasian-Albanian
:Manichaean
195 :Palmyrene
:Duployan
:Mende-Kikakui
:Pau-Cin-Hau
:Elbasan
:Modi
196 :Psalter-Pahlavi
:Grantha
:Mro
:Siddham
:Khojki
:Nabataean
:Tirhuta
197 :Khudawadi
:Old-North-Arabian
:Warang-Citi
:Linear-A
:Old-Permic
))
199 (sb!impl
::defglobal
*line-break-classes
*
200 #(:XX
:AI
:AL
:B2
:BA
:BB
:BK
:CB
:CJ
:CL
:CM
:CP
:CR
:EX
:GL
201 :HL
:HY
:ID
:IN
:IS
:LF
:NL
:NS
:NU
:OP
:PO
:PR
:QU
:RI
:SA
202 :SG
:SP
:SY
:WJ
:ZW
))
204 (sb!impl
::defglobal
*blocks
*
205 #(:Basic-Latin
:Latin-1-Supplement
:Latin-Extended-A
:Latin-Extended-B
206 :IPA-Extensions
:Spacing-Modifier-Letters
:Combining-Diacritical-Marks
207 :Greek-and-Coptic
:Cyrillic
:Cyrillic-Supplement
:Armenian
:Hebrew
:Arabic
208 :Syriac
:Arabic-Supplement
:Thaana
:NKo
:Samaritan
:Mandaic
209 :Arabic-Extended-A
:Devanagari
:Bengali
:Gurmukhi
:Gujarati
:Oriya
:Tamil
210 :Telugu
:Kannada
:Malayalam
:Sinhala
:Thai
:Lao
:Tibetan
:Myanmar
:Georgian
211 :Hangul-Jamo
:Ethiopic
:Ethiopic-Supplement
:Cherokee
212 :Unified-Canadian-Aboriginal-Syllabics
:Ogham
:Runic
:Tagalog
:Hanunoo
213 :Buhid
:Tagbanwa
:Khmer
:Mongolian
214 :Unified-Canadian-Aboriginal-Syllabics-Extended
:Limbu
:Tai-Le
:New-Tai-Lue
215 :Khmer-Symbols
:Buginese
:Tai-Tham
:Combining-Diacritical-Marks-Extended
216 :Balinese
:Sundanese
:Batak
:Lepcha
:Ol-Chiki
:Sundanese-Supplement
217 :Vedic-Extensions
:Phonetic-Extensions
:Phonetic-Extensions-Supplement
218 :Combining-Diacritical-Marks-Supplement
:Latin-Extended-Additional
219 :Greek-Extended
:General-Punctuation
:Superscripts-and-Subscripts
220 :Currency-Symbols
:Combining-Diacritical-Marks-for-Symbols
221 :Letterlike-Symbols
:Number-Forms
:Arrows
:Mathematical-Operators
222 :Miscellaneous-Technical
:Control-Pictures
:Optical-Character-Recognition
223 :Enclosed-Alphanumerics
:Box-Drawing
:Block-Elements
:Geometric-Shapes
224 :Miscellaneous-Symbols
:Dingbats
:Miscellaneous-Mathematical-Symbols-A
225 :Supplemental-Arrows-A
:Braille-Patterns
:Supplemental-Arrows-B
226 :Miscellaneous-Mathematical-Symbols-B
:Supplemental-Mathematical-Operators
227 :Miscellaneous-Symbols-and-Arrows
:Glagolitic
:Latin-Extended-C
:Coptic
228 :Georgian-Supplement
:Tifinagh
:Ethiopic-Extended
:Cyrillic-Extended-A
229 :Supplemental-Punctuation
:CJK-Radicals-Supplement
:Kangxi-Radicals
230 :Ideographic-Description-Characters
:CJK-Symbols-and-Punctuation
:Hiragana
231 :Katakana
:Bopomofo
:Hangul-Compatibility-Jamo
:Kanbun
:Bopomofo-Extended
232 :CJK-Strokes
:Katakana-Phonetic-Extensions
:Enclosed-CJK-Letters-and-Months
233 :CJK-Compatibility
:CJK-Unified-Ideographs-Extension-A
234 :Yijing-Hexagram-Symbols
:CJK-Unified-Ideographs
:Yi-Syllables
:Yi-Radicals
235 :Lisu
:Vai
:Cyrillic-Extended-B
:Bamum
:Modifier-Tone-Letters
236 :Latin-Extended-D
:Syloti-Nagri
:Common-Indic-Number-Forms
:Phags-pa
237 :Saurashtra
:Devanagari-Extended
:Kayah-Li
:Rejang
:Hangul-Jamo-Extended-A
238 :Javanese
:Myanmar-Extended-B
:Cham
:Myanmar-Extended-A
:Tai-Viet
239 :Meetei-Mayek-Extensions
:Ethiopic-Extended-A
:Latin-Extended-E
240 :Meetei-Mayek
:Hangul-Syllables
:Hangul-Jamo-Extended-B
:High-Surrogates
241 :High-Private-Use-Surrogates
:Low-Surrogates
:Private-Use-Area
242 :CJK-Compatibility-Ideographs
:Alphabetic-Presentation-Forms
243 :Arabic-Presentation-Forms-A
:Variation-Selectors
:Vertical-Forms
244 :Combining-Half-Marks
:CJK-Compatibility-Forms
:Small-Form-Variants
245 :Arabic-Presentation-Forms-B
:Halfwidth-and-Fullwidth-Forms
:Specials
246 :Linear-B-Syllabary
:Linear-B-Ideograms
:Aegean-Numbers
247 :Ancient-Greek-Numbers
:Ancient-Symbols
:Phaistos-Disc
:Lycian
:Carian
248 :Coptic-Epact-Numbers
:Old-Italic
:Gothic
:Old-Permic
:Ugaritic
:Old-Persian
249 :Deseret
:Shavian
:Osmanya
:Elbasan
:Caucasian-Albanian
:Linear-A
250 :Cypriot-Syllabary
:Imperial-Aramaic
:Palmyrene
:Nabataean
:Phoenician
251 :Lydian
:Meroitic-Hieroglyphs
:Meroitic-Cursive
:Kharoshthi
252 :Old-South-Arabian
:Old-North-Arabian
:Manichaean
:Avestan
253 :Inscriptional-Parthian
:Inscriptional-Pahlavi
:Psalter-Pahlavi
:Old-Turkic
254 :Rumi-Numeral-Symbols
:Brahmi
:Kaithi
:Sora-Sompeng
:Chakma
:Mahajani
255 :Sharada
:Sinhala-Archaic-Numbers
:Khojki
:Khudawadi
:Grantha
:Tirhuta
256 :Siddham
:Modi
:Takri
:Warang-Citi
:Pau-Cin-Hau
:Cuneiform
257 :Cuneiform-Numbers-and-Punctuation
:Egyptian-Hieroglyphs
:Bamum-Supplement
258 :Mro
:Bassa-Vah
:Pahawh-Hmong
:Miao
:Kana-Supplement
:Duployan
259 :Shorthand-Format-Controls
:Byzantine-Musical-Symbols
:Musical-Symbols
260 :Ancient-Greek-Musical-Notation
:Tai-Xuan-Jing-Symbols
261 :Counting-Rod-Numerals
:Mathematical-Alphanumeric-Symbols
:Mende-Kikakui
262 :Arabic-Mathematical-Alphabetic-Symbols
:Mahjong-Tiles
:Domino-Tiles
263 :Playing-Cards
:Enclosed-Alphanumeric-Supplement
264 :Enclosed-Ideographic-Supplement
:Miscellaneous-Symbols-and-Pictographs
265 :Emoticons
:Ornamental-Dingbats
:Transport-and-Map-Symbols
266 :Alchemical-Symbols
:Geometric-Shapes-Extended
:Supplemental-Arrows-C
267 :CJK-Unified-Ideographs-Extension-B
:CJK-Unified-Ideographs-Extension-C
268 :CJK-Unified-Ideographs-Extension-D
:CJK-Compatibility-Ideographs-Supplement
269 :Tags
:Variation-Selectors-Supplement
:Supplementary-Private-Use-Area-A
270 :Supplementary-Private-Use-Area-B
))
272 (declaim (inline svref-or-null
))
273 (defun svref-or-null (vector index
)
274 (and (< index
(length vector
))
275 (svref vector index
)))
277 (defun general-category (character)
278 "Returns the general category of CHARACTER as it appears in UnicodeData.txt"
279 (svref-or-null *general-categories
* (sb!impl
::ucd-general-category character
)))
281 (defun bidi-class (character)
282 "Returns the bidirectional class of CHARACTER"
283 (if (and (eql (general-category character
) :Cn
)
284 (default-ignorable-p character
))
288 (aref **character-misc-database
** (1+ (misc-index character
))))))
290 (declaim (inline combining-class
))
291 (defun combining-class (character)
292 "Returns the canonical combining class (CCC) of CHARACTER"
293 (aref **character-misc-database
** (+ 2 (misc-index character
))))
295 (defun decimal-value (character)
296 "Returns the decimal digit value associated with CHARACTER or NIL if
297 there is no such value.
299 The only characters in Unicode with a decimal digit value are those
300 that are part of a range of characters that encode the digits 0-9.
301 Because of this, `(decimal-digit c) <=> (digit-char-p c 10)` in
303 (sb!impl
::ucd-decimal-digit character
))
305 (defun digit-value (character)
306 "Returns the Unicode digit value of CHARACTER or NIL if it doesn't exist.
308 Digit values are guaranteed to be integers between 0 and 9 inclusive.
309 All characters with decimal digit values have the same digit value,
310 but there are characters (such as digits of number systems without a 0 value)
311 that have a digit value but no decimal digit value"
312 (let ((%digit
(clear-flag 6
313 (aref **character-misc-database
**
314 (+ 3 (misc-index character
))))))
315 (if (< %digit
10) %digit nil
)))
317 (defun numeric-value (character)
318 "Returns the numeric value of CHARACTER or NIL if there is no such value.
319 Numeric value is the most general of the Unicode numeric properties.
320 The only constraint on the numeric value is that it be a rational number."
321 (or (double-vector-binary-search (char-code character
)
322 **special-numerics
**)
323 (digit-value character
)))
325 (defun mirrored-p (character)
326 "Returns T if CHARACTER needs to be mirrored in bidirectional text.
327 Otherwise, returns NIL."
328 (logbitp 5 (aref **character-misc-database
**
329 (+ 5 (misc-index character
)))))
331 (defun bidi-mirroring-glyph (character)
332 "Returns the mirror image of CHARACTER if it exists.
333 Otherwise, returns NIL."
334 (when (mirrored-p character
)
335 (let ((ret (gethash (char-code character
) **bidi-mirroring-glyphs
**)))
336 (when ret
(code-char ret
)))))
338 (defun east-asian-width (character)
339 "Returns the East Asian Width property of CHARACTER as
340 one of the keywords :N (Narrow), :A (Ambiguous), :H (Halfwidth),
341 :W (Wide), :F (Fullwidth), or :NA (Not applicable)"
342 (svref-or-null *east-asian-widths
*
344 (aref **character-misc-database
**
345 (+ 5 (misc-index character
))))))
347 (defun script (character)
348 "Returns the Script property of CHARACTER as a keyword.
349 If CHARACTER does not have a known script, returns :UNKNOWN"
350 (svref-or-null *scripts
*
351 (aref **character-misc-database
** (+ 6 (misc-index character
)))))
353 (defun char-block (character)
354 "Returns the Unicode block in which CHARACTER resides as a keyword.
355 If CHARACTER does not have a known block, returns :NO-BLOCK"
356 (let* ((code (char-code character
))
357 (block-index (ordered-ranges-position code
**block-ranges
**)))
359 (aref *blocks
* block-index
) :no-block
)))
361 (defun unicode-1-name (character)
362 "Returns the name assigned to CHARACTER in Unicode 1.0 if it is distinct
363 from the name currently assigned to CHARACTER. Otherwise, returns NIL.
364 This property has been officially obsoleted by the Unicode standard, and
365 is only included for backwards compatibility."
366 (let* ((char-code (char-code character
))
367 (h-code (double-vector-binary-search char-code
368 **unicode-1-char-name-database
**)))
370 (huffman-decode h-code
**unicode-character-name-huffman-tree
**))))
372 (defun age (character)
373 "Returns the version of Unicode in which CHARACTER was assigned as a pair
374 of values, both integers, representing the major and minor version respectively.
375 If CHARACTER is not assigned in Unicode, returns NIL for both values."
376 (let* ((value (aref **character-misc-database
** (+ 8 (misc-index character
))))
377 (major (ash value -
3))
378 (minor (ldb (byte 3 0) value
)))
379 (if (zerop value
) (values nil nil
) (values major minor
))))
381 (defun hangul-syllable-type (character)
382 "Returns the Hangul syllable type of CHARACTER.
383 The syllable type can be one of :L, :V, :T, :LV, or :LVT.
384 If the character is not a Hangul syllable or Jamo, returns NIL"
385 (let ((cp (char-code character
)))
388 (and (<= #x1100 cp
) (<= cp
#x115f
))
389 (and (<= #xa960 cp
) (<= cp
#xa97c
))) :L
)
391 (and (<= #x1160 cp
) (<= cp
#x11a7
))
392 (and (<= #xd7B0 cp
) (<= cp
#xd7C6
))) :V
)
394 (and (<= #x11a8 cp
) (<= cp
#x11ff
))
395 (and (<= #xd7c8 cp
) (<= cp
#xd7fb
))) :T
)
396 ((and (<= #xac00 cp
) (<= cp
#xd7a3
))
397 (if (= 0 (rem (- cp
#xac00
) 28)) :LV
:LVT
)))))
399 (defun line-break-class (character &key resolve
)
400 "Returns the line breaking class of CHARACTER, as specified in UAX #14.
401 If :RESOLVE is NIL, returns the character class found in the property file.
402 If :RESOLVE is non-NIL, centain line-breaking classes will be mapped to othec
403 classes as specified in the applicable standards. Addinionally, if :RESOLVE
404 is :EAST-ASIAN, Ambigious (class :AI) characters will be mapped to the
405 Ideographic (:ID) class instead of Alphabetic (:AL)."
406 (when (and resolve
(listp character
)) (setf character
(car character
)))
407 (when (and resolve
(not character
)) (return-from line-break-class
:nil
))
409 (svref-or-null *line-break-classes
*
410 (aref **character-misc-database
** (+ 7 (misc-index character
)))))
411 (syllable-type (hangul-syllable-type character
)))
414 (cdr (assoc syllable-type
415 '((:l .
:JL
) (:v .
:JV
) (:t .
:JT
)
416 (:lv .
:H2
) (:lvt .
:H3
))))))
420 (:ai
(if (eql resolve
:east-asion
) :ID
:AL
))
421 ; If we see :CM when resolving, we have a CM that isn't subject
422 ; to LB9, so we do LB10
424 (:sa
(if (member (general-category character
) '(:Mn
:Mc
))
427 (:sg
(error "The character ~S is a surrogate, which should not
428 appear in an SBCL string. The line-breaking behavior of surrogates is undefined."
433 (defun uppercase-p (character)
434 "Returns T if CHARACTER has the Unicode property Uppercase and NIL otherwise"
435 (or (eql (general-category character
) :Lu
) (proplist-p character
:other-uppercase
)))
437 (defun lowercase-p (character)
438 "Returns T if CHARACTER has the Unicode property Lowercase and NIL otherwise"
439 (or (eql (general-category character
) :Ll
) (proplist-p character
:other-lowercase
)))
441 (defun cased-p (character)
442 "Returns T if CHARACTER has a (Unicode) case, and NIL otherwise"
443 (or (uppercase-p character
) (lowercase-p character
)
444 (eql (general-category character
) :Lt
)))
446 (defun case-ignorable-p (character)
447 "Returns T if CHARACTER is Case Ignorable as defined in Unicode 6.3, Chapter
449 (or (member (general-category character
)
450 '(:Mn
:Me
:Cf
:Lm
:Sk
))
451 (member (word-break-class character
)
452 '(:midletter
:midnumlet
:single-quote
))))
454 (defun alphabetic-p (character)
455 "Returns T if CHARACTER is Alphabetic according to the Unicode standard
457 (or (member (general-category character
) '(:Lu
:Ll
:Lt
:Lm
:Lo
:Nl
))
458 (proplist-p character
:other-alphabetic
)))
460 (defun ideographic-p (character)
461 "Returns T if CHARACTER has the Unicode property Ideographic,
462 which loosely corresponds to the set of \"Chinese characters\""
463 (proplist-p character
:ideographic
))
465 (defun math-p (character)
466 "Returns T if CHARACTER is a mathematical symbol according to Unicode and
468 (or (eql (general-category character
) :sm
) (proplist-p character
:other-math
)))
470 (defun whitespace-p (character)
471 "Returns T if CHARACTER is whitespace according to Unicode
473 (proplist-p character
:white-space
))
475 (defun hex-digit-p (character &key ascii
)
476 "Returns T if CHARACTER is a hexadecimal digit and NIL otherwise.
477 If :ASCII is non-NIL, fullwidth equivalents of the Latin letters A through F
479 (proplist-p character
(if ascii
:ascii-hex-digit
:hex-digit
)))
481 (defun soft-dotted-p (character)
482 "Returns T if CHARACTER has a soft dot (such as the dots on i and j) which
483 disappears when accents are placed on top of it. and NIL otherwise"
484 (proplist-p character
:soft-dotted
))
486 (defun default-ignorable-p (character)
487 "Returns T if CHARACTER is a Default_Ignorable_Code_Point"
489 (or (proplist-p character
:other-default-ignorable-code-point
)
490 (eql (general-category character
) :cf
)
491 (proplist-p character
:variation-selector
))
493 (or (whitespace-p character
)
494 (ordered-ranges-member
495 (char-code character
)
496 #(#x0600
#x0604
#x06DD
#x06DD
#x070F
#x070F
#xFFF9
#xFFFB
497 #x110BD
#x110BD
))))))
500 ;;; Implements UAX#15: Normalization Forms
501 (declaim (inline char-decomposition-info
))
502 (defun char-decomposition-info (char)
503 (let ((value (aref **character-misc-database
**
504 (+ 4 (misc-index char
)))))
505 (values (clear-flag 7 value
) (logbitp 7 value
))))
507 (defun char-decomposition (char length callback
)
508 (declare (function callback
))
509 ;; Caller should have gotten length from char-decomposition-info
510 (let* ((cp (char-code char
))
511 (cp-high (ash cp -
8))
512 (decompositions **character-decompositions
**)
513 (high-page (aref **character-high-pages
** cp-high
))
514 (index (unless (logbitp 15 high-page
) ;; Hangul syllable
515 (aref **character-low-pages
**
516 (+ 1 (* 2 (+ (ldb (byte 8 0) cp
) (ash high-page
8))))))))
518 (funcall callback
(code-char (aref decompositions index
))))
519 ((<= #xac00 cp
#xd7a3
)
520 ;; see Unicode 6.2, section 3-12
521 (let* ((sbase #xac00
)
527 (ncount (* vcount tcount
))
528 (sindex (- cp sbase
))
529 (lindex (floor sindex ncount
))
530 (vindex (floor (mod sindex ncount
) tcount
))
531 (tindex (mod sindex tcount
)))
532 (funcall callback
(code-char (+ lbase lindex
)))
533 (funcall callback
(code-char (+ vbase vindex
)))
535 (funcall callback
(code-char (+ tbase tindex
))))))
538 (loop for i below length
540 (funcall callback
(code-char (aref decompositions
(+ index i
)))))))))
542 (defun decompose-char (char compatibility callback
)
543 (declare (function callback
))
544 (multiple-value-bind (info compat
) (char-decomposition-info char
)
545 (if (and (plusp info
)
549 (dx-flet ((callback (char)
550 (decompose-char char t callback
)))
551 (char-decomposition char info
#'callback
))
552 (char-decomposition char info callback
))
553 (funcall callback char
))))
555 (defun decompose-string (string compatibility filter
)
558 (previous-combining-class 0))
559 (declare (type index length
))
560 (dx-flet ((callback (char)
561 (let ((combining-class (combining-class char
)))
563 (cond ((< 0 combining-class previous-combining-class
)
564 ;; Ensure it's sorted
565 (loop for cons on chars
566 for next-char
= (cadr cons
)
567 when
(or (not next-char
)
568 (<= 0 (combining-class next-char
) combining-class
))
570 (cons char
(cdr cons
)))
574 (setf previous-combining-class combining-class
))))))
575 (sb!kernel
:with-array-data
((string string
) (start) (end)
576 :check-fill-pointer t
)
577 (let ((calback (if filter
578 (let ((filter (sb!kernel
:%coerce-callable-to-fun filter
)))
580 (when (funcall filter char
)
583 (loop for i from start below end
584 for char
= (schar string i
)
586 (decompose-char char compatibility calback
))))
587 (let ((result (make-string length
)))
588 (loop for char in chars
589 for i from
(1- length
) downto
0
590 do
(setf (schar result i
) char
))
593 (defun composition-hangul-syllable-type (cp)
595 ((and (<= #x1100 cp
) (<= cp
#x1112
)) :L
)
596 ((and (<= #x1161 cp
) (<= cp
#x1175
)) :V
)
597 ((and (<= #x11a8 cp
) (<= cp
#x11c2
)) :T
)
598 ((and (<= #xac00 cp
) (<= cp
#.
(+ #xac00
11171)))
599 (if (= 0 (rem (- cp
#xac00
) 28)) :LV
:LVT
))))
601 (defun primary-composition (char1 char2
)
602 (flet ((maybe (fn x
) (when x
(funcall fn x
))))
603 (let ((c1 (char-code char1
))
604 (c2 (char-code char2
)))
608 ((gethash (dpb c1
(byte 21 21) c2
)
609 **character-primary-compositions
**))
610 ((and (eql (composition-hangul-syllable-type c1
) :L
)
611 (eql (composition-hangul-syllable-type c2
) :V
))
612 (let ((lindex (- c1
#x1100
))
613 (vindex (- c2
#x1161
)))
614 (+ #xac00
(* lindex
588) (* vindex
28))))
615 ((and (eql (composition-hangul-syllable-type c1
) :LV
)
616 (eql (composition-hangul-syllable-type c2
) :T
))
617 (+ c1
(- c2
#x11a7
))))))))
619 ;;; This implements a sequence data structure, specialized for
620 ;;; efficient deletion of characters at an index, along with tolerable
621 ;;; random access. The purpose is to support the canonical
622 ;;; composition algorithm from Unicode, which involves replacing (not
623 ;;; necessarily consecutive) pairs of code points with a single code
624 ;;; point (e.g. [#\e #\combining_acute_accent] with
625 ;;; #\latin_small_letter_e_with_acute). The data structure is a list
626 ;;; of three-element lists, each denoting a chunk of string data
627 ;;; starting at the first index and ending at the second.
629 ;;; Actually, the implementation isn't particularly efficient, and
630 ;;; would probably benefit from being rewritten in terms of displaced
631 ;;; arrays, which would substantially reduce copying.
633 ;;; (also, generic sequences. *sigh*.)
634 (defun lref (lstring index
)
636 (when (and (<= (first l
) index
)
637 (< index
(second l
)))
638 (return (aref (third l
) (- index
(first l
)))))))
640 (defun (setf lref
) (newchar lstring index
)
642 (when (and (<= (first l
) index
)
643 (< index
(second l
)))
644 (return (setf (aref (third l
) (- index
(first l
))) newchar
)))))
646 (defun llength (lstring)
647 (second (first (last lstring
))))
649 (defun lstring (lstring)
650 (let ((result (make-string (llength lstring
))))
651 (dolist (l lstring result
)
652 (replace result
(third l
) :start1
(first l
) :end1
(second l
)))))
654 (defun ldelete (lstring index
)
655 (do* ((ls lstring
(cdr ls
))
656 (l (car ls
) (car ls
))
658 ((and (<= (first l
) index
)
659 (< index
(second l
)))
664 (list (list (first l
) (1- (second l
)) (subseq (third l
) 1))))
665 ((= index
(1- (second l
)))
666 (list (list (first l
) (1- (second l
)) (subseq (third l
) 0 (1- (length (third l
)))))))
669 (list (first l
) index
670 (subseq (third l
) 0 (- index
(first l
))))
671 (list index
(1- (second l
))
672 (subseq (third l
) (1+ (- index
(first l
))))))))
673 (mapcar (lambda (x) (list (1- (first x
)) (1- (second x
)) (third x
)))
677 (defun canonically-compose (string)
678 (let* ((result (list (list 0 (length string
) string
)))
679 (previous-starter-index (position 0 string
:key
#'combining-class
))
680 (i (and previous-starter-index
(1+ previous-starter-index
))))
681 (when (or (not i
) (= i
(length string
)))
682 (return-from canonically-compose string
))
685 (when (and (>= (- i previous-starter-index
) 2)
686 ;; test for Blocked (Unicode 3.11 para. D115)
688 ;; (assumes here that string has sorted combiners,
689 ;; so can look back just one step)
690 (>= (combining-class (lref result
(1- i
)))
691 (combining-class (lref result i
))))
692 (when (= (combining-class (lref result i
)) 0)
693 (setf previous-starter-index i
))
697 (let ((comp (primary-composition (lref result previous-starter-index
)
701 (setf (lref result previous-starter-index
) comp
)
702 (setf result
(ldelete result i
)))
704 (when (= (combining-class (lref result i
)) 0)
705 (setf previous-starter-index i
))
708 (unless (= i
(llength result
))
710 (if (= i
(length string
))
714 (defun normalize-string (string &optional
(form :nfd
)
716 "Normalize STRING to the Unicode normalization form form.
717 Acceptable values for form are :NFD, :NFC, :NFKD, and :NFKC.
718 If FILTER is a function it is called on each decomposed character and
719 only characters for which it returns T are collected."
720 (declare (type (member :nfd
:nfkd
:nfc
:nfkc
) form
))
722 (declare (ignore filter
))
725 ((array nil
(*)) string
)
728 ((:nfc
:nfkc
) string
)
729 ((:nfd
:nfkd
) (error "Cannot normalize to ~A form in #-SB-UNICODE builds" form
)))))
733 ((array character
(*))
736 (canonically-compose (decompose-string string nil filter
)))
738 (decompose-string string nil filter
))
740 (canonically-compose (decompose-string string t filter
)))
742 (decompose-string string t filter
))))
743 ((array nil
(*)) string
)))
745 (defun normalized-p (string &optional
(form :nfd
))
746 "Tests if STRING is normalized to FORM"
747 ;; FIXME: can be optimized
748 (string= string
(normalize-string string form
)))
751 ;;; Unicode case algorithms
752 ;; FIXME: Make these parts less redundant (macro?)
753 (defparameter **special-titlecases
**
754 '#.
(with-open-file (stream
758 '(:relative
:up
:up
"output")
759 :name
"titlecases" :type
"lisp-expr")
760 sb
!xc
:*compile-file-truename
*)
762 :element-type
'character
)
765 (defparameter **special-casefolds
**
766 '#.
(with-open-file (stream
770 '(:relative
:up
:up
"output")
771 :name
"foldcases" :type
"lisp-expr")
772 sb
!xc
:*compile-file-truename
*)
774 :element-type
'character
)
777 (defun has-case-p (char)
778 ;; Bit 6 is the Unicode case flag, as opposed to the Common Lisp one
779 (logbitp 6 (aref **character-misc-database
** (+ 5 (misc-index char
)))))
781 (defun char-uppercase (char)
782 (if (has-case-p char
)
783 (let ((cp (car (char-case-info char
))))
784 (if (atom cp
) (list (code-char cp
)) (mapcar #'code-char cp
)))
787 (defun char-lowercase (char)
788 (if (has-case-p char
)
789 (let ((cp (cdr (char-case-info char
))))
790 (if (atom cp
) (list (code-char cp
)) (mapcar #'code-char cp
)))
793 (defun char-titlecase (char)
794 (unless (has-case-p char
) (return-from char-titlecase
(list char
)))
795 (let* ((cp (char-code char
))
796 (value (assoc cp
**special-titlecases
**)))
798 (if (atom (cdr value
))
799 (list (code-char (cdr value
)))
800 (mapcar #'code-char
(cdr value
)))
801 (char-uppercase char
))))
803 (defun char-foldcase (char)
804 (unless (has-case-p char
) (return-from char-foldcase
(list char
)))
805 (let* ((cp (char-code char
))
806 (value (assoc cp
**special-casefolds
**)))
808 (if (atom (cdr value
))
809 (list (code-char (cdr value
)))
810 (mapcar #'code-char
(cdr value
)))
811 (char-lowercase char
))))
813 (defun string-somethingcase (fn string special-fn
)
814 (let (result (len (length string
)))
815 (loop for index from
0 below len
816 for char
= (char string index
)
817 for cased
= (or (funcall special-fn char index len
)
819 do
(loop for c in
(remove :none cased
) do
(push c result
)))
820 (setf result
(nreverse result
))
821 (coerce result
'string
)))
823 (declaim (type function sb
!unix
::posix-getenv
))
824 (defun get-user-locale ()
826 #!+(or win32 unix
) (or (sb!unix
::posix-getenv
"LC_ALL")
827 (sb!unix
::posix-getenv
"LANG"))
828 #!-
(or win32 unix
) nil
))
830 (let ((lang-code (string-upcase
831 (subseq raw-locale
0 (position #\_ raw-locale
)))))
833 (intern lang-code
"KEYWORD"))))))
836 (defun uppercase (string &key locale
)
837 "Returns the full uppercase of STRING according to the Unicode standard.
838 The result is not guaranteed to have the same length as the input. If :LOCALE
839 is NIL, no language-specific case transformations are applied. If :LOCALE is a
840 keyword representing a two-letter ISO country code, the case transforms of that
841 locale are used. If :LOCALE is T, the user's current locale is used (Unix and
843 (when (eq locale t
) (setf locale
(get-user-locale)))
844 (string-somethingcase
845 #'char-uppercase string
846 #!-sb-unicode
(constantly nil
)
847 #!+sb-unicode
;; code-char with a constant > 255 breaks the build
848 #'(lambda (char index len
)
849 (declare (ignore len
))
851 ((and (eql locale
:lt
) (char= char
(code-char #x0307
))
852 (loop for i from
(1- index
) downto
0
853 for c
= (char string i
)
854 do
(case (combining-class c
)
855 (0 (return (soft-dotted-p c
)))
858 finally
(return nil
)))
860 ((and (or (eql locale
:tr
) (eql locale
:az
))
862 (list (code-char #x0130
)))
865 (defun lowercase (string &key locale
)
866 "Returns the full lowercase of STRING according to the Unicode standard.
867 The result is not guaranteed to have the same length as the input.
868 :LOCALE has the same semantics as the :LOCALE argument to UPPERCASE."
869 (when (eq locale t
) (setf locale
(get-user-locale)))
870 (string-somethingcase
871 #'char-lowercase string
872 #!-sb-unicode
(constantly nil
)
874 #'(lambda (char index len
)
876 ((and (char= char
(code-char #x03A3
))
877 (loop for i from
(1- index
) downto
0
878 for c
= (char string i
)
879 do
(cond ((cased-p c
) (return t
))
880 ((case-ignorable-p c
))
882 finally
(return nil
))
883 (loop for i from
(1+ index
) below len
884 for c
= (char string i
)
885 do
(cond ((cased-p c
) (return nil
))
886 ((case-ignorable-p c
))
889 (list (code-char #x03C2
)))
894 (assoc (char-code char
)
895 '((#x00CC .
(#x0069
#x0307
#x0300
))
896 (#x00CD .
(#x0069
#x0307
#x0301
))
897 (#x0128 .
(#x0069
#x0307
#x0303
))))
898 (and (loop for i from
(1+ index
) below len
899 for c
= (char string i
)
900 do
(case (combining-class c
)
904 finally
(return nil
))
905 (assoc (char-code char
)
906 '((#x0049 .
(#x0069
#x0307
))
907 (#x004A .
(#x006A
#x0307
))
908 (#x012E .
(#x012F
#x0307
)))))))))
909 ((or (eql locale
:tr
) (eql locale
:az
))
911 ((char= char
(code-char #x0130
)) (list #\i
))
912 ((and (char= char
(code-char #x0307
))
913 (loop for i from
(1- index
) downto
0
914 for c
= (char string i
)
915 do
(case (combining-class c
)
916 (0 (return (char= c
#\I
)))
919 finally
(return nil
)))
921 ((and (char= char
#\I
)
922 (loop for i from
(1+ index
) below len
923 for c
= (char string i
)
924 do
(case (combining-class c
)
926 (230 (return (char/= c
(code-char #x0307
))))
929 (list (code-char #x0131
)))
933 (defun titlecase (string &key locale
)
934 "Returns the titlecase of STRING. The resulting string can
935 be longer than the input.
936 :LOCALE has the same semantics as the :LOCALE argument to UPPERCASE."
937 (when (eq locale t
) (setf locale
(get-user-locale)))
938 (let ((words (words string
))
940 (loop for word in words
941 for first-cased
= (or (position-if #'cased-p word
) 0)
942 for pre
= (subseq word
0 first-cased
)
943 for initial
= (char word first-cased
)
944 for rest
= (subseq word
(1+ first-cased
))
945 do
(let ((up (char-titlecase initial
)) (down (lowercase rest
)))
947 (when (and (or (eql locale
:tr
) (eql locale
:az
))
949 (setf up
(list (code-char #x0130
))))
951 (when (and (eql locale
:lt
)
952 (soft-dotted-p initial
)
956 (or (eql (combining-class c
) 0)
957 (eql (combining-class c
) 230))) down
))
959 (setf down
(delete (code-char #x0307
) down
:count
1)))
960 (push (concatenate 'string pre up down
) cased
)))
961 (apply #'concatenate
'string
(nreverse cased
))))
963 (defun casefold (string)
964 "Returns the full casefolding of STRING according to the Unicode standard.
965 Casefolding removes case information in a way that allows the results to be used
966 for case-insensitive comparisons.
967 The result is not guaranteed to have the same length as the input."
968 (string-somethingcase #'char-foldcase string
(constantly nil
)))
971 ;;; Unicode break algorithms
972 ;;; In all the breaking methods:
973 ;;; (brk) establishes a break between `first` and `second`
974 ;;; (nobrk) prevents a break between `first` and `second`
975 ;;; Setting flag=T/state=:nobrk-next prevents a break between `second` and `htird`
977 ;; Word breaking sets this to make their algorithms less tricky
978 (defvar *other-break-special-graphemes
* nil
)
979 (defun grapheme-break-class (char)
980 "Returns the grapheme breaking class of CHARACTER, as specified in UAX #29."
981 (let ((cp (when char
(char-code char
)))
982 (gc (when char
(general-category char
)))
984 #(#x102B
#x102C
#x1038
#x1062
#x1063
#x1064
#x1067
#x1068
#x1069
985 #x106A
#x106B
#x106C
#x106D
#x1083
#x1087
#x1088
#x1089
#x108A
986 #x108B
#x108C
#x108F
#x109A
#x109B
#x109C
#x19B0
#x19B1
#x19B2
987 #x19B3
#x19B4
#x19B8
#x19B9
#x19BB
#x19BC
#x19BD
#x19BE
#x19BF
988 #x19C0
#x19C8
#x19C9
#x1A61
#x1A63
#x1A64
#xAA7B
#xAA7D
)))
993 ((or (member gc
'(:Mn
:Me
))
994 (proplist-p char
:other-grapheme-extend
)
995 (and *other-break-special-graphemes
*
996 (member gc
'(:Mc
:Cf
)) (not (<= #x200B cp
#x200D
))))
998 ((or (member gc
'(:Zl
:Zp
:Cc
:Cs
:Cf
))
999 ;; From Cn and Default_Ignorable_Code_Point
1000 (eql cp
#x2065
) (eql cp
#xE0000
)
1001 (<= #xFFF0 cp
#xFFF8
)
1002 (<= #xE0002 cp
#xE001F
)
1003 (<= #xE0080 cp
#xE00FF
)
1004 (<= #xE01F0 cp
#xE0FFF
)) :control
)
1005 ((<= #x1F1E6 cp
#x1F1FF
) :regional-indicator
)
1006 ((and (or (eql gc
:Mc
)
1007 (eql cp
#x0E33
) (eql cp
#x0EB3
))
1008 (not (binary-search cp not-spacing-mark
))) :spacing-mark
)
1009 (t (hangul-syllable-type char
)))))
1011 (macrolet ((def (name extendedp
)
1012 `(defun ,name
(function string
)
1013 (do ((length (length string
))
1017 (c2 (and (> (length string
) 0) (grapheme-break-class (char string
0)))))
1019 (if (= end length
) (progn (funcall function string start end
) nil
)))
1020 (flet ((brk () (funcall function string start end
) (setf start end
)))
1021 (declare (truly-dynamic-extent #'brk
))
1022 (shiftf c1 c2
(grapheme-break-class (char string end
)))
1024 ((and (eql c1
:cr
) (eql c2
:lf
)))
1025 ((or (member c1
'(:control
:cr
:lf
))
1026 (member c2
'(:control
:cr
:lf
)))
1028 ((or (and (eql c1
:l
) (member c2
'(:l
:v
:lv
:lvt
)))
1029 (and (or (eql c1
:v
) (eql c1
:lv
))
1030 (or (eql c2
:v
) (eql c2
:t
)))
1031 (and (eql c2
:t
) (or (eql c1
:lvt
) (eql c1
:t
)))))
1032 ((and (eql c1
:regional-indicator
) (eql c2
:regional-indicator
)))
1035 `(((or (eql c2
:spacing-mark
) (eql c1
:prepend
)))))
1037 (def map-legacy-grapheme-boundaries nil
)
1038 (def map-grapheme-boundaries t
))
1040 (macrolet ((def (name mapper
)
1041 `(defun ,name
(function string
)
1042 (let ((array (make-array 0 :element-type
(array-element-type string
) :adjustable t
:displaced-to string
)))
1043 (flet ((fun (string start end
)
1044 (declare (type string string
))
1045 (funcall function
(adjust-array array
(- end start
) :displaced-to string
:displaced-index-offset start
))))
1046 (declare (truly-dynamic-extent #'fun
))
1047 (,mapper
#'fun string
))))))
1048 (def map-legacy-graphemes map-legacy-grapheme-boundaries
)
1049 (def map-graphemes map-grapheme-boundaries
))
1051 (defun graphemes (string)
1052 "Breaks STRING into graphemes acording to the default
1053 grapheme breaking rules specified in UAX #29, returning a list of strings."
1055 (map-graphemes (lambda (a) (push (subseq a
0) result
)) string
)
1058 (defun word-break-class (char)
1059 "Returns the word breaking class of CHARACTER, as specified in UAX #29."
1060 ;; Words use graphemes as characters to deal with the ignore rule
1061 (when (listp char
) (setf char
(car char
)))
1062 (let ((cp (when char
(char-code char
)))
1063 (gc (when char
(general-category char
)))
1064 (newlines #(#xB
#xC
#x0085
#x0085
#x2028
#x2029
))
1066 #(#x3031
#x3035
#x309B
#x309C
1067 #x30A0
#x30A0
#x30FC
#x30FC
1069 (midnumlet #(#x002E
#x2018
#x2019
#x2024
#xFE52
#xFF07
#xFF0E
))
1071 #(#x003A
#x00B7
#x002D7
#x0387
#x05F4
#x2027
#xFE13
#xFE55
#xFF1A
))
1073 ;; Grepping of Line_Break = IS adjusted per UAX #29
1074 #(#x002C
#x003B
#x037E
#x0589
#x060C
#x060D
#x066C
#x07F8
#x2044
1075 #xFE10
#xFE14
#xFE50
#xFE54
#xFF0C
#xFF1B
)))
1080 ((= cp
#x27
) :single-quote
)
1081 ((= cp
#x22
) :double-quote
)
1082 ((ordered-ranges-member cp newlines
) :newline
)
1083 ((or (eql (grapheme-break-class char
) :extend
)
1084 (eql gc
:mc
)) :extend
)
1085 ((<= #x1F1E6 cp
#x1F1FF
) :regional-indicator
)
1086 ((and (eql gc
:Cf
) (not (<= #x200B cp
#x200D
))) :format
)
1087 ((or (eql (script char
) :katakana
)
1088 (ordered-ranges-member cp also-katakana
)) :katakana
)
1089 ((and (eql (script char
) :Hebrew
) (eql gc
:lo
)) :hebrew-letter
)
1090 ((and (or (alphabetic-p char
) (= cp
#x05F3
))
1091 (not (or (ideographic-p char
)
1092 (eql (line-break-class char
) :sa
)
1093 (eql (script char
) :hiragana
)))) :aletter
)
1094 ((binary-search cp midnumlet
) :midnumlet
)
1095 ((binary-search cp midletter
) :midletter
)
1096 ((binary-search cp midnum
) :midnum
)
1097 ((or (and (eql gc
:Nd
) (not (<= #xFF10 cp
#xFF19
))) ;Fullwidth digits
1098 (eql cp
#x066B
)) :numeric
)
1099 ((eql gc
:Pc
) :extendnumlet
)
1102 (defmacro flatpush
(thing list
)
1103 (let ((%thing
(gensym)) (%i
(gensym)))
1104 `(let ((,%thing
,thing
))
1106 (dolist (,%i
,%thing
)
1108 (push ,%thing
,list
)))))
1110 (defun words (string)
1111 "Breaks STRING into words acording to the default
1112 word breaking rules specified in UAX #29. Returns a list of strings"
1113 (let ((chars (mapcar
1115 (let ((l (coerce s
'list
)))
1116 (if (cdr l
) l
(car l
))))
1117 (let ((*other-break-special-graphemes
* t
)) (graphemes string
))))
1119 (flatpush (car chars
) word
)
1120 (do ((first (car chars
) second
)
1121 (tail (cdr chars
) (cdr tail
))
1122 (second (cadr chars
) (cadr tail
)))
1123 ((not first
) (nreverse (mapcar #'(lambda (l) (coerce l
'string
)) words
)))
1124 (flet ((brk () (push (nreverse word
) words
) (setf word nil
) (flatpush second word
))
1125 (nobrk () (flatpush second word
)))
1126 (let ((c1 (word-break-class first
))
1127 (c2 (word-break-class second
))
1128 (c3 (when (and tail
(cdr tail
)) (word-break-class (cadr tail
)))))
1130 (flag (nobrk) (setf flag nil
))
1131 ;; CR+LF are bound together by the grapheme clustering
1132 ((or (eql c1
:newline
) (eql c1
:cr
) (eql c1
:lf
)
1133 (eql c2
:newline
) (eql c2
:cr
) (eql c2
:lf
)) (brk))
1134 ((or (eql c2
:format
) (eql c2
:extend
)) (nobrk))
1135 ((and (or (eql c1
:aletter
) (eql c1
:hebrew-letter
))
1136 (or (eql c2
:aletter
) (eql c2
:hebrew-letter
))) (nobrk))
1137 ((and (or (eql c1
:aletter
) (eql c1
:hebrew-letter
))
1138 (member c2
'(:midletter
:midnumlet
:single-quote
))
1139 (or (eql c3
:aletter
) (eql c3
:hebrew-letter
)))
1140 (nobrk) (setf flag t
)) ; Handle the multiple breaks from this rule
1141 ((and (eql c1
:hebrew-letter
) (eql c2
:double-quote
)
1142 (eql c3
:hebrew-letter
))
1143 (nobrk) (setf flag t
))
1144 ((and (eql c1
:hebrew-letter
) (eql c2
:single-quote
)) (nobrk))
1145 ((or (and (eql c1
:numeric
) (member c2
'(:numeric
:aletter
:hebrew-letter
)))
1146 (and (eql c2
:numeric
) (member c1
'(:numeric
:aletter
:hebrew-letter
))))
1148 ((and (eql c1
:numeric
)
1149 (member c2
'(:midnum
:midnumlet
:single-quote
))
1151 (nobrk) (setf flag t
))
1152 ((and (eql c1
:katakana
) (eql c2
:katakana
)) (nobrk))
1153 ((or (and (member c1
1154 '(:aletter
:hebrew-letter
:katakana
1155 :numeric
:extendnumlet
)) (eql c2
:extendnumlet
))
1157 '(:aletter
:hebrew-letter
:katakana
1158 :numeric
:extendnumlet
)) (eql c1
:extendnumlet
)))
1160 ((and (eql c1
:regional-indicator
) (eql c2
:regional-indicator
)) (nobrk))
1163 (defun sentence-break-class (char)
1164 "Returns the sentence breaking class of CHARACTER, as specified in UAX #29."
1165 (when (listp char
) (setf char
(car char
)))
1166 (let ((cp (when char
(char-code char
)))
1167 (gc (when char
(general-category char
)))
1168 (aterms #(#x002E
#x2024
#xFE52
#xFF0E
))
1170 #(#x002C
#x002D
#x003A
#x055D
#x060C
#x060D
#x07F8
#x1802
#x1808
1171 #x2013
#x2014
#x3001
#xFE10
#xFE11
#xFE13
#xFE31
#xFE32
#xFE50
1172 #xFE51
#xFE55
#xFE58
#xFE63
#xFF0C
#xFF0D
#xFF1A
#xFF64
)))
1177 ((or (eql (grapheme-break-class char
) :extend
)
1178 (eql gc
:mc
)) :extend
)
1179 ((or (eql cp
#x0085
) (<= #x2028 cp
#x2029
)) :sep
)
1180 ((and (eql gc
:Cf
) (not (<= #x200C cp
#x200D
))) :format
)
1181 ((whitespace-p char
) :sp
)
1182 ((lowercase-p char
) :lower
)
1183 ((or (uppercase-p char
) (eql gc
:Lt
)) :upper
)
1184 ((or (alphabetic-p char
) (eql cp
#x00A0
) (eql cp
#x05F3
)) :oletter
)
1185 ((or (and (eql gc
:Nd
) (not (<= #xFF10 cp
#xFF19
))) ;Fullwidth digits
1186 (<= #x066B cp
#x066C
)) :numeric
)
1187 ((binary-search cp aterms
) :aterm
)
1188 ((binary-search cp scontinues
) :scontinue
)
1189 ((proplist-p char
:sterm
) :sterm
)
1190 ((and (or (member gc
'(:Po
:Ps
:Pe
:Pf
:Pi
))
1191 (eql (line-break-class char
) :qu
))
1192 (not (eql cp
#x05F3
))) :close
)
1195 (defun sentence-prebreak (string)
1196 "Pre-combines some sequences of characters to make the sentence-break
1199 - Combines any character with the following extend of format characters
1200 - Combines CR + LF into '(CR LF)
1201 - Combines any run of :cp*:close* into one character"
1202 (let ((chars (coerce string
'list
))
1203 cluster clusters last-seen sp-run
)
1204 (labels ((flush () (if (cdr cluster
) (push (nreverse cluster
) clusters
)
1205 (if cluster
(push (car cluster
) clusters
)))
1208 (flush) (push x clusters
))
1209 (nobrk (x) (push x cluster
)))
1210 (loop for ch in chars
1211 for type
= (sentence-break-class ch
)
1213 ((and (eql last-seen
:cr
) (eql type
:lf
)) (nobrk ch
) (flush) (setf last-seen nil
))
1214 ((eql last-seen
:cr
) (brk ch
) (setf last-seen nil
))
1215 ((eql type
:cr
) (nobrk ch
) (setf last-seen
:cr
))
1216 ((eql type
:lf
) (brk ch
) (setf last-seen nil
))
1217 ((eql type
:sep
) (brk ch
) (setf last-seen nil
))
1218 ((and last-seen
(or (eql type
:extend
) (eql type
:format
)))
1221 (unless (eql last-seen
:close
) (flush))
1222 (nobrk ch
) (setf last-seen
:close sp-run nil
))
1224 (unless (or (and (not sp-run
) (eql last-seen
:close
)) (eql last-seen
:sp
))
1225 (flush) (setf sp-run t
))
1226 (nobrk ch
) (setf last-seen
:sp
))
1227 (t (flush) (nobrk ch
) (setf last-seen type sp-run nil
))))
1228 (flush) (nreverse clusters
))))
1230 (defun sentences (string)
1231 "Breaks STRING into sentences acording to the default
1232 sentence breaking rules specified in UAX #29"
1233 (let ((special-handling '(:close
:sp
:sep
:cr
:lf
:scontinue
:sterm
:aterm
))
1234 (chars (sentence-prebreak string
))
1235 sentence sentences state
)
1236 (flatpush (car chars
) sentence
)
1237 (do ((first (car chars
) second
)
1238 (tail (cdr chars
) (cdr tail
))
1239 (second (cadr chars
) (cadr tail
))
1240 (third (caddr chars
) (caddr tail
)))
1243 ; Shake off last sentence
1244 (when sentence
(push (nreverse sentence
) sentences
))
1245 (nreverse (mapcar #'(lambda (l) (coerce l
'string
)) sentences
))))
1246 (flet ((brk () (push (nreverse sentence
) sentences
)
1247 (setf sentence nil
) (flatpush second sentence
))
1248 (nobrk () (flatpush second sentence
)))
1249 (let ((c1 (sentence-break-class first
))
1250 (c2 (sentence-break-class second
))
1251 (c3 (sentence-break-class third
)))
1253 ((eql state
:brk-next
) (brk) (setf state nil
))
1254 ((eql state
:nobrk-next
) (nobrk) (setf state nil
))
1255 ((member c1
'(:sep
:cr
:lf
)) (brk))
1256 ((and (eql c1
:aterm
) (eql c2
:numeric
)) (nobrk))
1257 ((and (eql c1
:upper
) (eql c2
:aterm
)
1258 (eql c3
:upper
)) (nobrk) (setf state
:nobrk-next
))
1259 ((or (and (member c1
'(:sterm
:aterm
)) (member c2
'(:close
:sp
))
1260 (member c3
'(:scontinue
:sterm
:aterm
)))
1261 (and (member c1
'(:sterm
:aterm
))
1262 (member c2
'(:scontinue
:sterm
:aterm
))))
1263 (nobrk) (when (member c2
'(:close
:sp
)) (setf state
:nobrk-next
)))
1264 ((and (member c1
'(:sterm
:aterm
)) (member c2
'(:close
:sp
))
1265 (member c3
'(:sep
:cr
:lf
)))
1266 (nobrk) (setf state
:nobrk-next
)) ;; Let the linebreak call (brk)
1267 ((and (member c1
'(:sterm
:aterm
)) (member c2
'(:sep
:cr
:lf
)))
1268 (nobrk)) ; Doesn't trigger rule 8
1269 ((eql c1
:sterm
) ; Not ambiguous anymore, rule 8a already handled
1270 (if (member c2
'(:close
:sp
))
1271 (progn (nobrk) (setf state
:brk-next
))
1273 ((and (eql c2
:sterm
) third
(not (member c3 special-handling
)))
1274 (nobrk) (setf state
:brk-next
)) ; STerm followed by nothing important
1275 ((or (eql c1
:aterm
)
1276 (and (eql c2
:aterm
) third
1277 (not (member c3 special-handling
)) (not (eql c3
:numeric
))))
1278 ; Finally handle rule 8
1280 (if (and third
(not (or (member c3 special-handling
)
1281 (eql c3
:numeric
))))
1283 for type
= (sentence-break-class c
) do
1284 (when (member type
'(:oletter
:upper
:sep
:cr
:lf
1287 (when (eql type
:lower
) (return t
)) finally
(return nil
))
1289 (progn (nobrk) (setf state
:nobrk-next
))
1291 (if (member c2
'(:close
:sp
:aterm
))
1292 (progn (nobrk) (setf state
:brk-next
))
1296 (defun line-prebreak (string)
1297 (let ((chars (coerce string
'list
))
1298 cluster clusters last-seen
)
1299 (loop for char in chars
1300 for type
= (line-break-class char
)
1305 (not (eql type
:cm
))
1307 (member last-seen
'(nil :BK
:CR
:LF
:NL
:SP
:ZW
)))))
1309 (push (nreverse cluster
) clusters
)
1310 (push (car cluster
) clusters
))
1312 (unless (eql type
:cm
) (setf last-seen type
))
1313 (push char cluster
))
1315 (push (nreverse cluster
) clusters
)
1316 (push (car cluster
) clusters
))
1317 (nreverse clusters
)))
1319 (defun line-break-annotate (string)
1320 (let ((chars (line-prebreak string
))
1321 first second t1 t2 tail
(ret (list :cant
))
1323 (macrolet ((cmpush (thing)
1324 (let ((gthing (gensym)))
1325 `(let ((,gthing
,thing
))
1327 (loop for
(c next
) on
,gthing do
1329 (when next
(push :cant ret
)))
1330 (push ,thing ret
)))))
1331 (between (a b action
)
1332 (let ((atest (if (eql a
:any
) t
1336 (btest (if (eql b
:any
) t
1340 `(when (and ,atest
,btest
)
1344 (after-spaces (a b action
)
1345 (let ((atest (if (eql a
:any
) t
1349 (btest (if (eql b
:any
) t
1356 for type
= (line-break-class c
:resolve t
)
1358 (when (not (eql type
:sp
))
1361 (progn (cmpush :cant
)
1363 (setf state
:eat-spaces
)
1364 (setf after-spaces
,action
)
1366 (progn (cmpush ,action
)
1370 (cmpush (car chars
))
1371 (setf first
(car chars
))
1372 (setf tail
(cdr chars
))
1373 (setf second
(car tail
))
1376 (when (not first
) (go end
))
1377 (setf t1
(line-break-class first
:resolve t
))
1378 (setf t2
(line-break-class second
:resolve t
))
1379 (between :any
:nil
:must
)
1380 (when (and (eql state
:eat-spaces
) (eql t2
:sp
))
1381 (cmpush :cant
) (cmpush second
) (go tail
))
1382 (between :bk
:any
:must
)
1383 (between :cr
:lf
:cant
)
1384 (between '(:cr
:lf
:nl
) :any
:must
)
1385 (between :any
'(:zw
:bk
:cr
:lf
:nl
) :cant
)
1386 (when after-spaces
(cmpush after-spaces
) (cmpush second
)
1387 (setf state nil after-spaces nil
) (go tail
))
1388 (after-spaces :zw
:any
:can
)
1389 (between :any
:wj
:cant
)
1390 (between :wj
:any
:cant
)
1391 (between :gl
:any
:cant
)
1392 (between '(:ZW
:WJ
:SY
:SG
:SA
:RI
:QU
:PR
:PO
:OP
:NU
:NS
:NL
1393 :LF
:IS
:IN
:ID
:HL
:GL
:EX
:CR
:CP
:CM
:CL
:CJ
:CB
1394 :BK
:BB
:B2
:AL
:AI
:JL
:JV
:JT
:H2
:H3
:XX
)
1396 (between :any
'(:cl
:cp
:ex
:is
:sy
) :cant
)
1397 (after-spaces :op
:any
:cant
)
1398 (after-spaces :qu
:op
:cant
)
1399 (after-spaces '(:cl
:cp
) :ns
:cant
)
1400 (after-spaces :b2
:b2
:cant
)
1401 (between :any
:sp
:cant
) ;; Goes here to deal with after-spaces
1402 (between :sp
:any
:can
)
1403 (between :any
:qu
:cant
)
1404 (between :qu
:any
:cant
)
1405 (between :any
:cb
:can
)
1406 (between :cb
:any
:can
)
1407 (between :any
'(:ba
:hy
:ns
) :cant
)
1408 (between :bb
:any
:cant
)
1409 (when (and (eql t1
:hl
) (eql t2
:hy
))
1410 (cmpush :cant
) (cmpush second
)
1411 (setf after-spaces
:can
) (go tail
))
1412 (between '(:al
:hl
:id
:in
:nu
) :in
:cant
)
1413 (between :id
:po
:cant
)
1414 (between '(:al
:hl
) :nu
:cant
)
1415 (between '(:nu
:po
) '(:al
:hl
) :cant
)
1416 (between :pr
'(:id
:al
:hl
) :cant
)
1417 (between '(:cl
:cp
:nu
) '(:po
:pr
) :cant
)
1418 (between :nu
'(:po
:pr
:nu
) :cant
)
1419 (between '(:po
:pr
) :op
:cant
)
1420 (between '(:po
:pr
:hy
:is
:sy
) :nu
:cant
)
1421 (between :jl
'(:jl
:jv
:h2
:h3
) :cant
)
1422 (between '(:jv
:h2
) '(:jv
:jt
) :cant
)
1423 (between '(:jt
:h3
) :jt
:cant
)
1424 (between '(:jl
:jv
:jt
:h2
:h3
) '(:in
:po
) :cant
)
1425 (between :pr
'(:jl
:jv
:jt
:h2
:h3
) :cant
)
1426 (between '(:al
:hl
:is
) '(:al
:hl
) :cant
)
1427 (between '(:al
:hl
:nu
) :op
:cant
)
1428 (between :cp
'(:al
:hl
:nu
) :cant
)
1429 (between :ri
:ri
:cant
)
1430 (between :any
:any
:can
)
1433 (setf tail
(cdr tail
))
1434 (setf second
(car tail
))
1437 ;; LB3 satisfied by (:any :nil) -> :must
1438 (setf ret
(nreverse ret
))
1441 (defun break-list-at (list n
)
1442 (let ((tail list
) (pre-tail nil
))
1443 (loop repeat n do
(setf pre-tail tail
) (setf tail
(cdr tail
)))
1444 (setf (cdr pre-tail
) nil
)
1445 (values list tail
)))
1447 (defun lines (string &key
(margin *print-right-margin
*))
1448 "Breaks STRING into lines that are no wider than :MARGIN according to the
1449 line breaking rules outlined in UAX #14. Combining marks will always be kept
1450 together with their base characters, and spaces (but not other types of
1451 whitespace) will be removed from the end of lines. If :MARGIN is unspecified,
1452 it defaults to 80 characters"
1453 (when (string= string
"") (return-from lines
(list "")))
1454 (unless margin
(setf margin
80))
1455 (do* ((chars (line-break-annotate string
))
1456 line lines
(filled 0) last-break-distance
1457 (break-type (car chars
) (car tail
))
1458 (char (cadr chars
) (cadr tail
))
1459 (tail (cddr chars
) (cddr tail
)))
1461 (mapcar #'(lambda (s) (coerce s
'string
)) (nreverse lines
)))
1465 (unless (eql (line-break-class char
) :CM
)
1467 (when last-break-distance
(incf last-break-distance
)))
1470 (setf last-break-distance
1)
1474 (setf last-break-distance
1)
1477 (if (> filled margin
)
1481 (when (not last-break-distance
)
1482 ;; If we don't have any line breaks, remove the last thing we added that
1483 ;; takes up space, and all its combining marks
1484 (setf last-break-distance
1485 (1+ (loop for c in line while
(eql (line-break-class c
) :cm
) summing
1))))
1486 (multiple-value-bind (next-line this-line
) (break-list-at line last-break-distance
)
1487 (loop while
(eql (line-break-class (car this-line
)) :sp
)
1488 do
(setf this-line
(cdr this-line
)))
1489 (push (nreverse this-line
) lines
)
1490 (setf line next-line
)
1491 (setf filled
(length line
))
1492 (setf last-break-distance nil
))
1497 (defconstant +maximum-variable-primary-element
+
1498 #.
(with-open-file (stream
1502 '(:relative
:up
:up
"output")
1503 :name
"other-collation-info" :type
"lisp-expr")
1504 sb
!xc
:*compile-file-truename
*)
1506 :element-type
'character
)
1509 (defun unpack-collation-key (key)
1510 (declare (type (simple-array (unsigned-byte 32) (*)) key
))
1511 (loop for value across key
1513 (list (ldb (byte 16 16) value
)
1514 (ldb (byte 11 5) value
)
1515 (ldb (byte 5 0) value
))))
1517 (declaim (inline variable-p
))
1518 (defun variable-p (x)
1519 (<= 1 x
+maximum-variable-primary-element
+))
1521 (defun collation-key (string start end
)
1523 (char2 (code-char 0))
1524 (char3 (code-char 0)))
1526 (1 (setf char1
(char string start
)))
1527 (2 (setf char1
(char string start
)
1528 char2
(char string
(+ start
1))))
1529 (3 (setf char1
(char string start
)
1530 char2
(char string
(+ start
1))
1531 char3
(char string
(+ start
2))))
1533 ;; There are never more than three characters in a contraction, right?
1534 (return-from collation-key nil
)))
1535 (let ((packed-key (gethash (pack-3-codepoints
1539 **character-collations
**)))
1541 (unpack-collation-key packed-key
)
1542 (when (char= (code-char 0) char2 char3
)
1543 (let* ((cp (char-code char1
))
1545 (cond ((not (proplist-p char1
:unified-ideograph
))
1547 ((or (<= #x4E00 cp
#x9FFF
)
1548 (<= #xF900 cp
#xFAFF
))
1552 (a (+ base
(ash cp -
15)))
1553 (b (logior #.
(ash 1 15) (logand cp
#x7FFFF
))))
1554 (list (list a
#x20
#x2
) (list b
0 0))))))))
1556 (defun sort-key (string)
1557 (let* ((str (normalize-string string
:nfd
))
1558 (i 0) (len (length str
)) max-match new-i
1561 (loop while
(< i len
)
1563 (loop for offset from
1 to
3
1564 for index
= (+ i offset
)
1565 while
(<= index len
)
1567 (let ((key (collation-key str i index
)))
1571 (loop for index from new-i below len
1572 for char
= (char str index
)
1573 for previous-combining-class
= combining-class
1574 for combining-class
= (combining-class char
)
1575 until
(eql combining-class
0)
1576 unless
(and (>= (- index new-i
) 1)
1577 ;; Combiners are sorted, we only have to look back
1578 ;; one step (see canonically-compose)
1579 (>= (combining-class (char str
(1- index
)))
1582 (rotatef (char str new-i
) (char str index
))
1583 (let ((key (collation-key str i
(1+ new-i
))))
1587 (rotatef (char str new-i
) (char str index
)))))
1588 (loop for key in max-match do
(push key sort-key
))
1590 (macrolet ((push-non-zero (obj place
)
1592 (push ,obj
,place
))))
1593 (let (primary secondary tertiary quatenary
)
1594 (loop for
(k1 k2 k3
) in
(nreverse sort-key
)
1599 (setf after-variable t
)
1600 (push k1 quatenary
))
1602 (setf after-variable nil
)
1604 (push-non-zero k2 secondary
)
1605 (push-non-zero k3 tertiary
)
1606 (push #xFFFF quatenary
))
1608 (unless after-variable
1609 (push-non-zero k2 secondary
)
1611 (push #xFFFF quatenary
)))))
1612 (concatenate 'vector
1613 (nreverse primary
) #(0) (nreverse secondary
) #(0)
1614 (nreverse tertiary
) #(0) (nreverse quatenary
))))))
1616 (defun vector< (vector1 vector2
)
1617 (loop for i across vector1
1618 for j across vector2
1620 (cond ((< i j
) (return-from vector
< t
))
1621 ((> i j
) (return-from vector
< nil
))))
1622 ;; If there's no differences, shortest vector wins
1623 (< (length vector1
) (length vector2
)))
1625 (defun unicode= (string1 string2
&key
(start1 0) end1
(start2 0) end2
(strict t
))
1626 "Determines whether STRING1 and STRING2 are canonically equivalent according
1627 to Unicode. The START and END arguments behave like the arguments to STRING=.
1628 If :STRICT is NIL, UNICODE= tests compatibility equavalence instead."
1629 (let ((str1 (normalize-string (subseq string1 start1 end1
) (if strict
:nfd
:nfkd
)))
1630 (str2 (normalize-string (subseq string2 start2 end2
) (if strict
:nfd
:nfkd
))))
1631 (string= str1 str2
)))
1633 (defun unicode-equal (string1 string2
&key
(start1 0) end1
(start2 0) end2
(strict t
))
1634 "Determines whether STRING1 and STRING2 are canonically equivalent after
1635 casefoldin8 (that is, ignoring case differences) according to Unicode. The
1636 START and END arguments behave like the arguments to STRING=. If :STRICT is
1637 NIL, UNICODE= tests compatibility equavalence instead."
1638 (let ((str1 (normalize-string (subseq string1 start1 end1
) (if strict
:nfd
:nfkd
)))
1639 (str2 (normalize-string (subseq string2 start2 end2
) (if strict
:nfd
:nfkd
))))
1641 (normalize-string (casefold str1
) (if strict
:nfd
:nfkd
))
1642 (normalize-string (casefold str2
) (if strict
:nfd
:nfkd
)))))
1644 (defun unicode< (string1 string2
&key
(start1 0) end1
(start2 0) end2
)
1645 "Determines whether STRING1 sorts before STRING2 using the Unicode Collation
1646 Algorithm, The function uses an untailored Default Unicode Collation Element Table
1647 to produce the sort keys. The function uses the Shifted method for dealing
1648 with variable-weight characters, as described in UTS #10"
1649 (let* ((s1 (subseq string1 start1 end1
))
1650 (s2 (subseq string2 start2 end2
))
1651 (k1 (sort-key s1
)) (k2 (sort-key s2
)))
1653 (string< (normalize-string s1
:nfd
) (normalize-string s2
:nfd
))
1656 (defun unicode<= (string1 string2
&key
(start1 0) end1
(start2 0) end2
)
1657 "Tests if STRING1 and STRING2 are either UNICODE< or UNICODE="
1659 (unicode= string1 string2
:start1 start1
:end1 end1
1660 :start2 start2
:end2 end2
)
1661 (unicode< string1 string2
:start1 start1
:end1 end1
1662 :start2 start2
:end2 end2
)))
1664 (defun unicode> (string1 string2
&key
(start1 0) end1
(start2 0) end2
)
1665 "Tests if STRING2 is UNICODE< STRING1."
1666 (unicode< string2 string1
:start1 start2
:end1 end2
1667 :start2 start1
:end2 end1
))
1669 (defun unicode>= (string1 string2
&key
(start1 0) end1
(start2 0) end2
)
1670 "Tests if STRING1 and STRING2 are either UNICODE= or UNICODE>"
1672 (unicode= string1 string2
:start1 start1
:end1 end1
1673 :start2 start2
:end2 end2
)
1674 (unicode> string1 string2
:start1 start1
:end1 end1
1675 :start2 start2
:end2 end2
)))
1678 ;;; Confusable detection
1680 (defun canonically-deconfuse (string)
1681 (let (ret (i 0) new-i
(len (length string
))
1683 (loop while
(< i len
) do
1684 (loop for offset from
1 to
5
1685 while
(<= (+ i offset
) len
)
1687 (let ((node (gethash (subseq string i
(+ i offset
))
1689 (when node
(setf best-node node new-i
(+ i offset
)))))
1691 (best-node (push best-node ret
) (setf i new-i
))
1692 (t (push (subseq string i
(1+ i
)) ret
) (incf i
)))
1693 (setf best-node nil new-i nil
))
1694 (apply #'concatenate
'string
(nreverse ret
))))
1696 (defun confusable-p (string1 string2
&key
(start1 0) end1
(start2 0) end2
)
1697 "Determines whether STRING1 and STRING2 could be visually confusable
1698 according to the IDNA confusableSummary.txt table"
1699 (let* ((form #!+sb-unicode
:nfd
#!-sb-unicode
:nfc
)
1700 (str1 (normalize-string (subseq string1 start1 end1
) form
))
1701 (str2 (normalize-string (subseq string2 start2 end2
) form
))
1702 (skeleton1 (normalize-string (canonically-deconfuse str1
) form
))
1703 (skeleton2 (normalize-string (canonically-deconfuse str2
) form
)))
1704 (string= skeleton1 skeleton2
)))