Unbreak test.
[sbcl.git] / tools-for-build / ucd.lisp
blobfa3a0d19466fadd34b1513a09b3c66d371067142
1 (in-package "SB-COLD")
3 ;;; Common functions
5 (defparameter *output-directory*
6 (merge-pathnames
7 (make-pathname :directory '(:relative :up "output"))
8 (make-pathname :directory (pathname-directory *load-truename*))))
10 (defun split-string (line character)
11 (loop for prev-position = 0 then (1+ position)
12 for position = (position character line :start prev-position)
13 collect (subseq line prev-position position)
14 do (unless position
15 (loop-finish))))
17 (defun parse-codepoints (string &key (singleton-list t))
18 "Gets a list of codepoints out of 'aaaa bbbb cccc', stripping surrounding space"
19 (let ((list (mapcar
20 (lambda (s) (parse-integer s :radix 16))
21 (remove "" (split-string string #\Space) :test #'string=))))
22 (if (not (or (cdr list) singleton-list)) (car list) list)))
25 (defun parse-codepoint-range (string)
26 "Parse the Unicode syntax DDDD|DDDD..DDDD into an inclusive range (start end)"
27 (destructuring-bind (start &optional empty end) (split-string string #\.)
28 (declare (ignore empty))
29 (let* ((head (parse-integer start :radix 16))
30 (tail (if end
31 (parse-integer end :radix 16 :end (position #\Space end))
32 head)))
33 (list head tail))))
35 (defun init-indices (strings)
36 (let ((hash (make-hash-table :test #'equal)))
37 (loop for string in strings
38 for index from 0
39 do (setf (gethash string hash) index))
40 hash))
42 (defun clear-flag (bit integer)
43 (logandc2 integer (ash 1 bit)))
46 ;;; Output storage globals
47 (defstruct ucd misc decomp)
49 (defparameter *unicode-character-database*
50 (make-pathname :directory (pathname-directory *load-truename*)))
52 (defparameter *unicode-names* (make-hash-table))
53 (defparameter *unicode-1-names* (make-hash-table))
55 (defparameter *decompositions*
56 (make-array 10000 :element-type '(unsigned-byte 24) :fill-pointer 0
57 :adjustable t)) ; 10000 is not a significant number
59 (defparameter *decomposition-corrections*
60 (with-open-file (s (make-pathname :name "NormalizationCorrections" :type "txt"
61 :defaults *unicode-character-database*))
62 (loop with result = nil
63 for line = (read-line s nil nil) while line
64 do (when (position #\; line)
65 (destructuring-bind (cp old-decomp correction version)
66 (split-string line #\;)
67 (declare (ignore old-decomp version))
68 (push (cons (parse-integer cp :radix 16)
69 (parse-integer correction :radix 16))
70 result)))
71 finally (return result)))
72 "List of decompsotions that were amended in Unicode corrigenda")
74 (defparameter *compositions* (make-hash-table :test #'equal))
75 (defparameter *composition-exclusions*
76 (with-open-file (s (make-pathname :name "CompositionExclusions" :type "txt"
77 :defaults *unicode-character-database*))
78 (loop with result = nil
79 for line = (read-line s nil nil) while line
80 when (and (> (length line) 0) (char/= (char line 0) #\#))
81 do (push (parse-integer line :end (position #\Space line) :radix 16)
82 result) finally (return result)))
83 "Characters that are excluded from composition according to UAX#15")
85 (defparameter *different-titlecases* nil)
86 (defparameter *different-casefolds* nil)
88 (defparameter *case-mapping*
89 (with-open-file (s (make-pathname :name "SpecialCasing" :type "txt"
90 :defaults *unicode-character-database*))
91 (loop with hash = (make-hash-table)
92 for line = (read-line s nil nil) while line
93 unless (or (not (position #\# line)) (= 0 (position #\# line)))
94 do (destructuring-bind (%cp %lower %title %upper &optional context comment)
95 (split-string line #\;)
96 (unless (and context comment)
97 (let ((cp (parse-integer %cp :radix 16))
98 (lower (parse-codepoints %lower :singleton-list nil))
99 (title (parse-codepoints %title :singleton-list nil))
100 (upper (parse-codepoints %upper :singleton-list nil)))
101 (setf (gethash cp hash) (cons upper lower))
102 (unless (equal title upper) (push (cons cp title) *different-titlecases*)))))
103 finally (return hash)))
104 "Maps cp -> (cons uppercase|(uppercase ...) lowercase|(lowercase ...))")
106 (defparameter *misc-table* (make-array 3000 :fill-pointer 0)
107 "Holds the entries in the Unicode database's miscellanious array, stored as lists.
108 These lists have the form (gc-index bidi-index ccc digit decomposition-info
109 flags script line-break age). Flags is a bit-bashed integer containing
110 cl-both-case-p, has-case-p, and bidi-mirrored-p, and an east asian width.
111 Length should be adjusted when the standard changes.")
112 (defparameter *misc-hash* (make-hash-table :test #'equal)
113 "Maps a misc list to its position in the misc table.")
115 (defparameter *different-numerics* nil)
117 (defparameter *ucd-entries* (make-hash-table))
119 ;; Mappings of the general categories and bidi classes to integers
120 ;; Letter classes go first to optimize certain cl character type checks
121 ;; BN is the first BIDI class so that unallocated characters are BN
122 ;; Uppercase in the CL sense must have GC = 0, lowercase must GC = 1
123 (defparameter *general-categories*
124 (init-indices '("Lu" "Ll" "Lt" "Lm" "Lo" "Cc" "Cf" "Co" "Cs" "Cn"
125 "Mc" "Me" "Mn" "Nd" "Nl" "No" "Pc" "Pd" "Pe" "Pf"
126 "Pi" "Po" "Ps" "Sc" "Sk" "Sm" "So" "Zl" "Zp" "Zs")))
127 (defparameter *bidi-classes*
128 (init-indices '("BN" "AL" "AN" "B" "CS" "EN" "ES" "ET" "L" "LRE" "LRO"
129 "NSM" "ON" "PDF" "R" "RLE" "RLO" "S" "WS" "LRI" "RLI"
130 "FSI" "PDI")))
131 (defparameter *east-asian-widths* (init-indices '("N" "A" "H" "W" "F" "Na")))
132 (defparameter *scripts*
133 (init-indices
134 '("Unknown" "Common" "Latin" "Greek" "Cyrillic" "Armenian" "Hebrew" "Arabic"
135 "Syriac" "Thaana" "Devanagari" "Bengali" "Gurmukhi" "Gujarati" "Oriya"
136 "Tamil" "Telugu" "Kannada" "Malayalam" "Sinhala" "Thai" "Lao" "Tibetan"
137 "Myanmar" "Georgian" "Hangul" "Ethiopic" "Cherokee" "Canadian_Aboriginal"
138 "Ogham" "Runic" "Khmer" "Mongolian" "Hiragana" "Katakana" "Bopomofo" "Han"
139 "Yi" "Old_Italic" "Gothic" "Deseret" "Inherited" "Tagalog" "Hanunoo" "Buhid"
140 "Tagbanwa" "Limbu" "Tai_Le" "Linear_B" "Ugaritic" "Shavian" "Osmanya"
141 "Cypriot" "Braille" "Buginese" "Coptic" "New_Tai_Lue" "Glagolitic"
142 "Tifinagh" "Syloti_Nagri" "Old_Persian" "Kharoshthi" "Balinese" "Cuneiform"
143 "Phoenician" "Phags_Pa" "Nko" "Sundanese" "Lepcha" "Ol_Chiki" "Vai"
144 "Saurashtra" "Kayah_Li" "Rejang" "Lycian" "Carian" "Lydian" "Cham"
145 "Tai_Tham" "Tai_Viet" "Avestan" "Egyptian_Hieroglyphs" "Samaritan" "Lisu"
146 "Bamum" "Javanese" "Meetei_Mayek" "Imperial_Aramaic" "Old_South_Arabian"
147 "Inscriptional_Parthian" "Inscriptional_Pahlavi" "Old_Turkic" "Kaithi"
148 "Batak" "Brahmi" "Mandaic" "Chakma" "Meroitic_Cursive"
149 "Meroitic_Hieroglyphs" "Miao" "Sharada" "Sora_Sompeng" "Takri"
150 "Bassa_Vah" "Mahajani" "Pahawh_Hmong" "Caucasian_Albanian" "Manichaean"
151 "Palmyrene" "Duployan" "Mende_Kikakui" "Pau_Cin_Hau" "Elbasan" "Modi"
152 "Psalter_Pahlavi" "Grantha" "Mro" "Siddham" "Khojki" "Nabataean" "Tirhuta"
153 "Khudawadi" "Old_North_Arabian" "Warang_Citi" "Linear_A" "Old_Permic")))
154 (defparameter *line-break-classes*
155 (init-indices
156 '("XX" "AI" "AL" "B2" "BA" "BB" "BK" "CB" "CJ" "CL" "CM" "CP" "CR" "EX" "GL"
157 "HL" "HY" "ID" "IN" "IS" "LF" "NL" "NS" "NU" "OP" "PO" "PR" "QU" "RI" "SA"
158 "SG" "SP" "SY" "WJ" "ZW")))
160 (defparameter *east-asian-width-table*
161 (with-open-file (s (make-pathname :name "EastAsianWidth" :type "txt"
162 :defaults *unicode-character-database*))
163 (loop with hash = (make-hash-table)
164 for line = (read-line s nil nil) while line
165 unless (or (not (position #\# line)) (= 0 (position #\# line)))
166 do (destructuring-bind (codepoints value)
167 (split-string
168 (string-right-trim " " (subseq line 0 (position #\# line))) #\;)
169 (let ((range (parse-codepoint-range codepoints))
170 (index (gethash value *east-asian-widths*)))
171 (loop for i from (car range) to (cadr range)
172 do (setf (gethash i hash) index))))
173 finally (return hash)))
174 "Table of East Asian Widths. Used in the creation of misc entries.")
176 (defparameter *script-table*
177 (with-open-file (s (make-pathname :name "Scripts" :type "txt"
178 :defaults *unicode-character-database*))
179 (loop with hash = (make-hash-table)
180 for line = (read-line s nil nil) while line
181 unless (or (not (position #\# line)) (= 0 (position #\# line)))
182 do (destructuring-bind (codepoints value)
183 (split-string
184 (string-right-trim " " (subseq line 0 (position #\# line))) #\;)
185 (let ((range (parse-codepoint-range codepoints))
186 (index (gethash (subseq value 1) *scripts*)))
187 (loop for i from (car range) to (cadr range)
188 do (setf (gethash i hash) index))))
189 finally (return hash)))
190 "Table of scripts. Used in the creation of misc entries.")
192 (defparameter *line-break-class-table*
193 (with-open-file (s (make-pathname :name "LineBreakProperty" :type "txt"
194 :defaults *unicode-character-database*))
195 (loop with hash = (make-hash-table)
196 for line = (read-line s nil nil) while line
197 unless (or (not (position #\# line)) (= 0 (position #\# line)))
198 do (destructuring-bind (codepoints value)
199 (split-string
200 (string-right-trim " " (subseq line 0 (position #\# line))) #\;)
201 (let ((range (parse-codepoint-range codepoints))
202 ;; Hangul syllables temporarily go to "Unkwown"
203 (index (gethash value *line-break-classes* 0)))
204 (loop for i from (car range) to (cadr range)
205 do (setf (gethash i hash) index))))
206 finally (return hash)))
207 "Table of line break classes. Used in the creation of misc entries.")
209 (defparameter *age-table*
210 (with-open-file (s (make-pathname :name "DerivedAge" :type "txt"
211 :defaults *unicode-character-database*))
212 (loop with hash = (make-hash-table)
213 for line = (read-line s nil nil) while line
214 unless (or (not (position #\# line)) (= 0 (position #\# line)))
215 do (destructuring-bind (codepoints value)
216 (split-string
217 (string-right-trim " " (subseq line 0 (position #\# line))) #\;)
218 (let* ((range (parse-codepoint-range codepoints))
219 (age-parts (mapcar #'parse-integer (split-string value #\.)))
220 (age (logior (ash (car age-parts) 3) (cadr age-parts))))
221 (loop for i from (car range) to (cadr range)
222 do (setf (gethash i hash) age))))
223 finally (return hash)))
224 "Table of character ages. Used in the creation of misc entries.")
226 (defvar *block-first* nil)
229 ;;; Unicode data file parsing
230 (defun hash-misc (gc-index bidi-index ccc digit decomposition-info flags
231 script line-break age)
232 (let* ((list (list gc-index bidi-index ccc digit decomposition-info flags
233 script line-break age))
234 (index (gethash list *misc-hash*)))
235 (or index
236 (progn
237 (setf (gethash list *misc-hash*)
238 (fill-pointer *misc-table*))
239 (when (eql nil (vector-push list *misc-table*))
240 (error "Misc table too small."))
241 (gethash list *misc-hash*)))))
243 (defun ordered-ranges-member (item vector)
244 (labels ((recurse (start end)
245 (when (< start end)
246 (let* ((i (+ start (truncate (- end start) 2)))
247 (index (* 2 i))
248 (elt1 (svref vector index))
249 (elt2 (svref vector (1+ index))))
250 (cond ((< item elt1)
251 (recurse start i))
252 ((> item elt2)
253 (recurse (+ 1 i) end))
255 item))))))
256 (recurse 0 (/ (length vector) 2))))
258 (defun unallocated-bidi-class (code-point)
259 ;; See tests/data/DerivedBidiClass.txt for more information
260 (flet ((in (vector class)
261 (when (ordered-ranges-member code-point vector)
262 (gethash class *bidi-classes*))))
263 (cond
264 ((in
265 #(#x0600 #x07BF #x08A0 #x08FF #xFB50 #xFDCF #xFDF0 #xFDFF #xFE70 #xFEFF
266 #x1EE00 #x1EEFF) "AL"))
267 ((in
268 #(#x0590 #x05FF #x07C0 #x089F #xFB1D #xFB4F #x10800 #x10FFF #x1E800 #x1EDFF
269 #x1EF00 #x1EFFF) "R"))
270 ((in #(#x20A0 #x20CF) "ET"))
271 ;; BN is non-characters and default-ignorable.
272 ;; Default-ignorable will be dealt with elsewhere
273 ((in #(#xFDD0 #xFDEF #xFFFE #xFFFF #x1FFFE #x1FFFF #x2FFFE #x2FFFF
274 #x3FFFE #x3FFFF #x4FFFE #x4FFFF #x5FFFE #x5FFFF #x6FFFE #x6FFFF
275 #x7FFFE #x7FFFF #x8FFFE #x8FFFF #x9FFFE #x9FFFF #xAFFFE #xAFFFF
276 #xBFFFE #xBFFFF #xCFFFE #xCFFFF #xDFFFE #xDFFFF #xEFFFE #xEFFFF
277 #xFFFFE #xFFFFF #x10FFFE #x10FFFF)
278 "BN"))
279 ((in #(#x0 #x10FFFF) "L"))
280 (t (error "Somehow we've gone too far in unallocated bidi determination")))))
282 (defun complete-misc-table ()
283 (loop for code-point from 0 to #x10FFFF do ; Flood-fil unallocated codepoints
284 (unless (second (multiple-value-list (gethash code-point *ucd-entries*)))
285 (let* ((unallocated-misc
286 ;; unallocated characters have a GC of "Cn", aren't digits
287 ;; (digit = 128), have a bidi that depends on their block, and
288 ;; don't decompose, combine, or have case. They have an East
289 ;; Asian Width (eaw) of "N" (0), and a script, line breaking
290 ;; class, and age of 0 ("Unknown"), unless some of those
291 ;; properties are otherwise assigned.
292 `(,(gethash "Cn" *general-categories*)
293 ,(unallocated-bidi-class code-point) 0 128 0
294 ,(gethash code-point *east-asian-width-table* 0)
295 0 ,(gethash code-point *line-break-class-table* 0)
296 ,(gethash code-point *age-table* 0)))
297 (unallocated-index (apply #'hash-misc unallocated-misc))
298 (unallocated-ucd (make-ucd :misc unallocated-index)))
299 (setf (gethash code-point *ucd-entries*) unallocated-ucd)))))
301 (defun expand-decomposition (decomposition)
302 (loop for cp in decomposition
303 for ucd = (gethash cp *ucd-entries*)
304 for length = (elt (aref *misc-table* (ucd-misc ucd)) 4)
305 if (and (not (logbitp 7 length))
306 (plusp length))
307 append (expand-decomposition (ucd-decomp ucd))
308 else
309 collect cp))
311 ;;; Recursively expand canonical decompositions
312 (defun fixup-decompositions ()
313 (loop for did-something = nil
315 (loop for code being the hash-key of *ucd-entries*
316 using (hash-value ucd)
317 when (and (ucd-decomp ucd)
318 (not (logbitp 7 (elt (aref *misc-table* (ucd-misc ucd)) 4))))
320 (let ((expanded (expand-decomposition (ucd-decomp ucd))))
321 (unless (equal expanded (ucd-decomp ucd))
322 (setf (ucd-decomp ucd) expanded
323 did-something t))))
324 while did-something)
325 (loop for i below (hash-table-count *ucd-entries*)
326 for ucd = (gethash i *ucd-entries*)
327 for decomp = (ucd-decomp ucd)
329 (setf (ucd-decomp ucd)
330 (cond ((not (consp decomp)) 0)
331 ((logbitp 7 (elt (aref *misc-table* (ucd-misc ucd)) 4))
332 (prog1 (length *decompositions*)
333 (loop for cp in decomp
334 do (vector-push-extend cp *decompositions*))))
336 (let ((misc-entry (copy-list (aref *misc-table* (ucd-misc ucd)))))
337 (setf (elt misc-entry 4) (length decomp)
338 (ucd-misc ucd) (apply #'hash-misc misc-entry))
339 (prog1 (length *decompositions*)
340 (loop for cp in decomp
341 do (vector-push-extend cp *decompositions*)))))))))
343 (defun fixup-compositions ()
344 (flet ((fixup (k v)
345 (declare (ignore v))
346 (let* ((cp (car k))
347 (ucd (gethash cp *ucd-entries*))
348 (misc (aref *misc-table* (ucd-misc ucd)))
349 (ccc (third misc)))
350 ;; we can do everything in the first pass except for
351 ;; accounting for decompositions where the first
352 ;; character of the decomposition is not a starter.
353 (when (/= ccc 0)
354 (remhash k *compositions*)))))
355 (maphash #'fixup *compositions*)))
357 (defun add-jamo-information (line table)
358 (let* ((split (split-string line #\;))
359 (code (parse-integer (first split) :radix 16))
360 (syllable (string-trim
362 (subseq (second split) 0 (position #\# (second split))))))
363 (setf (gethash code table) syllable)))
365 (defun fixup-hangul-syllables ()
366 ;; "Hangul Syllable Composition, Unicode 5.1 section 3-12"
367 (let* ((sbase #xac00)
368 (lbase #x1100)
369 (vbase #x1161)
370 (tbase #x11a7)
371 (scount 11172)
372 (lcount 19)
373 (vcount 21)
374 (tcount 28)
375 (ncount (* vcount tcount))
376 (table (make-hash-table)))
377 (declare (ignore lcount))
378 (with-open-file (*standard-input*
379 (make-pathname :name "Jamo" :type "txt"
380 :defaults *unicode-character-database*))
381 (loop for line = (read-line nil nil)
382 while line
383 if (position #\; line)
384 do (add-jamo-information line table)))
385 (dotimes (sindex scount)
386 (let* ((l (+ lbase (floor sindex ncount)))
387 (v (+ vbase (floor (mod sindex ncount) tcount)))
388 (tee (+ tbase (mod sindex tcount)))
389 (code-point (+ sbase sindex))
390 (name (format nil "HANGUL_SYLLABLE_~A~A~:[~A~;~]"
391 (gethash l table) (gethash v table)
392 (= tee tbase) (gethash tee table))))
393 (setf (gethash code-point *unicode-names*) name)))))
395 (defun normalize-character-name (name)
396 (when (find #\_ name)
397 (error "Bad name for a character: ~A" name))
398 ;; U+1F5CF (PAGE)'s name conflicts with the ANSI CL-assigned
399 ;; name for form feed (^L, U+000C). To avoid a case where
400 ;; more than one character has a particular name while remaining
401 ;; standards-compliant, we remove U+1F5CF's name here.
402 (when (string= name "PAGE")
403 (return-from normalize-character-name "UNICODE_PAGE"))
404 (unless (or (zerop (length name)) (find #\< name) (find #\> name))
405 (substitute #\_ #\Space name)))
407 ;;; 3400 -- 4DB5 : cjk ideograph extension a ;Lo;0;L;;;;;N;;;;;
408 ;;; AC00 -- D7A3 : hangul syllables ;Lo;0;L;;;;;N;;;;;
409 ;;; D800 -- F8FF : surrogates and private use
410 ;;; 20000 -- 2A6D6 : cjk ideograph extension b ;Lo;0;L;;;;;N;;;;;
411 ;;; F0000 -- FFFFD : private use
412 ;;; 100000 -- 10FFFD: private use
413 (defun encode-ucd-line (line code-point)
414 (destructuring-bind (name general-category canonical-combining-class
415 bidi-class decomposition-type-and-mapping
416 decimal-digit digit numeric bidi-mirrored
417 unicode-1-name iso-10646-comment simple-uppercase
418 simple-lowercase simple-titlecase)
419 line
420 (declare (ignore iso-10646-comment))
421 (if (and (> (length name) 8)
422 (string= ", First>" name :start2 (- (length name) 8)))
423 (progn
424 (setf *block-first* code-point)
425 nil)
426 (let* ((gc-index (or (gethash general-category *general-categories*)
427 (error "unknown general category ~A"
428 general-category)))
429 (bidi-index (or (gethash bidi-class *bidi-classes*)
430 (error "unknown bidirectional class ~A"
431 bidi-class)))
432 (ccc (parse-integer canonical-combining-class))
433 (digit-index (if (string= "" digit) 128 ; non-digits have high bit
434 (let ((%digit (parse-integer digit)))
435 (if (string= digit decimal-digit)
436 ;; decimal-digit-p is in bit 6
437 (logior (ash 1 6) %digit) %digit))))
438 (upper-index (unless (string= "" simple-uppercase)
439 (parse-integer simple-uppercase :radix 16)))
440 (lower-index (unless (string= "" simple-lowercase)
441 (parse-integer simple-lowercase :radix 16)))
442 (title-index (unless (string= "" simple-titlecase)
443 (parse-integer simple-titlecase :radix 16)))
444 (cl-both-case-p (or (and (= gc-index 0) lower-index)
445 (and (= gc-index 1) upper-index)))
446 (bidi-mirrored-p (string= bidi-mirrored "Y"))
447 (decomposition-info 0)
448 (eaw-index (gethash code-point *east-asian-width-table*))
449 (script-index (gethash code-point *script-table* 0))
450 (line-break-index (gethash code-point *line-break-class-table* 0))
451 (age-index (gethash code-point *age-table* 0))
452 decomposition)
453 #+nil
454 (when (and (not cl-both-case-p)
455 (< gc-index 2))
456 (format t "~A~%" name))
458 (when (string/= "" decomposition-type-and-mapping)
459 (let* ((compatibility-p (position #\> decomposition-type-and-mapping)))
460 (setf decomposition
461 (parse-codepoints
462 (subseq decomposition-type-and-mapping
463 (if compatibility-p (1+ compatibility-p) 0))))
464 (when (assoc code-point *decomposition-corrections*)
465 (setf decomposition
466 (list (cdr (assoc code-point *decomposition-corrections*)))))
467 (setf decomposition-info
468 (logior (length decomposition) (if compatibility-p 128 0)))
469 (unless compatibility-p
470 ;; Primary composition excludes:
471 ;; * singleton decompositions;
472 ;; * decompositions of non-starters;
473 ;; * script-specific decompositions;
474 ;; * later-version decompositions;
475 ;; * decompositions whose first character is a
476 ;; non-starter.
477 ;; All but the last case can be handled here;
478 ;; for the fixup, see FIXUP-COMPOSITIONS
479 (when (and (> decomposition-info 1)
480 (= ccc 0)
481 (not (member code-point *composition-exclusions*)))
482 (unless (= decomposition-info 2)
483 (error "canonical decomposition unexpectedly long"))
484 (setf (gethash (cons (first decomposition)
485 (second decomposition))
486 *compositions*)
487 code-point)))))
488 ;; Hangul decomposition; see Unicode 6.2 section 3-12
489 (when (= code-point #xd7a3)
490 ;; KLUDGE: The decomposition-length for Hangul syllables in the
491 ;; misc database will be a bit of a lie. It doesn't really matter
492 ;; since the only purpose of the length is to index into the
493 ;; decompositions array (which Hangul decomposition doesn't use).
494 ;; The decomposition index is 0 because we won't be going into the
495 ;; array
496 (setf decomposition-info 3))
498 (unless (gethash code-point *case-mapping*) ; Exclude codepoints from SpecialCasing
499 (when (string/= simple-uppercase simple-titlecase)
500 (push (cons code-point title-index) *different-titlecases*))
501 (and (or upper-index lower-index)
502 (setf (gethash code-point *case-mapping*)
503 (cons
504 (or upper-index code-point)
505 (or lower-index code-point)))))
507 (when (string/= digit numeric)
508 (push (cons code-point numeric) *different-numerics*))
510 (when (> ccc 255)
511 (error "canonical combining class too large ~A" ccc))
512 (let* ((flags (logior
513 (if cl-both-case-p (ash 1 7) 0)
514 (if (gethash code-point *case-mapping*) (ash 1 6) 0)
515 (if bidi-mirrored-p (ash 1 5) 0)
516 eaw-index))
517 (misc-index (hash-misc gc-index bidi-index ccc digit-index
518 decomposition-info flags script-index
519 line-break-index age-index))
520 (result (make-ucd :misc misc-index
521 :decomp decomposition)))
522 (when (and (> (length name) 7)
523 (string= ", Last>" name :start2 (- (length name) 7)))
524 ;; We can still do this despite East Asian Width being in the
525 ;; databasce since each of the UCD <First><Last> blocks
526 ;; has a consistent East Asian Width
527 (loop for point from *block-first* to code-point do
528 (setf (gethash point *ucd-entries*) result)))
529 (values result (normalize-character-name name)
530 (normalize-character-name unicode-1-name)))))))
532 (defun slurp-ucd-line (line)
533 (let* ((split-line (split-string line #\;))
534 (code-point (parse-integer (first split-line) :radix 16)))
535 (multiple-value-bind (encoding name unicode-1-name)
536 (encode-ucd-line (cdr split-line) code-point)
537 (setf (gethash code-point *ucd-entries*) encoding
538 (gethash code-point *unicode-names*) name)
539 (when unicode-1-name
540 (setf (gethash code-point *unicode-1-names*) unicode-1-name)))))
542 ;;; this fixes up the case conversion discrepancy between CL and
543 ;;; Unicode: CL operators depend on char-downcase / char-upcase being
544 ;;; inverses, which is not true in general in Unicode even for
545 ;;; characters which change case to single characters.
546 ;;; Also, fix misassigned age values, which are not constant across blocks
547 (defun second-pass ()
548 (let ((case-mapping
549 (sort (loop for code-point being the hash-keys in *case-mapping*
550 using (hash-value value)
551 collect (cons code-point value))
552 #'< :key #'car)))
553 (loop for (code-point upper . lower) in case-mapping
554 for misc-index = (ucd-misc (gethash code-point *ucd-entries*))
555 for (gc bidi ccc digit decomp flags script lb age) = (aref *misc-table* misc-index)
556 when (logbitp 7 flags) do
557 (when (or (not (atom upper)) (not (atom lower))
558 (and (= gc 0)
559 (not (equal (car (gethash lower *case-mapping*)) code-point)))
560 (and (= gc 1)
561 (not (equal (cdr (gethash upper *case-mapping*)) code-point))))
562 (let* ((new-flags (clear-flag 7 flags))
563 (new-misc (hash-misc gc bidi ccc digit decomp new-flags script lb age)))
564 (setf (ucd-misc (gethash code-point *ucd-entries*)) new-misc))))))
566 (defun fixup-casefolding ()
567 (with-open-file (s (make-pathname :name "CaseFolding" :type "txt"
568 :defaults *unicode-character-database*))
569 (loop for line = (read-line s nil nil)
570 while line
571 unless (or (not (position #\; line)) (equal (position #\# line) 0))
572 do (destructuring-bind (original type mapping comment)
573 (split-string line #\;)
574 (declare (ignore comment))
575 (let ((cp (parse-integer original :radix 16))
576 (fold (parse-codepoints mapping :singleton-list nil)))
577 (unless (or (string= type " S") (string= type " T"))
578 (when (not (equal (cdr (gethash cp *case-mapping*)) fold))
579 (push (cons cp fold) *different-casefolds*))))))))
581 (defun fixup-ages ()
582 (let ((age (sort
583 (loop for code-point being the hash-keys in *age-table*
584 using (hash-value true-age)
585 collect (cons code-point true-age))
586 #'< :key #'car)))
587 (loop for (code-point . true-age) in age
588 for misc-index = (ucd-misc (gethash code-point *ucd-entries*))
589 for (gc bidi ccc digit decomp flags script lb age) = (aref *misc-table* misc-index)
590 unless (= age true-age) do
591 (let* ((new-misc (hash-misc gc bidi ccc digit decomp flags script lb true-age))
592 (new-ucd (make-ucd
593 :misc new-misc
594 :decomp (ucd-decomp (gethash code-point *ucd-entries*)))))
595 (setf (gethash code-point *ucd-entries*) new-ucd)))))
597 (defun slurp-ucd ()
598 (with-open-file (*standard-input*
599 (make-pathname :name "UnicodeData"
600 :type "txt"
601 :defaults *unicode-character-database*)
602 :direction :input)
603 (format t "~%//slurp-ucd~%")
604 (loop for line = (read-line nil nil)
605 while line
606 do (slurp-ucd-line line)))
607 (second-pass)
608 (fixup-compositions)
609 (fixup-hangul-syllables)
610 (complete-misc-table)
611 (fixup-casefolding)
612 (fixup-ages)
613 (fixup-decompositions)
614 nil)
617 ;;; PropList.txt
618 (defparameter **proplist-properties** nil
619 "A list of properties extracted from PropList.txt")
621 (defun parse-property (stream &optional name)
622 (let ((result (make-array 1 :fill-pointer 0 :adjustable t)))
623 (loop for line = (read-line stream nil nil)
624 ;; Deal with Blah=Blah in DerivedNormalizationPRops.txt
625 while (and line (not (position #\= (substitute #\Space #\= line :count 1))))
626 for entry = (subseq line 0 (position #\# line))
627 when (and entry (string/= entry ""))
629 (destructuring-bind (start end)
630 (parse-codepoint-range (car (split-string entry #\;)))
631 (vector-push-extend start result)
632 (vector-push-extend end result)))
633 (when name
634 (push name **proplist-properties**)
635 (push result **proplist-properties**))))
637 (defun slurp-proplist ()
638 (with-open-file (s (make-pathname :name "PropList"
639 :type "txt"
640 :defaults *unicode-character-database*)
641 :direction :input)
642 (parse-property s) ;; Initial comments
643 (parse-property s :white-space)
644 (parse-property s :bidi-control)
645 (parse-property s :join-control)
646 (parse-property s :dash)
647 (parse-property s :hyphen)
648 (parse-property s :quotation-mark)
649 (parse-property s :terminal-punctuation)
650 (parse-property s :other-math)
651 (parse-property s :hex-digit)
652 (parse-property s :ascii-hex-digit)
653 (parse-property s :other-alphabetic)
654 (parse-property s :ideographic)
655 (parse-property s :diacritic)
656 (parse-property s :extender)
657 (parse-property s :other-lowercase)
658 (parse-property s :other-uppercase)
659 (parse-property s :noncharacter-code-point)
660 (parse-property s :other-grapheme-extend)
661 (parse-property s :ids-binary-operator)
662 (parse-property s :ids-trinary-operator)
663 (parse-property s :radical)
664 (parse-property s :unified-ideograph)
665 (parse-property s :other-default-ignorable-code-point)
666 (parse-property s :deprecated)
667 (parse-property s :soft-dotted)
668 (parse-property s :logical-order-exception)
669 (parse-property s :other-id-start)
670 (parse-property s :other-id-continue)
671 (parse-property s :sterm)
672 (parse-property s :variation-selector)
673 (parse-property s :pattern-white-space)
674 (parse-property s :pattern-syntax))
676 (with-open-file (s (make-pathname :name "DerivedNormalizationProps"
677 :type "txt"
678 :defaults *unicode-character-database*)
679 :direction :input)
680 (parse-property s) ;; Initial comments
681 (parse-property s) ;; FC_NFKC_Closure
682 (parse-property s) ;; FC_NFKC_Closure
683 (parse-property s) ;; Full_Composition_Exclusion
684 (parse-property s) ;; NFD_QC Comments
685 (parse-property s :nfd-qc)
686 (parse-property s) ;; NFC_QC Comments
687 (parse-property s :nfc-qc)
688 (parse-property s :nfc-qc-maybe)
689 (parse-property s) ;; NFKD_QC Comments
690 (parse-property s :nfkd-qc)
691 (parse-property s) ;; NFKC_QC Comments
692 (parse-property s :nfkc-qc)
693 (parse-property s :nfkc-qc-maybe))
694 (setf **proplist-properties** (nreverse **proplist-properties**))
695 (values))
698 ;;; Collation keys
699 (defvar *maximum-variable-key* 1)
701 (defun bitpack-collation-key (primary secondary tertiary)
702 ;; 0 <= primary <= #xFFFD (default table)
703 ;; 0 <= secondary <= #x10C [9 bits]
704 ;; 0 <= tertiary <= #x1E (#x1F allowed) [5 bits]
705 ;; Because of this, the bit packs don't overlap
706 (logior (ash primary 16) (ash secondary 5) tertiary))
708 (defun parse-collation-line (line)
709 (destructuring-bind (%code-points %keys) (split-string line #\;)
710 (let* ((code-points (parse-codepoints %code-points))
711 (keys
712 (remove
714 (split-string (remove #\[ (remove #\Space %keys)) #\]) :test #'string=))
715 (ret
716 (loop for key in keys
717 for variable-p = (position #\* key)
718 for parsed =
719 ;; Don't need first value, it's always just ""
720 (cdr (mapcar (lambda (x) (parse-integer x :radix 16 :junk-allowed t))
721 (split-string (substitute #\. #\* key) #\.)))
722 collect
723 (destructuring-bind (primary secondary tertiary) parsed
724 (when variable-p (setf *maximum-variable-key*
725 (max primary *maximum-variable-key*)))
726 (bitpack-collation-key primary secondary tertiary)))))
727 (values code-points ret))))
729 (defparameter *collation-table*
730 (with-open-file (stream (make-pathname :name "Allkeys70" :type "txt"
731 :defaults *unicode-character-database*))
732 (loop with hash = (make-hash-table :test #'equal)
733 for line = (read-line stream nil nil) while line
734 unless (eql 0 (position #\# line))
735 do (multiple-value-bind (codepoints keys) (parse-collation-line line)
736 (setf (gethash codepoints hash) keys))
737 finally (return hash))))
740 ;;; Other properties
741 (defparameter *confusables*
742 (with-open-file (s (make-pathname :name "ConfusablesEdited" :type "txt"
743 :defaults *unicode-character-database*))
744 (loop for line = (read-line s nil nil) while line
745 unless (eql 0 (position #\# line))
746 collect (mapcar #'parse-codepoints (split-string line #\<))))
747 "List of confusable codepoint sets")
749 (defparameter *bidi-mirroring-glyphs*
750 (with-open-file (s (make-pathname :name "BidiMirroring" :type "txt"
751 :defaults *unicode-character-database*))
752 (loop for line = (read-line s nil nil) while line
753 when (and (plusp (length line))
754 (char/= (char line 0) #\#))
755 collect
756 (mapcar
757 #'(lambda (c) (parse-codepoints c :singleton-list nil))
758 (split-string (subseq line 0 (position #\# line)) #\;))))
759 "List of BIDI mirroring glyph pairs")
761 (defparameter *block-ranges*
762 (with-open-file (stream (make-pathname :name "Blocks" :type "txt"
763 :defaults *unicode-character-database*))
764 (loop with result = (make-array (* 252 2) :fill-pointer 0)
765 for line = (read-line stream nil nil) while line
766 unless (or (string= line "") (position #\# line))
768 (map nil #'(lambda (x) (vector-push x result))
769 (parse-codepoint-range (car (split-string line #\;))))
770 finally (return result)))
771 "Vector of block starts and ends in a form acceptable to `ordered-ranges-position`.
772 Used to look up block data.")
774 ;;; Output code
775 (defun write-codepoint (code-point stream)
776 (declare (type (unsigned-byte 32) code-point))
777 (write-byte (ldb (byte 8 16) code-point) stream)
778 (write-byte (ldb (byte 8 8) code-point) stream)
779 (write-byte (ldb (byte 8 0) code-point) stream))
781 (defun write-4-byte (value stream)
782 (declare (type (unsigned-byte 32) value))
783 (write-byte (ldb (byte 8 24) value) stream)
784 (write-byte (ldb (byte 8 16) value) stream)
785 (write-byte (ldb (byte 8 8) value) stream)
786 (write-byte (ldb (byte 8 0) value) stream))
788 (defun output-misc-data ()
789 (with-open-file (stream (make-pathname :name "ucdmisc"
790 :type "dat"
791 :defaults *output-directory*)
792 :direction :output
793 :element-type '(unsigned-byte 8)
794 :if-exists :supersede
795 :if-does-not-exist :create)
796 (loop for (gc-index bidi-index ccc digit decomposition-info flags
797 script line-break age)
798 across *misc-table*
799 ;; three bits spare here
800 do (write-byte gc-index stream)
801 ;; three bits spare here
802 (write-byte bidi-index stream)
803 (write-byte ccc stream)
804 ;; bits 0-3 encode [0,9], bit 7 is for non-digit status,
805 ;; bit 6 is the decimal-digit flag. Two bits spare
806 (write-byte digit stream)
807 (write-byte decomposition-info stream)
808 (write-byte flags stream) ; includes EAW in bits 0-3, bit 4 is free
809 (write-byte script stream)
810 (write-byte line-break stream)
811 (write-byte age stream))))
813 (defun output-ucd-data ()
814 (with-open-file (high-pages (make-pathname :name "ucdhigh"
815 :type "dat"
816 :defaults *output-directory*)
817 :direction :output
818 :element-type '(unsigned-byte 8)
819 :if-exists :supersede
820 :if-does-not-exist :create)
821 (with-open-file (low-pages (make-pathname :name "ucdlow"
822 :type "dat"
823 :defaults *output-directory*)
824 :direction :output
825 :element-type '(unsigned-byte 8)
826 :if-exists :supersede
827 :if-does-not-exist :create)
828 ;; Output either the index into the misc array (if all the points in the
829 ;; high-page have the same misc value) or an index into the law-pages
830 ;; array / 256. For indexes into the misc array, set bit 15 (high bit).
831 ;; We should never have that many misc entries, so that's not a problem.
833 ;; If Unicode ever allocates an all-decomposing <First>/<Last> block (the
834 ;; only way to get a high page that outputs as the same and has a
835 ;; non-zero decomposition-index, which there's nowhere to store now),
836 ;; find me, slap me with a fish, and have fun fixing this mess.
837 (loop with low-pages-index = 0
838 for high-page from 0 to (ash #x10FFFF -8)
839 for uniq-ucd-entries = nil do
840 (loop for low-page from 0 to #xFF do
841 (pushnew
842 (gethash (logior low-page (ash high-page 8)) *ucd-entries*)
843 uniq-ucd-entries :test #'equalp))
844 (flet ((write-2-byte (int stream)
845 (declare (type (unsigned-byte 16) int))
846 (write-byte (ldb (byte 8 8) int) stream)
847 (write-byte (ldb (byte 8 0) int) stream)))
848 (case (length uniq-ucd-entries)
849 (0 (error "Somehow, a high page has no codepoints in it."))
850 (1 (write-2-byte (logior
851 (ash 1 15)
852 (ucd-misc (car uniq-ucd-entries)))
853 high-pages))
854 (t (loop for low-page from 0 to #xFF
855 for cp = (logior low-page (ash high-page 8))
856 for entry = (gethash cp *ucd-entries*) do
857 (write-2-byte (ucd-misc entry) low-pages)
858 (write-2-byte (ucd-decomp entry) low-pages)
859 finally (write-2-byte low-pages-index high-pages)
860 (incf low-pages-index)))))
861 finally (assert (< low-pages-index (ash 1 15))) (print low-pages-index)))))
863 (defun output-decomposition-data ()
864 (with-open-file (stream (make-pathname :name "decomp" :type "dat"
865 :defaults *output-directory*)
866 :direction :output
867 :element-type '(unsigned-byte 8)
868 :if-exists :supersede
869 :if-does-not-exist :create)
870 (loop for cp across *decompositions* do
871 (write-codepoint cp stream)))
872 (print (length *decompositions*)))
874 (defun output-composition-data ()
875 (with-open-file (stream (make-pathname :name "comp" :type "dat"
876 :defaults *output-directory*)
877 :direction :output
878 :element-type '(unsigned-byte 8)
879 :if-exists :supersede :if-does-not-exist :create)
880 (let (comp)
881 (maphash (lambda (k v) (push (cons k v) comp)) *compositions*)
882 (setq comp (sort comp #'< :key #'cdr))
883 (loop for (k . v) in comp
884 do (write-codepoint (car k) stream)
885 (write-codepoint (cdr k) stream)
886 (write-codepoint v stream)))))
888 (defun output-case-data ()
889 (let (casing-pages points-with-case)
890 (with-open-file (stream (make-pathname :name "case" :type "dat"
891 :defaults *output-directory*)
892 :direction :output
893 :element-type '(unsigned-byte 8)
894 :if-exists :supersede :if-does-not-exist :create)
895 (loop for cp being the hash-keys in *case-mapping*
896 do (push cp points-with-case))
897 (setf points-with-case (sort points-with-case #'<))
898 (loop for cp in points-with-case
899 for (upper . lower) = (gethash cp *case-mapping*) do
900 (pushnew (ash cp -6) casing-pages)
901 (write-codepoint cp stream)
902 (write-byte (if (atom upper) 0 (length upper)) stream)
903 (if (atom upper) (write-codepoint upper stream)
904 (map 'nil (lambda (c) (write-codepoint c stream)) upper))
905 (write-byte (if (atom lower) 0 (length lower)) stream)
906 (if (atom lower) (write-codepoint lower stream)
907 (map 'nil (lambda (c) (write-codepoint c stream)) lower))))
908 (setf casing-pages (sort casing-pages #'<))
909 (assert (< (length casing-pages) 256))
910 (let* ((size (1+ (reduce #'max casing-pages)))
911 (array (make-array size :initial-element 255))
912 (page -1))
913 (dolist (entry casing-pages)
914 (setf (aref array entry) (incf page)))
915 (with-open-file (stream (make-pathname :name "casepages" :type "dat"
916 :defaults *output-directory*)
917 :direction :output
918 :element-type '(unsigned-byte 8)
919 :if-exists :supersede :if-does-not-exist :create)
920 (dotimes (i size)
921 (write-byte (aref array i) stream))))
922 (with-open-file (stream (make-pathname :name "casepages" :type "lisp-expr"
923 :defaults *output-directory*)
924 :direction :output
925 :if-exists :supersede :if-does-not-exist :create)
926 (with-standard-io-syntax
927 (let ((*print-pretty* t)) (print casing-pages stream))))))
929 (defun output-collation-data ()
930 (with-open-file (stream (make-pathname :name "collation" :type "dat"
931 :defaults *output-directory*)
932 :direction :output
933 :element-type '(unsigned-byte 8)
934 :if-exists :supersede :if-does-not-exist :create)
935 (flet ((length-tag (list1 list2)
936 ;; takes two lists of UB32 (with the caveat that list1[0]
937 ;; needs its high 8 bits free (codepoints always have
938 ;; that) and do
939 (let* ((l1 (length list1)) (l2 (length list2))
940 (tag (dpb l1 (byte 4 28) (dpb l2 (byte 5 23) (car list1)))))
941 (assert (<= l1 3))
942 (write-4-byte tag stream)
943 (map nil #'(lambda (l) (write-4-byte l stream)) (append (cdr list1) list2)))))
944 (let (coll)
945 (maphash (lambda (k v) (push (cons k v) coll)) *collation-table*)
946 (labels ((sorter (o1 o2)
947 (cond
948 ((null o1) t)
949 ((null o2) nil)
950 (t (or (< (car o1) (car o2))
951 (and (= (car o1) (car o2))
952 (sorter (cdr o1) (cdr o2))))))))
953 (setq coll (sort coll #'sorter :key #'car)))
954 (loop for (k . v) in coll
955 do (length-tag k v)))))
956 (with-open-file (*standard-output*
957 (make-pathname :name "other-collation-info"
958 :type "lisp-expr"
959 :defaults *output-directory*)
960 :direction :output
961 :if-exists :supersede
962 :if-does-not-exist :create)
963 (with-standard-io-syntax
964 (let ((*print-pretty* t))
965 (write-string ";;; The highest primary variable collation index")
966 (terpri)
967 (prin1 *maximum-variable-key*) (terpri)))))
969 (defun output ()
970 (output-misc-data)
971 (output-ucd-data)
972 (output-decomposition-data)
973 (output-composition-data)
974 (output-case-data)
975 (output-collation-data)
976 (with-open-file (*standard-output*
977 (make-pathname :name "misc-properties" :type "lisp-expr"
978 :defaults *output-directory*)
979 :direction :output
980 :if-exists :supersede
981 :if-does-not-exist :create)
982 (with-standard-io-syntax
983 (let ((*print-pretty* t))
984 (prin1 **proplist-properties**))))
986 (with-open-file (f (make-pathname :name "ucd-names" :type "lisp-expr"
987 :defaults *output-directory*)
988 :direction :output
989 :if-exists :supersede
990 :if-does-not-exist :create)
991 (with-standard-io-syntax
992 (write-string ";;; Do not edit by hand: generated by ucd.lisp" f)
993 (maphash (lambda (code name)
994 (when name
995 (print code f)
996 (prin1 name f)))
997 *unicode-names*))
998 (setf *unicode-names* nil))
999 (with-open-file (f (make-pathname :name "ucd1-names" :type "lisp-expr"
1000 :defaults *output-directory*)
1001 :direction :output
1002 :if-exists :supersede
1003 :if-does-not-exist :create)
1004 (with-standard-io-syntax
1005 (write-string ";;; Do not edit by hand: generated by ucd.lisp" f)
1006 (maphash (lambda (code name)
1007 (when name
1008 (print code f)
1009 (prin1 name f)))
1010 *unicode-1-names*))
1011 (setf *unicode-1-names* nil))
1013 (with-open-file (*standard-output*
1014 (make-pathname :name "numerics"
1015 :type "lisp-expr"
1016 :defaults *output-directory*)
1017 :direction :output
1018 :if-exists :supersede
1019 :if-does-not-exist :create)
1020 (with-standard-io-syntax
1021 (let ((*print-pretty* t)
1022 (result (make-array (* (length *different-numerics*) 2))))
1023 (loop for (code . value) in (sort *different-numerics* #'< :key #'car)
1024 for i by 2
1025 do (setf (aref result i) code
1026 (aref result (1+ i)) (read-from-string value)))
1027 (prin1 result))))
1028 (with-open-file (*standard-output*
1029 (make-pathname :name "titlecases"
1030 :type "lisp-expr"
1031 :defaults *output-directory*)
1032 :direction :output
1033 :if-exists :supersede
1034 :if-does-not-exist :create)
1035 (with-standard-io-syntax
1036 (let ((*print-pretty* t))
1037 (prin1 *different-titlecases*))))
1038 (with-open-file (*standard-output*
1039 (make-pathname :name "foldcases"
1040 :type "lisp-expr"
1041 :defaults *output-directory*)
1042 :direction :output
1043 :if-exists :supersede
1044 :if-does-not-exist :create)
1045 (with-standard-io-syntax
1046 (let ((*print-pretty* t))
1047 (prin1 *different-casefolds*))))
1048 (with-open-file (*standard-output*
1049 (make-pathname :name "confusables"
1050 :type "lisp-expr"
1051 :defaults *output-directory*)
1052 :direction :output
1053 :if-exists :supersede
1054 :if-does-not-exist :create)
1055 (with-standard-io-syntax
1056 (let ((*print-pretty* t))
1057 (prin1 *confusables*))))
1058 (with-open-file (*standard-output*
1059 (make-pathname :name "bidi-mirrors"
1060 :type "lisp-expr"
1061 :defaults *output-directory*)
1062 :direction :output
1063 :if-exists :supersede
1064 :if-does-not-exist :create)
1065 (with-standard-io-syntax
1066 (let ((*print-pretty* t))
1067 (prin1 *bidi-mirroring-glyphs*))))
1068 (with-open-file (*standard-output*
1069 (make-pathname :name "blocks"
1070 :type "lisp-expr"
1071 :defaults *output-directory*)
1072 :direction :output
1073 :if-exists :supersede
1074 :if-does-not-exist :create)
1075 (with-standard-io-syntax
1076 (let ((*print-pretty* t))
1077 (prin1 *block-ranges*))))
1078 (values))