Remove !begin-instruction-definitions.
[sbcl.git] / tools-for-build / ucd.lisp
blobfddee508d76fec6f35153a4dfd8bdaed14fb8e03
1 (in-package "SB-COLD")
3 ;;; Common functions
5 (defparameter *output-directory*
6 (merge-pathnames
7 (make-pathname :directory '(:relative :up "output"))
8 (make-pathname :directory (pathname-directory *load-truename*))))
10 (defparameter *unicode-character-database*
11 (make-pathname :directory (pathname-directory *load-truename*)))
13 (defmacro with-input-txt-file ((s name) &body body)
14 `(with-open-file (,s (make-pathname :name ,name :type "txt"
15 :defaults *unicode-character-database*))
16 ,@body))
18 (defmacro with-output-dat-file ((s name) &body body)
19 `(with-open-file (,s (make-pathname :name ,name :type "dat"
20 :defaults *output-directory*)
21 :direction :output :element-type '(unsigned-byte 8)
22 :if-exists :supersede :if-does-not-exist :create)
23 ,@body))
25 (defmacro with-ucd-output-syntax (&body body)
26 `(with-standard-io-syntax
27 (let ((*readtable* (copy-readtable))
28 (*print-readably* nil)
29 (*print-pretty* t))
30 ,@body)))
32 (defmacro with-output-lisp-expr-file ((s name) &body body)
33 `(with-open-file (,s (make-pathname :name ,name :type "lisp-expr"
34 :defaults *output-directory*)
35 :direction :output :element-type 'character
36 :if-exists :supersede :if-does-not-exist :create)
37 (with-ucd-output-syntax
38 ,@body)))
40 (defun split-string (line character)
41 (loop for prev-position = 0 then (1+ position)
42 for position = (position character line :start prev-position)
43 collect (subseq line prev-position position)
44 do (unless position
45 (loop-finish))))
47 (defun parse-codepoints (string &key (singleton-list t))
48 "Gets a list of codepoints out of 'aaaa bbbb cccc', stripping surrounding space"
49 (let ((list (mapcar
50 (lambda (s) (parse-integer s :radix 16))
51 (remove "" (split-string string #\Space) :test #'string=))))
52 (if (not (or (cdr list) singleton-list)) (car list) list)))
55 (defun parse-codepoint-range (string)
56 "Parse the Unicode syntax DDDD|DDDD..DDDD into an inclusive range (start end)"
57 (destructuring-bind (start &optional empty end) (split-string string #\.)
58 (declare (ignore empty))
59 (let* ((head (parse-integer start :radix 16))
60 (tail (if end
61 (parse-integer end :radix 16 :end (position #\Space end))
62 head)))
63 (list head tail))))
65 (defun init-indices (strings)
66 (let ((hash (make-hash-table :test #'equal)))
67 (loop for string in strings
68 for index from 0
69 do (setf (gethash string hash) index))
70 hash))
72 (defun clear-flag (bit integer)
73 (logandc2 integer (ash 1 bit)))
76 ;;; Output storage globals
77 (defstruct ucd misc decomp)
79 (defparameter *unicode-names* (make-hash-table))
80 (defparameter *unicode-1-names* (make-hash-table))
82 (defparameter *decompositions*
83 (make-array 10000 :element-type '(unsigned-byte 24) :fill-pointer 0
84 :adjustable t)) ; 10000 is not a significant number
86 (defparameter *decomposition-corrections*
87 (with-input-txt-file (s "NormalizationCorrections")
88 (loop with result = nil
89 for line = (read-line s nil nil) while line
90 do (when (position #\; line)
91 (destructuring-bind (cp old-decomp correction version)
92 (split-string line #\;)
93 (declare (ignore old-decomp version))
94 (push (cons (parse-integer cp :radix 16)
95 (parse-integer correction :radix 16))
96 result)))
97 finally (return result)))
98 "List of decompsotions that were amended in Unicode corrigenda")
100 (defparameter *compositions* (make-hash-table :test #'equal))
101 (defparameter *composition-exclusions*
102 (with-input-txt-file (s "CompositionExclusions")
103 (loop with result = nil
104 for line = (read-line s nil nil) while line
105 when (and (> (length line) 0) (char/= (char line 0) #\#))
106 do (push (parse-integer line :end (position #\Space line) :radix 16)
107 result) finally (return result)))
108 "Characters that are excluded from composition according to UAX#15")
110 (defparameter *different-titlecases* nil)
111 (defparameter *different-casefolds* nil)
113 (defparameter *case-mapping*
114 (with-input-txt-file (s "SpecialCasing")
115 (loop with hash = (make-hash-table)
116 for line = (read-line s nil nil) while line
117 unless (or (not (position #\# line)) (= 0 (position #\# line)))
118 do (destructuring-bind (%cp %lower %title %upper &optional context comment)
119 (split-string line #\;)
120 (unless (and context comment)
121 (let ((cp (parse-integer %cp :radix 16))
122 (lower (parse-codepoints %lower :singleton-list nil))
123 (title (parse-codepoints %title :singleton-list nil))
124 (upper (parse-codepoints %upper :singleton-list nil)))
125 (setf (gethash cp hash) (cons upper lower))
126 (unless (equal title upper) (push (cons cp title) *different-titlecases*)))))
127 finally (return hash)))
128 "Maps cp -> (cons uppercase|(uppercase ...) lowercase|(lowercase ...))")
130 (defparameter *misc-table* (make-array 3000 :fill-pointer 0)
131 "Holds the entries in the Unicode database's miscellanious array, stored as lists.
132 These lists have the form (gc-index bidi-index ccc digit decomposition-info
133 flags script line-break age). Flags is a bit-bashed integer containing
134 cl-both-case-p, has-case-p, and bidi-mirrored-p, and an east asian width.
135 Length should be adjusted when the standard changes.")
136 (defparameter *misc-hash* (make-hash-table :test #'equal)
137 "Maps a misc list to its position in the misc table.")
139 (defparameter *different-numerics* nil)
141 (defparameter *ucd-entries* (make-hash-table))
143 ;; Mappings of the general categories and bidi classes to integers
144 ;; Letter classes go first to optimize certain cl character type checks
145 ;; BN is the first BIDI class so that unallocated characters are BN
146 ;; Uppercase in the CL sense must have GC = 0, lowercase must GC = 1
147 (defparameter *general-categories*
148 (init-indices '("Lu" "Ll" "Lt" "Lm" "Lo" "Cc" "Cf" "Co" "Cs" "Cn"
149 "Mc" "Me" "Mn" "Nd" "Nl" "No" "Pc" "Pd" "Pe" "Pf"
150 "Pi" "Po" "Ps" "Sc" "Sk" "Sm" "So" "Zl" "Zp" "Zs")))
151 (defparameter *bidi-classes*
152 (init-indices '("BN" "AL" "AN" "B" "CS" "EN" "ES" "ET" "L" "LRE" "LRO"
153 "NSM" "ON" "PDF" "R" "RLE" "RLO" "S" "WS" "LRI" "RLI"
154 "FSI" "PDI")))
155 (defparameter *east-asian-widths* (init-indices '("N" "A" "H" "W" "F" "Na")))
156 (defparameter *scripts*
157 (init-indices
158 '("Unknown" "Common" "Latin" "Greek" "Cyrillic" "Armenian" "Hebrew" "Arabic"
159 "Syriac" "Thaana" "Devanagari" "Bengali" "Gurmukhi" "Gujarati" "Oriya"
160 "Tamil" "Telugu" "Kannada" "Malayalam" "Sinhala" "Thai" "Lao" "Tibetan"
161 "Myanmar" "Georgian" "Hangul" "Ethiopic" "Cherokee" "Canadian_Aboriginal"
162 "Ogham" "Runic" "Khmer" "Mongolian" "Hiragana" "Katakana" "Bopomofo" "Han"
163 "Yi" "Old_Italic" "Gothic" "Deseret" "Inherited" "Tagalog" "Hanunoo" "Buhid"
164 "Tagbanwa" "Limbu" "Tai_Le" "Linear_B" "Ugaritic" "Shavian" "Osmanya"
165 "Cypriot" "Braille" "Buginese" "Coptic" "New_Tai_Lue" "Glagolitic"
166 "Tifinagh" "Syloti_Nagri" "Old_Persian" "Kharoshthi" "Balinese" "Cuneiform"
167 "Phoenician" "Phags_Pa" "Nko" "Sundanese" "Lepcha" "Ol_Chiki" "Vai"
168 "Saurashtra" "Kayah_Li" "Rejang" "Lycian" "Carian" "Lydian" "Cham"
169 "Tai_Tham" "Tai_Viet" "Avestan" "Egyptian_Hieroglyphs" "Samaritan" "Lisu"
170 "Bamum" "Javanese" "Meetei_Mayek" "Imperial_Aramaic" "Old_South_Arabian"
171 "Inscriptional_Parthian" "Inscriptional_Pahlavi" "Old_Turkic" "Kaithi"
172 "Batak" "Brahmi" "Mandaic" "Chakma" "Meroitic_Cursive"
173 "Meroitic_Hieroglyphs" "Miao" "Sharada" "Sora_Sompeng" "Takri"
174 "Bassa_Vah" "Mahajani" "Pahawh_Hmong" "Caucasian_Albanian" "Manichaean"
175 "Palmyrene" "Duployan" "Mende_Kikakui" "Pau_Cin_Hau" "Elbasan" "Modi"
176 "Psalter_Pahlavi" "Grantha" "Mro" "Siddham" "Khojki" "Nabataean" "Tirhuta"
177 "Khudawadi" "Old_North_Arabian" "Warang_Citi" "Linear_A" "Old_Permic")))
178 (defparameter *line-break-classes*
179 (init-indices
180 '("XX" "AI" "AL" "B2" "BA" "BB" "BK" "CB" "CJ" "CL" "CM" "CP" "CR" "EX" "GL"
181 "HL" "HY" "ID" "IN" "IS" "LF" "NL" "NS" "NU" "OP" "PO" "PR" "QU" "RI" "SA"
182 "SG" "SP" "SY" "WJ" "ZW")))
184 (defparameter *east-asian-width-table*
185 (with-input-txt-file (s "EastAsianWidth")
186 (loop with hash = (make-hash-table)
187 for line = (read-line s nil nil) while line
188 unless (or (not (position #\# line)) (= 0 (position #\# line)))
189 do (destructuring-bind (codepoints value)
190 (split-string
191 (string-right-trim " " (subseq line 0 (position #\# line))) #\;)
192 (let ((range (parse-codepoint-range codepoints))
193 (index (gethash value *east-asian-widths*)))
194 (loop for i from (car range) to (cadr range)
195 do (setf (gethash i hash) index))))
196 finally (return hash)))
197 "Table of East Asian Widths. Used in the creation of misc entries.")
199 (defparameter *script-table*
200 (with-input-txt-file (s "Scripts")
201 (loop with hash = (make-hash-table)
202 for line = (read-line s nil nil) while line
203 unless (or (not (position #\# line)) (= 0 (position #\# line)))
204 do (destructuring-bind (codepoints value)
205 (split-string
206 (string-right-trim " " (subseq line 0 (position #\# line))) #\;)
207 (let ((range (parse-codepoint-range codepoints))
208 (index (gethash (subseq value 1) *scripts*)))
209 (loop for i from (car range) to (cadr range)
210 do (setf (gethash i hash) index))))
211 finally (return hash)))
212 "Table of scripts. Used in the creation of misc entries.")
214 (defparameter *line-break-class-table*
215 (with-input-txt-file (s "LineBreakProperty")
216 (loop with hash = (make-hash-table)
217 for line = (read-line s nil nil) while line
218 unless (or (not (position #\# line)) (= 0 (position #\# line)))
219 do (destructuring-bind (codepoints value)
220 (split-string
221 (string-right-trim " " (subseq line 0 (position #\# line))) #\;)
222 (let ((range (parse-codepoint-range codepoints))
223 ;; Hangul syllables temporarily go to "Unkwown"
224 (index (gethash value *line-break-classes* 0)))
225 (loop for i from (car range) to (cadr range)
226 do (setf (gethash i hash) index))))
227 finally (return hash)))
228 "Table of line break classes. Used in the creation of misc entries.")
230 (defparameter *age-table*
231 (with-input-txt-file (s "DerivedAge")
232 (loop with hash = (make-hash-table)
233 for line = (read-line s nil nil) while line
234 unless (or (not (position #\# line)) (= 0 (position #\# line)))
235 do (destructuring-bind (codepoints value)
236 (split-string
237 (string-right-trim " " (subseq line 0 (position #\# line))) #\;)
238 (let* ((range (parse-codepoint-range codepoints))
239 (age-parts (mapcar #'parse-integer (split-string value #\.)))
240 (age (logior (ash (car age-parts) 3) (cadr age-parts))))
241 (loop for i from (car range) to (cadr range)
242 do (setf (gethash i hash) age))))
243 finally (return hash)))
244 "Table of character ages. Used in the creation of misc entries.")
246 (defvar *block-first* nil)
249 ;;; Unicode data file parsing
250 (defun hash-misc (gc-index bidi-index ccc digit decomposition-info flags
251 script line-break age)
252 (let* ((list (list gc-index bidi-index ccc digit decomposition-info flags
253 script line-break age))
254 (index (gethash list *misc-hash*)))
255 (or index
256 (progn
257 (setf (gethash list *misc-hash*)
258 (fill-pointer *misc-table*))
259 (when (eql nil (vector-push list *misc-table*))
260 (error "Misc table too small."))
261 (gethash list *misc-hash*)))))
263 (defun ordered-ranges-member (item vector)
264 (labels ((recurse (start end)
265 (when (< start end)
266 (let* ((i (+ start (truncate (- end start) 2)))
267 (index (* 2 i))
268 (elt1 (svref vector index))
269 (elt2 (svref vector (1+ index))))
270 (cond ((< item elt1)
271 (recurse start i))
272 ((> item elt2)
273 (recurse (+ 1 i) end))
275 item))))))
276 (recurse 0 (/ (length vector) 2))))
278 (defun unallocated-bidi-class (code-point)
279 ;; See tests/data/DerivedBidiClass.txt for more information
280 (flet ((in (vector class)
281 (when (ordered-ranges-member code-point vector)
282 (gethash class *bidi-classes*))))
283 (cond
284 ((in
285 #(#x0600 #x07BF #x08A0 #x08FF #xFB50 #xFDCF #xFDF0 #xFDFF #xFE70 #xFEFF
286 #x1EE00 #x1EEFF) "AL"))
287 ((in
288 #(#x0590 #x05FF #x07C0 #x089F #xFB1D #xFB4F #x10800 #x10FFF #x1E800 #x1EDFF
289 #x1EF00 #x1EFFF) "R"))
290 ((in #(#x20A0 #x20CF) "ET"))
291 ;; BN is non-characters and default-ignorable.
292 ;; Default-ignorable will be dealt with elsewhere
293 ((in #(#xFDD0 #xFDEF #xFFFE #xFFFF #x1FFFE #x1FFFF #x2FFFE #x2FFFF
294 #x3FFFE #x3FFFF #x4FFFE #x4FFFF #x5FFFE #x5FFFF #x6FFFE #x6FFFF
295 #x7FFFE #x7FFFF #x8FFFE #x8FFFF #x9FFFE #x9FFFF #xAFFFE #xAFFFF
296 #xBFFFE #xBFFFF #xCFFFE #xCFFFF #xDFFFE #xDFFFF #xEFFFE #xEFFFF
297 #xFFFFE #xFFFFF #x10FFFE #x10FFFF)
298 "BN"))
299 ((in #(#x0 #x10FFFF) "L"))
300 (t (error "Somehow we've gone too far in unallocated bidi determination")))))
302 (defun complete-misc-table ()
303 (loop for code-point from 0 to #x10FFFF do ; Flood-fil unallocated codepoints
304 (unless (second (multiple-value-list (gethash code-point *ucd-entries*)))
305 (let* ((unallocated-misc
306 ;; unallocated characters have a GC of "Cn", aren't digits
307 ;; (digit = 128), have a bidi that depends on their block, and
308 ;; don't decompose, combine, or have case. They have an East
309 ;; Asian Width (eaw) of "N" (0), and a script, line breaking
310 ;; class, and age of 0 ("Unknown"), unless some of those
311 ;; properties are otherwise assigned.
312 `(,(gethash "Cn" *general-categories*)
313 ,(unallocated-bidi-class code-point) 0 128 0
314 ,(gethash code-point *east-asian-width-table* 0)
315 0 ,(gethash code-point *line-break-class-table* 0)
316 ,(gethash code-point *age-table* 0)))
317 (unallocated-index (apply #'hash-misc unallocated-misc))
318 (unallocated-ucd (make-ucd :misc unallocated-index)))
319 (setf (gethash code-point *ucd-entries*) unallocated-ucd)))))
321 (defun expand-decomposition (decomposition)
322 (loop for cp in decomposition
323 for ucd = (gethash cp *ucd-entries*)
324 for length = (elt (aref *misc-table* (ucd-misc ucd)) 4)
325 if (and (not (logbitp 7 length))
326 (plusp length))
327 append (expand-decomposition (ucd-decomp ucd))
328 else
329 collect cp))
331 ;;; Recursively expand canonical decompositions
332 (defun fixup-decompositions ()
333 (loop for did-something = nil
335 (loop for ucd being each hash-value of *ucd-entries*
336 when (and (ucd-decomp ucd)
337 (not (logbitp 7 (elt (aref *misc-table* (ucd-misc ucd)) 4))))
339 (let ((expanded (expand-decomposition (ucd-decomp ucd))))
340 (unless (equal expanded (ucd-decomp ucd))
341 (setf (ucd-decomp ucd) expanded
342 did-something t))))
343 while did-something)
344 (loop for i below (hash-table-count *ucd-entries*)
345 for ucd = (gethash i *ucd-entries*)
346 for decomp = (ucd-decomp ucd)
348 (setf (ucd-decomp ucd)
349 (cond ((not (consp decomp)) 0)
350 ((logbitp 7 (elt (aref *misc-table* (ucd-misc ucd)) 4))
351 (prog1 (length *decompositions*)
352 (loop for cp in decomp
353 do (vector-push-extend cp *decompositions*))))
355 (let ((misc-entry (copy-list (aref *misc-table* (ucd-misc ucd)))))
356 (setf (elt misc-entry 4) (length decomp)
357 (ucd-misc ucd) (apply #'hash-misc misc-entry))
358 (prog1 (length *decompositions*)
359 (loop for cp in decomp
360 do (vector-push-extend cp *decompositions*)))))))))
362 (defun fixup-compositions ()
363 (flet ((fixup (k v)
364 (declare (ignore v))
365 (let* ((cp (car k))
366 (ucd (gethash cp *ucd-entries*))
367 (misc (aref *misc-table* (ucd-misc ucd)))
368 (ccc (third misc)))
369 ;; we can do everything in the first pass except for
370 ;; accounting for decompositions where the first
371 ;; character of the decomposition is not a starter.
372 (when (/= ccc 0)
373 (remhash k *compositions*)))))
374 (maphash #'fixup *compositions*)))
376 (defun add-jamo-information (line table)
377 (let* ((split (split-string line #\;))
378 (code (parse-integer (first split) :radix 16))
379 (syllable (string-trim
381 (subseq (second split) 0 (position #\# (second split))))))
382 (setf (gethash code table) syllable)))
384 (defun fixup-hangul-syllables ()
385 ;; "Hangul Syllable Composition, Unicode 5.1 section 3-12"
386 (let* ((sbase #xac00)
387 (lbase #x1100)
388 (vbase #x1161)
389 (tbase #x11a7)
390 (scount 11172)
391 (lcount 19)
392 (vcount 21)
393 (tcount 28)
394 (ncount (* vcount tcount))
395 (table (make-hash-table)))
396 (declare (ignore lcount))
397 (with-input-txt-file (*standard-input* "Jamo")
398 (loop for line = (read-line nil nil)
399 while line
400 if (position #\; line)
401 do (add-jamo-information line table)))
402 (dotimes (sindex scount)
403 (let* ((l (+ lbase (floor sindex ncount)))
404 (v (+ vbase (floor (mod sindex ncount) tcount)))
405 (tee (+ tbase (mod sindex tcount)))
406 (code-point (+ sbase sindex))
407 (name (format nil "HANGUL_SYLLABLE_~A~A~:[~A~;~]"
408 (gethash l table) (gethash v table)
409 (= tee tbase) (gethash tee table))))
410 (setf (gethash code-point *unicode-names*) name)))))
412 (defun normalize-character-name (name)
413 (when (find #\_ name)
414 (error "Bad name for a character: ~A" name))
415 ;; U+1F5CF (PAGE)'s name conflicts with the ANSI CL-assigned
416 ;; name for form feed (^L, U+000C). To avoid a case where
417 ;; more than one character has a particular name while remaining
418 ;; standards-compliant, we remove U+1F5CF's name here.
419 (when (string= name "PAGE")
420 (return-from normalize-character-name "UNICODE_PAGE"))
421 (unless (or (zerop (length name)) (find #\< name) (find #\> name))
422 (substitute #\_ #\Space name)))
424 ;;; 3400 -- 4DB5 : cjk ideograph extension a ;Lo;0;L;;;;;N;;;;;
425 ;;; AC00 -- D7A3 : hangul syllables ;Lo;0;L;;;;;N;;;;;
426 ;;; D800 -- F8FF : surrogates and private use
427 ;;; 20000 -- 2A6D6 : cjk ideograph extension b ;Lo;0;L;;;;;N;;;;;
428 ;;; F0000 -- FFFFD : private use
429 ;;; 100000 -- 10FFFD: private use
430 (defun encode-ucd-line (line code-point)
431 (destructuring-bind (name general-category canonical-combining-class
432 bidi-class decomposition-type-and-mapping
433 decimal-digit digit numeric bidi-mirrored
434 unicode-1-name iso-10646-comment simple-uppercase
435 simple-lowercase simple-titlecase)
436 line
437 (declare (ignore iso-10646-comment))
438 (if (and (> (length name) 8)
439 (string= ", First>" name :start2 (- (length name) 8)))
440 (progn
441 (setf *block-first* code-point)
442 nil)
443 (let* ((gc-index (or (gethash general-category *general-categories*)
444 (error "unknown general category ~A"
445 general-category)))
446 (bidi-index (or (gethash bidi-class *bidi-classes*)
447 (error "unknown bidirectional class ~A"
448 bidi-class)))
449 (ccc (parse-integer canonical-combining-class))
450 (digit-index (if (string= "" digit) 128 ; non-digits have high bit
451 (let ((%digit (parse-integer digit)))
452 (if (string= digit decimal-digit)
453 ;; decimal-digit-p is in bit 6
454 (logior (ash 1 6) %digit) %digit))))
455 (upper-index (unless (string= "" simple-uppercase)
456 (parse-integer simple-uppercase :radix 16)))
457 (lower-index (unless (string= "" simple-lowercase)
458 (parse-integer simple-lowercase :radix 16)))
459 (title-index (unless (string= "" simple-titlecase)
460 (parse-integer simple-titlecase :radix 16)))
461 (cl-both-case-p (or (and (= gc-index 0) lower-index)
462 (and (= gc-index 1) upper-index)))
463 (bidi-mirrored-p (string= bidi-mirrored "Y"))
464 (decomposition-info 0)
465 (eaw-index (gethash code-point *east-asian-width-table*))
466 (script-index (gethash code-point *script-table* 0))
467 (line-break-index (gethash code-point *line-break-class-table* 0))
468 (age-index (gethash code-point *age-table* 0))
469 decomposition)
470 #+nil
471 (when (and (not cl-both-case-p)
472 (< gc-index 2))
473 (format t "~A~%" name))
475 (when (string/= "" decomposition-type-and-mapping)
476 (let* ((compatibility-p (position #\> decomposition-type-and-mapping)))
477 (setf decomposition
478 (parse-codepoints
479 (subseq decomposition-type-and-mapping
480 (if compatibility-p (1+ compatibility-p) 0))))
481 (when (assoc code-point *decomposition-corrections*)
482 (setf decomposition
483 (list (cdr (assoc code-point *decomposition-corrections*)))))
484 (setf decomposition-info
485 (logior (length decomposition) (if compatibility-p 128 0)))
486 (unless compatibility-p
487 ;; Primary composition excludes:
488 ;; * singleton decompositions;
489 ;; * decompositions of non-starters;
490 ;; * script-specific decompositions;
491 ;; * later-version decompositions;
492 ;; * decompositions whose first character is a
493 ;; non-starter.
494 ;; All but the last case can be handled here;
495 ;; for the fixup, see FIXUP-COMPOSITIONS
496 (when (and (> decomposition-info 1)
497 (= ccc 0)
498 (not (member code-point *composition-exclusions*)))
499 (unless (= decomposition-info 2)
500 (error "canonical decomposition unexpectedly long"))
501 (setf (gethash (cons (first decomposition)
502 (second decomposition))
503 *compositions*)
504 code-point)))))
505 ;; Hangul decomposition; see Unicode 6.2 section 3-12
506 (when (= code-point #xd7a3)
507 ;; KLUDGE: The decomposition-length for Hangul syllables in the
508 ;; misc database will be a bit of a lie. It doesn't really matter
509 ;; since the only purpose of the length is to index into the
510 ;; decompositions array (which Hangul decomposition doesn't use).
511 ;; The decomposition index is 0 because we won't be going into the
512 ;; array
513 (setf decomposition-info 3))
515 (unless (gethash code-point *case-mapping*) ; Exclude codepoints from SpecialCasing
516 (when (string/= simple-uppercase simple-titlecase)
517 (push (cons code-point title-index) *different-titlecases*))
518 (and (or upper-index lower-index)
519 (setf (gethash code-point *case-mapping*)
520 (cons
521 (or upper-index code-point)
522 (or lower-index code-point)))))
524 (when (string/= digit numeric)
525 (push (cons code-point numeric) *different-numerics*))
527 (when (> ccc 255)
528 (error "canonical combining class too large ~A" ccc))
529 (let* ((flags (logior
530 (if cl-both-case-p (ash 1 7) 0)
531 (if (gethash code-point *case-mapping*) (ash 1 6) 0)
532 (if bidi-mirrored-p (ash 1 5) 0)
533 eaw-index))
534 (misc-index (hash-misc gc-index bidi-index ccc digit-index
535 decomposition-info flags script-index
536 line-break-index age-index))
537 (result (make-ucd :misc misc-index
538 :decomp decomposition)))
539 (when (and (> (length name) 7)
540 (string= ", Last>" name :start2 (- (length name) 7)))
541 ;; We can still do this despite East Asian Width being in the
542 ;; databasce since each of the UCD <First><Last> blocks
543 ;; has a consistent East Asian Width
544 (loop for point from *block-first* to code-point do
545 (setf (gethash point *ucd-entries*) result)))
546 (values result (normalize-character-name name)
547 (normalize-character-name unicode-1-name)))))))
549 (defun slurp-ucd-line (line)
550 (let* ((split-line (split-string line #\;))
551 (code-point (parse-integer (first split-line) :radix 16)))
552 (multiple-value-bind (encoding name unicode-1-name)
553 (encode-ucd-line (cdr split-line) code-point)
554 (setf (gethash code-point *ucd-entries*) encoding
555 (gethash code-point *unicode-names*) name)
556 (when unicode-1-name
557 (setf (gethash code-point *unicode-1-names*) unicode-1-name)))))
559 ;;; this fixes up the case conversion discrepancy between CL and
560 ;;; Unicode: CL operators depend on char-downcase / char-upcase being
561 ;;; inverses, which is not true in general in Unicode even for
562 ;;; characters which change case to single characters.
563 ;;; Also, fix misassigned age values, which are not constant across blocks
564 (defun second-pass ()
565 (let ((case-mapping
566 (sort (loop for code-point being the hash-keys in *case-mapping*
567 using (hash-value value)
568 collect (cons code-point value))
569 #'< :key #'car)))
570 (loop for (code-point upper . lower) in case-mapping
571 for misc-index = (ucd-misc (gethash code-point *ucd-entries*))
572 for (gc bidi ccc digit decomp flags script lb age) = (aref *misc-table* misc-index)
573 when (logbitp 7 flags) do
574 (when (or (not (atom upper)) (not (atom lower))
575 (and (= gc 0)
576 (not (equal (car (gethash lower *case-mapping*)) code-point)))
577 (and (= gc 1)
578 (not (equal (cdr (gethash upper *case-mapping*)) code-point))))
579 (let* ((new-flags (clear-flag 7 flags))
580 (new-misc (hash-misc gc bidi ccc digit decomp new-flags script lb age)))
581 (setf (ucd-misc (gethash code-point *ucd-entries*)) new-misc))))))
583 (defun fixup-casefolding ()
584 (with-input-txt-file (s "CaseFolding")
585 (loop for line = (read-line s nil nil)
586 while line
587 unless (or (not (position #\; line)) (equal (position #\# line) 0))
588 do (destructuring-bind (original type mapping comment)
589 (split-string line #\;)
590 (declare (ignore comment))
591 (let ((cp (parse-integer original :radix 16))
592 (fold (parse-codepoints mapping :singleton-list nil)))
593 (unless (or (string= type " S") (string= type " T"))
594 (when (not (equal (cdr (gethash cp *case-mapping*)) fold))
595 (push (cons cp fold) *different-casefolds*))))))))
597 (defun fixup-ages ()
598 (let ((age (sort
599 (loop for code-point being the hash-keys in *age-table*
600 using (hash-value true-age)
601 collect (cons code-point true-age))
602 #'< :key #'car)))
603 (loop for (code-point . true-age) in age
604 for misc-index = (ucd-misc (gethash code-point *ucd-entries*))
605 for (gc bidi ccc digit decomp flags script lb age) = (aref *misc-table* misc-index)
606 unless (= age true-age) do
607 (let* ((new-misc (hash-misc gc bidi ccc digit decomp flags script lb true-age))
608 (new-ucd (make-ucd
609 :misc new-misc
610 :decomp (ucd-decomp (gethash code-point *ucd-entries*)))))
611 (setf (gethash code-point *ucd-entries*) new-ucd)))))
613 (defun slurp-ucd ()
614 (with-input-txt-file (*standard-input* "UnicodeData")
615 (format t "~%//slurp-ucd~%")
616 (loop for line = (read-line nil nil)
617 while line
618 do (slurp-ucd-line line)))
619 (second-pass)
620 (fixup-compositions)
621 (fixup-hangul-syllables)
622 (complete-misc-table)
623 (fixup-casefolding)
624 (fixup-ages)
625 (fixup-decompositions)
626 nil)
629 ;;; PropList.txt
630 (defparameter **proplist-properties** nil
631 "A list of properties extracted from PropList.txt")
633 (defun parse-property (stream &optional name)
634 (let ((result (make-array 1 :fill-pointer 0 :adjustable t)))
635 ;; FIXME: something in this loop provokes a warning from CLISP
636 (loop for line = (read-line stream nil nil)
637 ;; Deal with Blah=Blah in DerivedNormalizationProps.txt
638 while (and line (not (position #\= (substitute #\Space #\= line :count 1))))
639 for entry = (subseq line 0 (position #\# line))
640 when (and entry (string/= entry ""))
642 (destructuring-bind (start end)
643 (parse-codepoint-range (car (split-string entry #\;)))
644 (vector-push-extend start result)
645 (vector-push-extend end result)))
646 (when name
647 (push name **proplist-properties**)
648 (push result **proplist-properties**))))
650 (defun slurp-proplist ()
651 (with-input-txt-file (s "PropList")
652 (parse-property s) ;; Initial comments
653 (parse-property s :white-space)
654 (parse-property s :bidi-control)
655 (parse-property s :join-control)
656 (parse-property s :dash)
657 (parse-property s :hyphen)
658 (parse-property s :quotation-mark)
659 (parse-property s :terminal-punctuation)
660 (parse-property s :other-math)
661 (parse-property s :hex-digit)
662 (parse-property s :ascii-hex-digit)
663 (parse-property s :other-alphabetic)
664 (parse-property s :ideographic)
665 (parse-property s :diacritic)
666 (parse-property s :extender)
667 (parse-property s :other-lowercase)
668 (parse-property s :other-uppercase)
669 (parse-property s :noncharacter-code-point)
670 (parse-property s :other-grapheme-extend)
671 (parse-property s :ids-binary-operator)
672 (parse-property s :ids-trinary-operator)
673 (parse-property s :radical)
674 (parse-property s :unified-ideograph)
675 (parse-property s :other-default-ignorable-code-point)
676 (parse-property s :deprecated)
677 (parse-property s :soft-dotted)
678 (parse-property s :logical-order-exception)
679 (parse-property s :other-id-start)
680 (parse-property s :other-id-continue)
681 (parse-property s :sterm)
682 (parse-property s :variation-selector)
683 (parse-property s :pattern-white-space)
684 (parse-property s :pattern-syntax))
686 (with-input-txt-file (s "DerivedNormalizationProps")
687 (parse-property s) ;; Initial comments
688 (parse-property s) ;; FC_NFKC_Closure
689 (parse-property s) ;; FC_NFKC_Closure
690 (parse-property s) ;; Full_Composition_Exclusion
691 (parse-property s) ;; NFD_QC Comments
692 (parse-property s :nfd-qc)
693 (parse-property s) ;; NFC_QC Comments
694 (parse-property s :nfc-qc)
695 (parse-property s :nfc-qc-maybe)
696 (parse-property s) ;; NFKD_QC Comments
697 (parse-property s :nfkd-qc)
698 (parse-property s) ;; NFKC_QC Comments
699 (parse-property s :nfkc-qc)
700 (parse-property s :nfkc-qc-maybe))
701 (setf **proplist-properties** (nreverse **proplist-properties**))
702 (values))
705 ;;; Collation keys
706 (defvar *maximum-variable-key* 1)
708 (defun bitpack-collation-key (primary secondary tertiary)
709 ;; 0 <= primary <= #xFFFD (default table)
710 ;; 0 <= secondary <= #x10C [9 bits]
711 ;; 0 <= tertiary <= #x1E (#x1F allowed) [5 bits]
712 ;; Because of this, the bit packs don't overlap
713 (logior (ash primary 16) (ash secondary 5) tertiary))
715 (defun parse-collation-line (line)
716 (destructuring-bind (%code-points %keys) (split-string line #\;)
717 (let* ((code-points (parse-codepoints %code-points))
718 (keys
719 (remove
721 (split-string (remove #\[ (remove #\Space %keys)) #\]) :test #'string=))
722 (ret
723 (loop for key in keys
724 for variable-p = (position #\* key)
725 for parsed =
726 ;; Don't need first value, it's always just ""
727 (cdr (mapcar (lambda (x) (parse-integer x :radix 16 :junk-allowed t))
728 (split-string (substitute #\. #\* key) #\.)))
729 collect
730 (destructuring-bind (primary secondary tertiary) parsed
731 (when variable-p (setf *maximum-variable-key*
732 (max primary *maximum-variable-key*)))
733 (bitpack-collation-key primary secondary tertiary)))))
734 (values code-points ret))))
736 (defparameter *collation-table*
737 (with-input-txt-file (stream "Allkeys70")
738 (loop with hash = (make-hash-table :test #'equal)
739 for line = (read-line stream nil nil) while line
740 unless (eql 0 (position #\# line))
741 do (multiple-value-bind (codepoints keys) (parse-collation-line line)
742 (setf (gethash codepoints hash) keys))
743 finally (return hash))))
746 ;;; Other properties
747 (defparameter *confusables*
748 (with-input-txt-file (s "ConfusablesEdited")
749 (loop for line = (read-line s nil nil) while line
750 unless (eql 0 (position #\# line))
751 collect (mapcar #'parse-codepoints (split-string line #\<))))
752 "List of confusable codepoint sets")
754 (defparameter *bidi-mirroring-glyphs*
755 (with-input-txt-file (s "BidiMirroring")
756 (loop for line = (read-line s nil nil) while line
757 when (and (plusp (length line))
758 (char/= (char line 0) #\#))
759 collect
760 (mapcar
761 #'(lambda (c) (parse-codepoints c :singleton-list nil))
762 (split-string (subseq line 0 (position #\# line)) #\;))))
763 "List of BIDI mirroring glyph pairs")
765 (defparameter *block-ranges*
766 (with-input-txt-file (stream "Blocks")
767 (loop with result = (make-array (* 252 2) :fill-pointer 0)
768 for line = (read-line stream nil nil) while line
769 unless (or (string= line "") (position #\# line))
771 (map nil #'(lambda (x) (vector-push x result))
772 (parse-codepoint-range (car (split-string line #\;))))
773 finally (return result)))
774 "Vector of block starts and ends in a form acceptable to `ordered-ranges-position`.
775 Used to look up block data.")
777 ;;; Output code
778 (defun write-codepoint (code-point stream)
779 (declare (type (unsigned-byte 32) code-point))
780 (write-byte (ldb (byte 8 16) code-point) stream)
781 (write-byte (ldb (byte 8 8) code-point) stream)
782 (write-byte (ldb (byte 8 0) code-point) stream))
784 (defun write-4-byte (value stream)
785 (declare (type (unsigned-byte 32) value))
786 (write-byte (ldb (byte 8 24) value) stream)
787 (write-byte (ldb (byte 8 16) value) stream)
788 (write-byte (ldb (byte 8 8) value) stream)
789 (write-byte (ldb (byte 8 0) value) stream))
791 (defun output-misc-data ()
792 (with-output-dat-file (stream "ucdmisc")
793 (loop for (gc-index bidi-index ccc digit decomposition-info flags
794 script line-break age)
795 across *misc-table*
796 ;; three bits spare here
797 do (write-byte gc-index stream)
798 ;; three bits spare here
799 (write-byte bidi-index stream)
800 (write-byte ccc stream)
801 ;; bits 0-3 encode [0,9], bit 7 is for non-digit status,
802 ;; bit 6 is the decimal-digit flag. Two bits spare
803 (write-byte digit stream)
804 (write-byte decomposition-info stream)
805 (write-byte flags stream) ; includes EAW in bits 0-3, bit 4 is free
806 (write-byte script stream)
807 (write-byte line-break stream)
808 (write-byte age stream))))
810 (defun output-ucd-data ()
811 (with-output-dat-file (high-pages "ucdhigh")
812 (with-output-dat-file (low-pages "ucdlow")
813 ;; Output either the index into the misc array (if all the points in the
814 ;; high-page have the same misc value) or an index into the law-pages
815 ;; array / 256. For indexes into the misc array, set bit 15 (high bit).
816 ;; We should never have that many misc entries, so that's not a problem.
818 ;; If Unicode ever allocates an all-decomposing <First>/<Last> block (the
819 ;; only way to get a high page that outputs as the same and has a
820 ;; non-zero decomposition-index, which there's nowhere to store now),
821 ;; find me, slap me with a fish, and have fun fixing this mess.
822 (loop with low-pages-index = 0
823 for high-page from 0 to (ash #x10FFFF -8)
824 for uniq-ucd-entries = nil do
825 (loop for low-page from 0 to #xFF do
826 (pushnew
827 (gethash (logior low-page (ash high-page 8)) *ucd-entries*)
828 uniq-ucd-entries :test #'equalp))
829 (flet ((write-2-byte (int stream)
830 (declare (type (unsigned-byte 16) int))
831 (write-byte (ldb (byte 8 8) int) stream)
832 (write-byte (ldb (byte 8 0) int) stream)))
833 (case (length uniq-ucd-entries)
834 (0 (error "Somehow, a high page has no codepoints in it."))
835 (1 (write-2-byte (logior
836 (ash 1 15)
837 (ucd-misc (car uniq-ucd-entries)))
838 high-pages))
839 (t (loop for low-page from 0 to #xFF
840 for cp = (logior low-page (ash high-page 8))
841 for entry = (gethash cp *ucd-entries*) do
842 (write-2-byte (ucd-misc entry) low-pages)
843 (write-2-byte (ucd-decomp entry) low-pages)
844 finally (write-2-byte low-pages-index high-pages)
845 (incf low-pages-index)))))
846 finally (assert (< low-pages-index (ash 1 15))) (print low-pages-index)))))
848 (defun output-decomposition-data ()
849 (with-output-dat-file (stream "decomp")
850 (loop for cp across *decompositions* do
851 (write-codepoint cp stream)))
852 (print (length *decompositions*)))
854 (defun output-composition-data ()
855 (with-output-dat-file (stream "comp")
856 (let (comp)
857 (maphash (lambda (k v) (push (cons k v) comp)) *compositions*)
858 (setq comp (sort comp #'< :key #'cdr))
859 (loop for (k . v) in comp
860 do (write-codepoint (car k) stream)
861 (write-codepoint (cdr k) stream)
862 (write-codepoint v stream)))))
864 (defun output-case-data ()
865 (let (casing-pages points-with-case)
866 (with-output-dat-file (stream "case")
867 (loop for cp being the hash-keys in *case-mapping*
868 do (push cp points-with-case))
869 (setf points-with-case (sort points-with-case #'<))
870 (loop for cp in points-with-case
871 for (upper . lower) = (gethash cp *case-mapping*) do
872 (pushnew (ash cp -6) casing-pages)
873 (write-codepoint cp stream)
874 (write-byte (if (atom upper) 0 (length upper)) stream)
875 (if (atom upper) (write-codepoint upper stream)
876 (map 'nil (lambda (c) (write-codepoint c stream)) upper))
877 (write-byte (if (atom lower) 0 (length lower)) stream)
878 (if (atom lower) (write-codepoint lower stream)
879 (map 'nil (lambda (c) (write-codepoint c stream)) lower))))
880 (setf casing-pages (sort casing-pages #'<))
881 (assert (< (length casing-pages) 256))
882 (let* ((size (1+ (reduce #'max casing-pages)))
883 (array (make-array size :initial-element 255))
884 (page -1))
885 (dolist (entry casing-pages)
886 (setf (aref array entry) (incf page)))
887 (with-output-dat-file (stream "casepages")
888 (dotimes (i size)
889 (write-byte (aref array i) stream))))
890 (with-output-lisp-expr-file (stream "casepages")
891 (print casing-pages stream))))
893 (defun output-collation-data ()
894 (with-output-dat-file (stream "collation")
895 (flet ((length-tag (list1 list2)
896 ;; takes two lists of UB32 (with the caveat that list1[0]
897 ;; needs its high 8 bits free (codepoints always have
898 ;; that) and do
899 (let* ((l1 (length list1)) (l2 (length list2))
900 (tag (dpb l1 (byte 4 28) (dpb l2 (byte 5 23) (car list1)))))
901 (assert (<= l1 3))
902 (write-4-byte tag stream)
903 (map nil #'(lambda (l) (write-4-byte l stream)) (append (cdr list1) list2)))))
904 (let (coll)
905 (maphash (lambda (k v) (push (cons k v) coll)) *collation-table*)
906 (labels ((sorter (o1 o2)
907 (cond
908 ((null o1) t)
909 ((null o2) nil)
910 (t (or (< (car o1) (car o2))
911 (and (= (car o1) (car o2))
912 (sorter (cdr o1) (cdr o2))))))))
913 (setq coll (sort coll #'sorter :key #'car)))
914 (loop for (k . v) in coll
915 do (length-tag k v)))))
916 (with-output-lisp-expr-file (*standard-output* "other-collation-info")
917 (write-string ";;; The highest primary variable collation index")
918 (terpri)
919 (prin1 *maximum-variable-key*) (terpri)))
921 (defun output ()
922 (output-misc-data)
923 (output-ucd-data)
924 (output-decomposition-data)
925 (output-composition-data)
926 (output-case-data)
927 (output-collation-data)
928 (with-output-lisp-expr-file (*standard-output* "misc-properties")
929 (prin1 **proplist-properties**))
931 (with-output-lisp-expr-file (f "ucd-names")
932 (write-string ";;; Do not edit by hand: generated by ucd.lisp" f)
933 (maphash (lambda (code name)
934 (when name
935 (print code f)
936 (prin1 name f)))
937 *unicode-names*)
938 (setf *unicode-names* nil))
939 (with-output-lisp-expr-file (f "ucd1-names")
940 (write-string ";;; Do not edit by hand: generated by ucd.lisp" f)
941 (maphash (lambda (code name)
942 (when name
943 (print code f)
944 (prin1 name f)))
945 *unicode-1-names*)
946 (setf *unicode-1-names* nil))
948 (with-output-lisp-expr-file (*standard-output* "numerics")
949 (let ((result (make-array (* (length *different-numerics*) 2))))
950 (loop for (code . value) in (sort *different-numerics* #'< :key #'car)
951 for i by 2
952 do (setf (aref result i) code
953 (aref result (1+ i)) (read-from-string value)))
954 (prin1 result)))
955 (with-output-lisp-expr-file (*standard-output* "titlecases")
956 (prin1 *different-titlecases*))
957 (with-output-lisp-expr-file (*standard-output* "foldcases")
958 (prin1 *different-casefolds*))
959 (with-output-lisp-expr-file (*standard-output* "confusables")
960 (prin1 *confusables*))
961 (with-output-lisp-expr-file (*standard-output* "bidi-mirrors")
962 (prin1 *bidi-mirroring-glyphs*))
963 (with-output-lisp-expr-file (*standard-output* "blocks")
964 (prin1 *block-ranges*))
965 (values))