Fix floor type derivation.
[sbcl.git] / tools-for-build / ucd.lisp
blobba6fa9f82c799fd708fb28409d76af082ac61ca1
1 (in-package "SB-COLD")
3 (declaim (optimize debug))
5 ;;; Common functions
7 (defvar *output-directory*
8 (merge-pathnames
9 (make-pathname :directory '(:relative :up "output" "ucd"))
10 (make-pathname :directory (pathname-directory *load-truename*))))
12 (defparameter *unicode-character-database*
13 (make-pathname :directory (pathname-directory *load-pathname*)))
15 (defmacro with-input-txt-file ((s name) &body body)
16 `(with-open-file (,s (make-pathname :name ,name :type "txt"
17 :defaults *unicode-character-database*))
18 (setf (gethash (format nil "tools-for-build/~A.txt" ,name) *ucd-inputs*) 'used)
19 ,@body))
21 (defmacro with-input-utf8-file
22 ((s name &key (eszets 0) (registereds 1) (copyrights 1)) &body body)
23 ;; KLUDGE: Unicode data files in general have registered and
24 ;; copyright marks (non-ASCII characters) in the header;
25 ;; additionally, CaseFolding.txt as distributed by Unicode contains
26 ;; a non-ASCII character, an eszet, within a comment to act as an
27 ;; example. We can't in general assume that our host lisp will let
28 ;; us read those, and we can't portably write that we don't care
29 ;; about the text content of anything on a line after a hash because
30 ;; text decoding happens at a lower level. So here we rewrite data
31 ;; files to exclude the UTF-8 sequences corresponding to those
32 ;; characters (and error if we see any other UTF-8 sequence).
33 (let ((in (gensym "IN"))
34 (out (gensym "OUT")))
35 `(let ((filename (format nil "~A.txt" ,name)))
36 (with-open-file (,in (make-pathname :name ,name :type "txt"
37 :defaults *unicode-character-database*)
38 :element-type '(unsigned-byte 8))
39 (setf (gethash (format nil "tools-for-build/~A.txt" ,name) *ucd-inputs*) 'used)
40 (with-open-file (,out (make-pathname :name ,name :type "txt"
41 :defaults *output-directory*)
42 :element-type '(unsigned-byte 8)
43 :direction :output
44 :if-exists :supersede
45 :if-does-not-exist :create)
46 (setf (gethash (format nil "output/ucd/~A.txt" ,name) *ucd-outputs*) 'made)
47 (do ((inbyte (read-byte ,in nil nil) (read-byte ,in nil nil))
48 (eszet (map '(vector (unsigned-byte 8)) 'char-code "<eszet>"))
49 (eszet-count 0)
50 (copyright (map '(vector (unsigned-byte 8)) 'char-code "<copyright>"))
51 (copyright-count 0)
52 (registered (map '(vector (unsigned-byte 8)) 'char-code "<registered>"))
53 (registered-count 0))
54 ((null inbyte)
55 (unless (= eszet-count ,eszets)
56 (error "Unexpected number of eszets in ~A: ~D (expected ~D)"
57 filename eszet-count ,eszets))
58 (unless (= copyright-count ,copyrights)
59 (error "Unexpected number of copyright symbols in ~A: ~D (expected ~D)"
60 filename copyright-count ,copyrights))
61 (unless (= registered-count ,registereds)
62 (error "Unexpected number of registered symbols in ~A: ~D (expected ~D)"
63 filename registered-count ,registereds)))
64 (cond
65 ((= inbyte #xc3)
66 (let ((second (read-byte ,in nil nil)))
67 (cond
68 ((null second)
69 (error "No continuation after #xc3 in ~A" filename))
70 ((= second #x9f) (incf eszet-count) (write-sequence eszet ,out))
71 (t (error "Unexpected continuation after #xc3 in ~A: #x~X"
72 filename second)))))
73 ((= inbyte #xc2)
74 (let ((second (read-byte ,in nil nil)))
75 (cond
76 ((null second)
77 (error "No continuation after #xc2 in ~A" filename))
78 ((= second #xa9) (incf copyright-count) (write-sequence copyright ,out))
79 ((= second #xae) (incf registered-count) (write-sequence registered ,out)))))
80 ((>= inbyte #x7f)
81 (error "Unexpected octet in ~A: #x~X" filename inbyte))
82 (t (write-byte inbyte ,out))))))
83 (with-open-file (,s (make-pathname :name ,name :type "txt"
84 :defaults *output-directory*))
85 (multiple-value-prog1
86 (progn
87 ,@body)
88 (unless (eql (peek-char nil ,s nil nil) nil)
89 (error "Unread data in data file: ~S" ,name)))))))
91 (defmacro with-input-arbitrary-utf8-file ((s name) &body body)
92 ;; KLUDGE: the emoji data includes literal emoji; likewise the
93 ;; confusables. We just remove all high-bit stuff, on the
94 ;; assumption that the actual data is still ASCII.
95 (let ((in (gensym "IN"))
96 (out (gensym "OUT")))
97 `(progn
98 (with-open-file (,in (make-pathname :name ,name :type "txt"
99 :defaults *unicode-character-database*)
100 :element-type '(unsigned-byte 8))
101 (setf (gethash (format nil "tools-for-build/~A.txt" ,name) *ucd-inputs*) 'used)
102 (with-open-file (,out (make-pathname :name ,name :type "txt"
103 :defaults *output-directory*)
104 :element-type '(unsigned-byte 8)
105 :direction :output
106 :if-exists :supersede
107 :if-does-not-exist :create)
108 (setf (gethash (format nil "output/ucd/~A.txt" ,name) *ucd-outputs*) 'made)
109 (do ((inbyte (read-byte ,in nil nil) (read-byte ,in nil nil)))
110 ((null inbyte) nil)
111 (cond
112 ((>= inbyte #x7f) nil)
113 (t (write-byte inbyte ,out))))))
114 (with-open-file (,s (make-pathname :name ,name :type "txt"
115 :defaults *output-directory*))
116 (multiple-value-prog1
117 (progn
118 ,@body)
119 (unless (eql (peek-char nil ,s nil nil) nil)
120 (error "Unread data in data file: ~S" ,name)))))))
122 (defmacro with-output-dat-file ((s name) &body body)
123 `(with-open-file (,s (make-pathname :name ,name :type "dat"
124 :defaults *output-directory*)
125 :direction :output :element-type '(unsigned-byte 8)
126 :if-exists :supersede :if-does-not-exist :create)
127 (setf (gethash (format nil "output/ucd/~A.dat" ,name) *ucd-outputs*) 'made)
128 ,@body))
130 (defmacro with-ucd-output-syntax (&body body)
131 `(with-standard-io-syntax
132 (let ((*readtable* (copy-readtable))
133 (*print-readably* nil)
134 (*print-pretty* t))
135 ,@body)))
137 (defmacro with-output-lisp-expr-file ((s name) &body body)
138 `(with-open-file (,s (make-pathname :name ,name :type "lisp-expr"
139 :defaults *output-directory*)
140 :direction :output :element-type 'character
141 :if-exists :supersede :if-does-not-exist :create)
142 (setf (gethash (format nil "output/ucd/~A.lisp-expr" ,name) *ucd-outputs*) 'made)
143 (with-ucd-output-syntax
144 ,@body)))
146 (defun split-string (line character)
147 (loop for prev-position = 0 then (1+ position)
148 for position = (position character line :start prev-position)
149 collect (subseq line prev-position position)
150 do (unless position
151 (loop-finish))))
153 (defun parse-codepoints (string &key (singleton-list t))
154 "Gets a list of codepoints out of 'aaaa bbbb cccc', stripping surrounding space"
155 (let ((list (mapcar
156 (lambda (s) (parse-integer s :radix 16))
157 (remove "" (split-string string #\Space) :test #'string=))))
158 (if (not (or (cdr list) singleton-list)) (car list) list)))
161 (defun parse-codepoint-range (string)
162 "Parse the Unicode syntax DDDD|DDDD..DDDD into an inclusive range (start end)"
163 (destructuring-bind (start &optional empty end) (split-string string #\.)
164 (declare (ignore empty))
165 (let* ((head (parse-integer start :radix 16))
166 (tail (if end
167 (parse-integer end :radix 16 :end (position #\Space end))
168 head)))
169 (list head tail))))
171 (defvar *slurped-random-constants*
172 (with-open-file (f (make-pathname :name "more-ucd-consts" :type "lisp-expr"
173 :defaults *unicode-character-database*))
174 (setf (gethash "tools-for-build/more-ucd-consts.lisp-expr" *ucd-inputs*) 'used)
175 (read f)))
177 (defun init-indices (symbol &aux (strings
178 (or (cadr (assoc symbol *slurped-random-constants*))
179 (error "Missing entry in more-ucd-consts for ~S"
180 symbol))))
181 (let ((hash (make-hash-table :test #'equal)))
182 (loop for string in strings
183 for index from 0
184 do (setf (gethash string hash) index))
185 hash))
187 (defun index-or-lose (key table kind)
188 (or (gethash key table)
189 (error "unknown ~A: ~S" kind key)))
191 (defun clear-flag (bit integer)
192 (logandc2 integer (ash 1 bit)))
195 ;;; Output storage globals
196 (defstruct ucd misc decomp)
198 (defparameter *unicode-names* nil)
199 (defparameter *unicode-1-names* nil)
201 (defparameter *decompositions*
202 (make-array 10000 :element-type '(unsigned-byte 24) :fill-pointer 0
203 :adjustable t)) ; 10000 is not a significant number
205 (defparameter *decomposition-corrections*
206 (with-input-utf8-file (s "NormalizationCorrections")
207 (loop with result = nil
208 for line = (read-line s nil nil) while line
209 do (when (position #\; line)
210 (destructuring-bind (cp old-decomp correction version)
211 (split-string line #\;)
212 (declare (ignore old-decomp version))
213 (push (cons (parse-integer cp :radix 16)
214 (parse-integer correction :radix 16))
215 result)))
216 finally (return result)))
217 "List of decompsotions that were amended in Unicode corrigenda")
219 (defparameter *compositions* (make-hash-table :test #'equal))
220 (defparameter *composition-exclusions*
221 (with-input-utf8-file (s "CompositionExclusions")
222 (loop with result = nil
223 for line = (read-line s nil nil) while line
224 when (and (> (length line) 0) (char/= (char line 0) #\#))
225 do (push (parse-integer line :end (position #\Space line) :radix 16)
226 result) finally (return result)))
227 "Characters that are excluded from composition according to UAX#15")
229 (defparameter *different-titlecases* nil)
230 (defparameter *different-casefolds* nil)
232 (defparameter *case-mapping*
233 (with-input-utf8-file (s "SpecialCasing")
234 (loop with hash = (make-hash-table)
235 for line = (read-line s nil nil) while line
236 unless (or (not (position #\# line)) (= 0 (position #\# line)))
237 do (destructuring-bind (%cp %lower %title %upper &optional context comment)
238 (split-string line #\;)
239 (unless (and context comment)
240 (let ((cp (parse-integer %cp :radix 16))
241 (lower (parse-codepoints %lower :singleton-list nil))
242 (title (parse-codepoints %title :singleton-list nil))
243 (upper (parse-codepoints %upper :singleton-list nil)))
244 (setf (gethash cp hash) (cons upper lower))
245 (unless (equal title upper) (push (cons cp title) *different-titlecases*)))))
246 finally (return hash)))
247 "Maps cp -> (cons uppercase|(uppercase ...) lowercase|(lowercase ...))")
249 (defparameter *misc-table* (make-array 3000 :fill-pointer 0)
250 "Holds the entries in the Unicode database's miscellanious array, stored as lists.
251 These lists have the form (gc-index bidi-index ccc digit decomposition-info
252 flags script line-break age). Flags is a bit-bashed integer containing
253 cl-both-case-p, has-case-p, and bidi-mirrored-p, and an east asian width.
254 Length should be adjusted when the standard changes.")
255 (defparameter *misc-hash* (make-hash-table :test #'equal)
256 "Maps a misc list to its position in the misc table.")
258 (defparameter *different-numerics* nil)
260 (defparameter *ucd-entries* (make-hash-table))
262 (defparameter *general-categories* (init-indices '*general-categories*))
263 (defparameter *bidi-classes* (init-indices '*bidi-classes*))
264 (defparameter *east-asian-widths* (init-indices '*east-asian-widths*))
265 (defparameter *scripts* (init-indices '*scripts*))
266 (defparameter *line-break-classes* (init-indices '*line-break-classes*))
268 (defparameter *east-asian-width-table*
269 (with-input-utf8-file (s "EastAsianWidth")
270 (loop with hash = (make-hash-table)
271 for line = (read-line s nil nil) while line
272 unless (or (not (position #\# line)) (= 0 (position #\# line)))
273 do (destructuring-bind (codepoints value)
274 (split-string
275 (string-right-trim " " (subseq line 0 (position #\# line))) #\;)
276 (let ((range (parse-codepoint-range codepoints))
277 (index (index-or-lose value *east-asian-widths* "East Asian width")))
278 (loop for i from (car range) to (cadr range)
279 do (setf (gethash i hash) index))))
280 finally (return hash)))
281 "Table of East Asian Widths. Used in the creation of misc entries.")
283 (defparameter *script-table*
284 (with-input-utf8-file (s "Scripts")
285 (loop with hash = (make-hash-table)
286 for line = (read-line s nil nil) while line
287 unless (or (not (position #\# line)) (= 0 (position #\# line)))
288 do (destructuring-bind (codepoints value)
289 (split-string
290 (string-right-trim " " (subseq line 0 (position #\# line))) #\;)
291 (let ((range (parse-codepoint-range codepoints))
292 (index (index-or-lose (subseq value 1) *scripts* "script")))
293 (loop for i from (car range) to (cadr range)
294 do (setf (gethash i hash) index))))
295 finally (return hash)))
296 "Table of scripts. Used in the creation of misc entries.")
298 (defparameter *line-break-class-table*
299 (with-input-utf8-file (s "LineBreak")
300 (loop with hash = (make-hash-table)
301 for line = (read-line s nil nil) while line
302 unless (or (not (position #\# line)) (= 0 (position #\# line)))
303 do (destructuring-bind (codepoints value)
304 (split-string
305 (string-right-trim " " (subseq line 0 (position #\# line))) #\;)
306 (let* ((range (parse-codepoint-range codepoints))
307 ;; Hangul syllables are marked as "Unknown", and programmatically
308 ;; handled in SB-UNICODE:LINE-BREAK-CLASS
309 (value
310 (or (and (member value '("JL" "JV" "JT" "H2" "H3") :test 'string=) "XX")
311 value))
312 (index (index-or-lose value *line-break-classes* "line break")))
313 (loop for i from (car range) to (cadr range)
314 do (setf (gethash i hash) index))))
315 finally (return hash)))
316 "Table of line break classes. Used in the creation of misc entries.")
318 (defparameter *age-table*
319 (with-input-utf8-file (s "DerivedAge")
320 (loop with hash = (make-hash-table)
321 for line = (read-line s nil nil) while line
322 unless (or (not (position #\# line)) (= 0 (position #\# line)))
323 do (destructuring-bind (codepoints value)
324 (split-string
325 (string-right-trim " " (subseq line 0 (position #\# line))) #\;)
326 (let* ((range (parse-codepoint-range codepoints))
327 (age-parts (mapcar #'parse-integer (split-string value #\.)))
328 (age (logior (ash (car age-parts) 3) (cadr age-parts))))
329 (loop for i from (car range) to (cadr range)
330 do (setf (gethash i hash) age))))
331 finally (return hash)))
332 "Table of character ages. Used in the creation of misc entries.")
334 (defvar *block-first* nil)
337 ;;; Unicode data file parsing
338 (defun hash-misc (gc-index bidi-index ccc digit decomposition-info flags
339 script line-break age)
340 (let* ((list (list gc-index bidi-index ccc digit decomposition-info flags
341 script line-break age))
342 (index (gethash list *misc-hash*)))
343 (or index
344 (progn
345 (setf (gethash list *misc-hash*)
346 (fill-pointer *misc-table*))
347 (when (eql nil (vector-push list *misc-table*))
348 (error "Misc table too small."))
349 (gethash list *misc-hash*)))))
351 (defun ordered-ranges-member (item vector)
352 (labels ((recurse (start end)
353 (when (< start end)
354 (let* ((i (+ start (truncate (- end start) 2)))
355 (index (* 2 i))
356 (elt1 (svref vector index))
357 (elt2 (svref vector (1+ index))))
358 (cond ((< item elt1)
359 (recurse start i))
360 ((> item elt2)
361 (recurse (+ 1 i) end))
363 item))))))
364 (recurse 0 (/ (length vector) 2))))
366 (defun unallocated-bidi-class (code-point)
367 ;; See tests/data/DerivedBidiClass.txt for more information
368 (flet ((in (vector class)
369 (when (ordered-ranges-member code-point vector)
370 (gethash class *bidi-classes*))))
371 (cond
372 ((in #(#x0600 #x07BF
373 #x0860 #x08FF
374 #xFB50 #xFDCF
375 #xFDF0 #xFDFF
376 #xFE70 #xFEFF
377 #x10D00 #x10D3F
378 #x10EC0 #x10EFF
379 #x10F30 #x10F6F
380 #x1EC70 #x1ECBF
381 #x1ED00 #x1ED4F
382 #x1EE00 #x1EEFF)
383 "AL"))
384 ((in #(#x0590 #x05FF
385 #x07C0 #x085F
386 #xFB1D #xFB4F
387 #x10800 #x10CFF
388 #x10D40 #x10EBF
389 #x10F00 #x10F2F
390 #x10F70 #x10FFF
391 #x1E800 #x1EC6F
392 #x1ECC0 #x1ECFF
393 #x1ED50 #x1EDFF
394 #x1EF00 #x1EFFF)
395 "R"))
396 ((in #(#x20A0 #x20CF) "ET"))
397 ;; BN is non-characters and default-ignorable.
398 ;; Default-ignorable will be dealt with elsewhere
399 ((in #(#xFDD0 #xFDEF #xFFFE #xFFFF #x1FFFE #x1FFFF #x2FFFE #x2FFFF
400 #x3FFFE #x3FFFF #x4FFFE #x4FFFF #x5FFFE #x5FFFF #x6FFFE #x6FFFF
401 #x7FFFE #x7FFFF #x8FFFE #x8FFFF #x9FFFE #x9FFFF #xAFFFE #xAFFFF
402 #xBFFFE #xBFFFF #xCFFFE #xCFFFF #xDFFFE #xDFFFF #xEFFFE #xEFFFF
403 #xFFFFE #xFFFFF #x10FFFE #x10FFFF)
404 "BN"))
405 ((in #(#x0 #x10FFFF) "L"))
406 (t (error "Somehow we've gone too far in unallocated bidi determination")))))
408 (defun complete-misc-table ()
409 (loop for code-point from 0 to #x10FFFF do ; Flood-fill unallocated codepoints
410 (unless (second (multiple-value-list (gethash code-point *ucd-entries*)))
411 (let* ((unallocated-misc
412 ;; unallocated characters have a GC of "Cn", aren't digits
413 ;; (digit = 128), have a bidi that depends on their block, and
414 ;; don't decompose, combine, or have case. They have an East
415 ;; Asian Width (eaw) of "N" (0), and a script, line breaking
416 ;; class, and age of 0 ("Unknown"), unless some of those
417 ;; properties are otherwise assigned.
418 `(,(gethash "Cn" *general-categories*)
419 ,(unallocated-bidi-class code-point) 0 128 0
420 ,(gethash code-point *east-asian-width-table* 0)
421 0 ,(gethash code-point *line-break-class-table* 0)
422 ,(gethash code-point *age-table* 0)))
423 (unallocated-index (apply #'hash-misc unallocated-misc))
424 (unallocated-ucd (make-ucd :misc unallocated-index)))
425 (setf (gethash code-point *ucd-entries*) unallocated-ucd)))))
427 (defun expand-decomposition (decomposition)
428 (loop for cp in decomposition
429 for ucd = (gethash cp *ucd-entries*)
430 for length = (elt (aref *misc-table* (ucd-misc ucd)) 4)
431 if (and (not (logbitp 7 length))
432 (plusp length))
433 append (expand-decomposition (ucd-decomp ucd))
434 else
435 collect cp))
437 ;;; Recursively expand canonical decompositions
438 (defun fixup-decompositions ()
439 (loop for did-something = nil
441 (loop for ucd being each hash-value of *ucd-entries*
442 when (and (ucd-decomp ucd)
443 (not (logbitp 7 (elt (aref *misc-table* (ucd-misc ucd)) 4))))
445 (let ((expanded (expand-decomposition (ucd-decomp ucd))))
446 (unless (equal expanded (ucd-decomp ucd))
447 (setf (ucd-decomp ucd) expanded
448 did-something t))))
449 while did-something)
450 (loop for i below (hash-table-count *ucd-entries*)
451 for ucd = (gethash i *ucd-entries*)
452 for decomp = (ucd-decomp ucd)
454 (setf (ucd-decomp ucd)
455 (cond ((not (consp decomp)) 0)
456 ((logbitp 7 (elt (aref *misc-table* (ucd-misc ucd)) 4))
457 (prog1 (length *decompositions*)
458 (loop for cp in decomp
459 do (vector-push-extend cp *decompositions*))))
461 (let ((misc-entry (copy-list (aref *misc-table* (ucd-misc ucd)))))
462 (setf (elt misc-entry 4) (length decomp)
463 (ucd-misc ucd) (apply #'hash-misc misc-entry))
464 (prog1 (length *decompositions*)
465 (loop for cp in decomp
466 do (vector-push-extend cp *decompositions*)))))))))
468 (defun fixup-compositions ()
469 (flet ((fixup (k v)
470 (declare (ignore v))
471 (let* ((cp (car k))
472 (ucd (gethash cp *ucd-entries*))
473 (misc (aref *misc-table* (ucd-misc ucd)))
474 (ccc (third misc)))
475 ;; we can do everything in the first pass except for
476 ;; accounting for decompositions where the first
477 ;; character of the decomposition is not a starter.
478 (when (/= ccc 0)
479 (remhash k *compositions*)))))
480 (maphash #'fixup *compositions*)))
482 (defun add-jamo-information (line table)
483 (let* ((split (split-string line #\;))
484 (code (parse-integer (first split) :radix 16))
485 (syllable (string-trim
487 (subseq (second split) 0 (position #\# (second split))))))
488 (setf (gethash code table) syllable)))
490 (defun fixup-hangul-syllables ()
491 ;; "Hangul Syllable Composition, Unicode 5.1 section 3-12"
492 (let* ((sbase #xac00)
493 (lbase #x1100)
494 (vbase #x1161)
495 (tbase #x11a7)
496 (scount 11172)
497 (lcount 19)
498 (vcount 21)
499 (tcount 28)
500 (ncount (* vcount tcount))
501 (table (make-hash-table)))
502 (declare (ignore lcount))
503 (with-input-utf8-file (*standard-input* "Jamo")
504 (loop for line = (read-line nil nil)
505 while line
506 if (position #\; line)
507 do (add-jamo-information line table)))
508 (dotimes (sindex scount)
509 (let* ((l (+ lbase (floor sindex ncount)))
510 (v (+ vbase (floor (mod sindex ncount) tcount)))
511 (tee (+ tbase (mod sindex tcount)))
512 (code-point (+ sbase sindex))
513 (name (format nil "HANGUL_SYLLABLE_~A~A~:[~A~;~]"
514 (gethash l table) (gethash v table)
515 (= tee tbase) (gethash tee table))))
516 (push `(,code-point . ,name) *unicode-names*)))))
518 (defun normalize-character-name (name)
519 (when (find #\_ name)
520 (error "Bad name for a character: ~A" name))
521 ;; U+1F5CF (PAGE)'s name conflicts with the ANSI CL-assigned
522 ;; name for form feed (^L, U+000C). To avoid a case where
523 ;; more than one character has a particular name while remaining
524 ;; standards-compliant, we remove U+1F5CF's name here.
525 (when (string= name "PAGE")
526 (return-from normalize-character-name "UNICODE_PAGE"))
527 (unless (or (zerop (length name)) (find #\< name) (find #\> name))
528 (substitute #\_ #\Space name)))
530 ;;; 3400 -- 4DB5 : cjk ideograph extension a ;Lo;0;L;;;;;N;;;;;
531 ;;; AC00 -- D7A3 : hangul syllables ;Lo;0;L;;;;;N;;;;;
532 ;;; D800 -- F8FF : surrogates and private use
533 ;;; 20000 -- 2A6D6 : cjk ideograph extension b ;Lo;0;L;;;;;N;;;;;
534 ;;; F0000 -- FFFFD : private use
535 ;;; 100000 -- 10FFFD: private use
536 (defun encode-ucd-line (line code-point)
537 (destructuring-bind (name general-category canonical-combining-class
538 bidi-class decomposition-type-and-mapping
539 decimal-digit digit numeric bidi-mirrored
540 unicode-1-name iso-10646-comment simple-uppercase
541 simple-lowercase simple-titlecase)
542 line
543 (declare (ignore iso-10646-comment))
544 (if (and (> (length name) 8)
545 (string= ", First>" name :start2 (- (length name) 8)))
546 (progn
547 (setf *block-first* code-point)
548 nil)
549 (let* ((gc-index
550 (index-or-lose general-category *general-categories* "general category"))
551 (bidi-index (index-or-lose bidi-class *bidi-classes* "bidirectional class"))
552 (ccc (parse-integer canonical-combining-class))
553 (digit-index (if (string= "" digit) 128 ; non-digits have high bit
554 (let ((%digit (parse-integer digit)))
555 (if (string= digit decimal-digit)
556 ;; decimal-digit-p is in bit 6
557 (logior (ash 1 6) %digit) %digit))))
558 (upper-index (unless (string= "" simple-uppercase)
559 (parse-integer simple-uppercase :radix 16)))
560 (lower-index (unless (string= "" simple-lowercase)
561 (parse-integer simple-lowercase :radix 16)))
562 (title-index (unless (string= "" simple-titlecase)
563 (parse-integer simple-titlecase :radix 16)))
564 (cl-both-case-p (or (and (= gc-index 0) lower-index)
565 (and (= gc-index 1) upper-index)))
566 (bidi-mirrored-p (string= bidi-mirrored "Y"))
567 (decomposition-info 0)
568 (eaw-index (gethash code-point *east-asian-width-table*))
569 (script-index (gethash code-point *script-table* 0))
570 (line-break-index (gethash code-point *line-break-class-table* 0))
571 (age-index (gethash code-point *age-table* 0))
572 decomposition)
573 #+nil
574 (when (and (not cl-both-case-p)
575 (< gc-index 2))
576 (format t "~A~%" name))
578 (when (string/= "" decomposition-type-and-mapping)
579 (let* ((compatibility-p (position #\> decomposition-type-and-mapping)))
580 (setf decomposition
581 (parse-codepoints
582 (subseq decomposition-type-and-mapping
583 (if compatibility-p (1+ compatibility-p) 0))))
584 (when (assoc code-point *decomposition-corrections*)
585 (setf decomposition
586 (list (cdr (assoc code-point *decomposition-corrections*)))))
587 (setf decomposition-info
588 (logior (length decomposition) (if compatibility-p 128 0)))
589 (unless compatibility-p
590 ;; Primary composition excludes:
591 ;; * singleton decompositions;
592 ;; * decompositions of non-starters;
593 ;; * script-specific decompositions;
594 ;; * later-version decompositions;
595 ;; * decompositions whose first character is a
596 ;; non-starter.
597 ;; All but the last case can be handled here;
598 ;; for the fixup, see FIXUP-COMPOSITIONS
599 (when (and (> decomposition-info 1)
600 (= ccc 0)
601 (not (member code-point *composition-exclusions*)))
602 (unless (= decomposition-info 2)
603 (error "canonical decomposition unexpectedly long"))
604 (setf (gethash (cons (first decomposition)
605 (second decomposition))
606 *compositions*)
607 code-point)))))
608 ;; Hangul decomposition; see Unicode 6.2 section 3-12
609 (when (= code-point #xd7a3)
610 ;; KLUDGE: The decomposition-length for Hangul syllables in the
611 ;; misc database will be a bit of a lie. It doesn't really matter
612 ;; since the only purpose of the length is to index into the
613 ;; decompositions array (which Hangul decomposition doesn't use).
614 ;; The decomposition index is 0 because we won't be going into the
615 ;; array
616 (setf decomposition-info 3))
618 (unless (gethash code-point *case-mapping*) ; Exclude codepoints from SpecialCasing
619 (when (string/= simple-uppercase simple-titlecase)
620 (push (cons code-point title-index) *different-titlecases*))
621 (and (or upper-index lower-index)
622 (setf (gethash code-point *case-mapping*)
623 (cons
624 (or upper-index code-point)
625 (or lower-index code-point)))))
627 (when (string/= digit numeric)
628 (push (cons code-point numeric) *different-numerics*))
630 (when (> ccc 255)
631 (error "canonical combining class too large ~A" ccc))
632 (let* ((flags (logior
633 (if cl-both-case-p (ash 1 7) 0)
634 (if (gethash code-point *case-mapping*) (ash 1 6) 0)
635 (if bidi-mirrored-p (ash 1 5) 0)
636 eaw-index))
637 (misc-index (hash-misc gc-index bidi-index ccc digit-index
638 decomposition-info flags script-index
639 line-break-index age-index))
640 (result (make-ucd :misc misc-index
641 :decomp decomposition)))
642 (when (and (> (length name) 7)
643 (string= ", Last>" name :start2 (- (length name) 7)))
644 ;; We can still do this despite East Asian Width being in the
645 ;; databasce since each of the UCD <First><Last> blocks
646 ;; has a consistent East Asian Width
647 (loop for point from *block-first* to code-point do
648 (setf (gethash point *ucd-entries*) result)))
649 (values result (normalize-character-name name)
650 (normalize-character-name unicode-1-name)))))))
652 (defun slurp-ucd-line (line line-number)
653 (declare (ignorable line-number))
654 (let* ((split-line (split-string line #\;))
655 (code-point (parse-integer (first split-line) :radix 16)))
656 (multiple-value-bind (encoding name unicode-1-name)
657 (encode-ucd-line (cdr split-line) code-point)
658 ;; Dup code-point exist - maybe to do with composition/decomposition? no idea -
659 ;; but we need to ensure that there are no dups in the name tables.
660 ;; Since those tables are not hash-tables any more, we have to inefficiently
661 ;; scan to assert not-existsp. But since scanning an alist is not optimal,
662 ;; don't enable this warning unless debugging.
663 #+nil
664 (when (gethash code-point *ucd-entries*)
665 (assert (not (assoc code-point *unicode-names*)))
666 (assert (not (assoc code-point *unicode-1-names*)))
667 (warn "line ~D: dup for code-point ~x, already have ~S"
668 line-number code-point (gethash code-point *ucd-entries*)))
669 (setf (gethash code-point *ucd-entries*) encoding)
670 (when name
671 (push `(,code-point . ,name) *unicode-names*))
672 (when unicode-1-name
673 (push `(,code-point . ,unicode-1-name) *unicode-1-names*)))))
675 ;;; this fixes up the case conversion discrepancy between CL and
676 ;;; Unicode: CL operators depend on char-downcase / char-upcase being
677 ;;; inverses, which is not true in general in Unicode even for
678 ;;; characters which change case to single characters.
679 ;;; Also, fix misassigned age values, which are not constant across blocks
680 (defun second-pass ()
681 (let ((case-mapping
682 (sort (loop for code-point being the hash-keys in *case-mapping*
683 using (hash-value value)
684 collect (cons code-point value))
685 #'< :key #'car)))
686 (loop for (code-point upper . lower) in case-mapping
687 for misc-index = (ucd-misc (gethash code-point *ucd-entries*))
688 for (gc bidi ccc digit decomp flags script lb age) = (aref *misc-table* misc-index)
689 when (logbitp 7 flags) do
690 (when (or (not (atom upper)) (not (atom lower))
691 (and (= gc 0)
692 (not (equal (car (gethash lower *case-mapping*)) code-point)))
693 (and (= gc 1)
694 (not (equal (cdr (gethash upper *case-mapping*)) code-point))))
695 (let* ((new-flags (clear-flag 7 flags))
696 (new-misc (hash-misc gc bidi ccc digit decomp new-flags script lb age)))
697 (setf (ucd-misc (gethash code-point *ucd-entries*)) new-misc))))))
699 (defun fixup-casefolding ()
700 (with-input-utf8-file (s "CaseFolding" :eszets 1)
701 (loop for line = (read-line s nil nil)
702 while line
703 unless (or (not (position #\; line)) (equal (position #\# line) 0))
704 do (destructuring-bind (original type mapping comment)
705 (split-string line #\;)
706 (declare (ignore comment))
707 (let ((cp (parse-integer original :radix 16))
708 (fold (parse-codepoints mapping :singleton-list nil)))
709 (unless (or (string= type " S") (string= type " T"))
710 (when (not (equal (cdr (gethash cp *case-mapping*)) fold))
711 (push (cons cp fold) *different-casefolds*))))))))
713 (defun fixup-ages ()
714 (let ((age-data (sort
715 (loop for code-point being the hash-keys in *age-table*
716 using (hash-value true-age)
717 collect (cons code-point true-age))
718 #'< :key #'car)))
719 (loop for (code-point . true-age) in age-data
720 for misc-index = (ucd-misc (gethash code-point *ucd-entries*))
721 for (gc bidi ccc digit decomp flags script lb age) = (aref *misc-table* misc-index)
722 unless (= age true-age) do
723 (let* ((new-misc (hash-misc gc bidi ccc digit decomp flags script lb true-age))
724 (new-ucd (make-ucd
725 :misc new-misc
726 :decomp (ucd-decomp (gethash code-point *ucd-entries*)))))
727 (setf (gethash code-point *ucd-entries*) new-ucd)))))
729 (defun slurp-ucd ()
730 (with-input-txt-file (*standard-input* "UnicodeData")
731 (when *load-verbose*
732 (format t "~%//slurp-ucd~%"))
733 (loop for line = (read-line nil nil)
734 for number from 1
735 while line
736 do (slurp-ucd-line line number)))
737 (second-pass)
738 (fixup-compositions)
739 (fixup-hangul-syllables)
740 (complete-misc-table)
741 (fixup-casefolding)
742 (fixup-ages)
743 (fixup-decompositions)
744 nil)
747 ;;; PropList.txt
748 (defparameter **proplist-properties** nil
749 "A list of properties extracted from PropList.txt")
751 (defun parse-property (stream &optional name)
752 (let ((result (make-array 1 :fill-pointer 0 :adjustable t)))
753 (loop for line = (read-line stream nil nil)
754 for entry = (subseq line 0 (position #\# line))
755 ;; Deal with Blah=Blah in DerivedNormalizationProps.txt
756 while (and line (not (position #\= (substitute #\Space #\= line :count 1))))
757 when (and entry (string/= entry ""))
759 (destructuring-bind (start end)
760 (parse-codepoint-range (car (split-string entry #\;)))
761 (vector-push-extend start result)
762 (vector-push-extend end result)))
763 (when name
764 (push name **proplist-properties**)
765 (push result **proplist-properties**))))
767 (defun slurp-proplist ()
768 (with-input-utf8-file (s "PropList")
769 (parse-property s) ;; Initial comments
770 (parse-property s :white-space)
771 (parse-property s :bidi-control)
772 (parse-property s :join-control)
773 (parse-property s :dash)
774 (parse-property s :hyphen)
775 (parse-property s :quotation-mark)
776 (parse-property s :terminal-punctuation)
777 (parse-property s :other-math)
778 (parse-property s :hex-digit)
779 (parse-property s :ascii-hex-digit)
780 (parse-property s :other-alphabetic)
781 (parse-property s :ideographic)
782 (parse-property s :diacritic)
783 (parse-property s :extender)
784 (parse-property s :other-lowercase)
785 (parse-property s :other-uppercase)
786 (parse-property s :noncharacter-code-point)
787 (parse-property s :other-grapheme-extend)
788 (parse-property s :ids-binary-operator)
789 (parse-property s :ids-trinary-operator)
790 (parse-property s :radical)
791 (parse-property s :unified-ideograph)
792 (parse-property s :other-default-ignorable-code-point)
793 (parse-property s :deprecated)
794 (parse-property s :soft-dotted)
795 (parse-property s :logical-order-exception)
796 (parse-property s :other-id-start)
797 (parse-property s :other-id-continue)
798 (parse-property s :sentence-terminal)
799 (parse-property s :variation-selector)
800 (parse-property s :pattern-white-space)
801 (parse-property s :pattern-syntax)
802 (parse-property s :prepended-concatenation-mark)
803 (parse-property s :regional-indicator))
805 (with-input-utf8-file (s "DerivedNormalizationProps")
806 (parse-property s) ;; Initial comments
807 (parse-property s) ;; FC_NFKC_Closure
808 (parse-property s) ;; FC_NFKC_Closure
809 (parse-property s) ;; Full_Composition_Exclusion
810 (parse-property s) ;; NFD_QC Comments
811 (parse-property s :nfd-qc)
812 (parse-property s) ;; NFC_QC Comments
813 (parse-property s :nfc-qc)
814 (parse-property s :nfc-qc-maybe)
815 (parse-property s) ;; NFKD_QC Comments
816 (parse-property s :nfkd-qc)
817 (parse-property s) ;; NFKC_QC Comments
818 (parse-property s :nfkc-qc)
819 (parse-property s :nfkc-qc-maybe)
820 (parse-property s) ;; Expands_On_NFD
821 (parse-property s) ;; Expands_On_NFC
822 (parse-property s) ;; Expands_On_NFKD
823 (parse-property s) ;; Expands_On_NFKC
824 (parse-property s) ;; NFKC_CF
825 (parse-property s)) ;; Changes_When_NFKC_Casefolded
827 (with-input-arbitrary-utf8-file (s "emoji-data")
828 (parse-property s) ;; Initial comments
829 (parse-property s :emoji)
830 (parse-property s :emoji-presentation)
831 (parse-property s :emoji-modifier)
832 (parse-property s :emoji-modifier-base)
833 (parse-property s :emoji-component)
834 (parse-property s :extended-pictographic))
836 (setf **proplist-properties** (nreverse **proplist-properties**))
837 (values))
840 ;;; Collation keys
841 (defvar *maximum-variable-key* 1)
843 (defun bitpack-collation-key (primary secondary tertiary)
844 ;; 0 <= primary <= #xFFFD (default table)
845 ;; 0 <= secondary <= #x10C [9 bits]
846 ;; 0 <= tertiary <= #x1E (#x1F allowed) [5 bits]
847 ;; Because of this, the bit packs don't overlap
848 (logior (ash primary 16) (ash secondary 5) tertiary))
850 (defun parse-collation-line (line)
851 (destructuring-bind (%code-points %keys) (split-string line #\;)
852 (let* ((code-points (parse-codepoints %code-points))
853 (%keys (subseq %keys 0 (search " #" %keys)))
854 (keys
855 (remove
857 (split-string (remove #\[ (remove #\Space %keys)) #\]) :test #'string=))
858 (ret
859 (loop for key in keys
860 for variable-p = (position #\* key)
861 for parsed =
862 ;; Don't need first value, it's always just ""
863 (cdr (mapcar (lambda (x) (parse-integer x :radix 16 :junk-allowed t))
864 (split-string (substitute #\. #\* key) #\.)))
865 collect
866 (destructuring-bind (primary secondary tertiary) parsed
867 (when variable-p (setf *maximum-variable-key*
868 (max primary *maximum-variable-key*)))
869 (bitpack-collation-key primary secondary tertiary)))))
870 (values code-points ret))))
873 ;;; Other properties
874 (defparameter *confusables*
875 (with-input-arbitrary-utf8-file (stream "confusables")
876 (read-line stream)
877 (loop for line = (read-line stream nil nil)
878 while line
879 when (and (not (equal line ""))
880 (char/= (char line 0) #\#))
881 collect
882 (flet ((parse (x)
883 (mapcar (lambda (x)
884 (parse-integer x :radix 16))
885 (split-string x #\Space))))
886 (let* ((semicolon (position #\; line))
887 (semicolon2 (position #\; line :start (1+ semicolon)))
888 (from (parse (subseq line 0 (1- semicolon))))
889 (to (parse (subseq line (+ semicolon 2) (1- semicolon2)))))
890 (assert (= (length from) 1))
891 (list* (car from) to)))))
892 "List of confusable codepoint sets")
894 (defparameter *bidi-mirroring-glyphs*
895 (with-input-utf8-file (s "BidiMirroring")
896 (loop for line = (read-line s nil nil) while line
897 when (and (plusp (length line))
898 (char/= (char line 0) #\#))
899 collect
900 (mapcar
901 #'(lambda (c) (parse-codepoints c :singleton-list nil))
902 (split-string (subseq line 0 (position #\# line)) #\;))))
903 "List of BIDI mirroring glyph pairs")
905 (defparameter *blocks*
906 (let (ranges names)
907 (with-input-utf8-file (stream "Blocks")
908 (loop
909 (let ((line (read-line stream nil nil)))
910 (cond ((not line) (return))
911 ((or (string= line "") (position #\# line))) ; ignore
912 (t (let* ((split (split-string line #\;))
913 (range (parse-codepoint-range (car split))))
914 (setq ranges (list* (cadr range) (car range) ranges))
915 (push (nsubstitute #\- #\Space
916 (string-left-trim " " (cadr split)))
917 names)))))))
918 (cons (nreverse (coerce ranges 'vector)) (nreverse names)))
919 "Vector of block starts and ends in a form acceptable to `ordered-ranges-position`.
920 Used to look up block data.")
922 ;;; Output code
923 (defun write-codepoint (code-point stream)
924 (declare (type (unsigned-byte 32) code-point))
925 (write-byte (ldb (byte 8 16) code-point) stream)
926 (write-byte (ldb (byte 8 8) code-point) stream)
927 (write-byte (ldb (byte 8 0) code-point) stream))
929 (defun output-misc-data ()
930 (with-output-dat-file (stream "ucdmisc")
931 (loop for (gc-index bidi-index ccc digit decomposition-info flags
932 script line-break age)
933 across *misc-table*
934 ;; three bits spare here
935 do (write-byte gc-index stream)
936 ;; three bits spare here
937 (write-byte bidi-index stream)
938 (write-byte ccc stream)
939 ;; bits 0-3 encode [0,9], bit 7 is for non-digit status,
940 ;; bit 6 is the decimal-digit flag. Two bits spare
941 (write-byte digit stream)
942 (write-byte decomposition-info stream)
943 (write-byte flags stream) ; includes EAW in bits 0-3, bit 4 is free
944 (write-byte script stream)
945 (write-byte line-break stream)
946 (write-byte age stream))))
948 (defun output-ucd-data ()
949 (with-output-dat-file (high-pages "ucdhigh")
950 (with-output-dat-file (low-pages "ucdlow")
951 ;; Output either the index into the misc array (if all the points in the
952 ;; high-page have the same misc value) or an index into the law-pages
953 ;; array / 256. For indexes into the misc array, set bit 15 (high bit).
954 ;; We should never have that many misc entries, so that's not a problem.
956 ;; If Unicode ever allocates an all-decomposing <First>/<Last> block (the
957 ;; only way to get a high page that outputs as the same and has a
958 ;; non-zero decomposition-index, which there's nowhere to store now),
959 ;; find me, slap me with a fish, and have fun fixing this mess.
960 (loop with low-pages-index = 0
961 for high-page from 0 to (ash #x10FFFF -8)
962 for uniq-ucd-entries = nil do
963 (loop for low-page from 0 to #xFF do
964 (pushnew
965 (gethash (logior low-page (ash high-page 8)) *ucd-entries*)
966 uniq-ucd-entries :test #'equalp))
967 (flet ((write-2-byte (int stream)
968 (declare (type (unsigned-byte 16) int))
969 (write-byte (ldb (byte 8 8) int) stream)
970 (write-byte (ldb (byte 8 0) int) stream)))
971 (case (length uniq-ucd-entries)
972 (0 (error "Somehow, a high page has no codepoints in it."))
973 (1 (write-2-byte (logior
974 (ash 1 15)
975 (ucd-misc (car uniq-ucd-entries)))
976 high-pages))
977 (t (loop for low-page from 0 to #xFF
978 for cp = (logior low-page (ash high-page 8))
979 for entry = (gethash cp *ucd-entries*) do
980 (write-2-byte (ucd-misc entry) low-pages)
981 (write-2-byte (ucd-decomp entry) low-pages)
982 finally (write-2-byte low-pages-index high-pages)
983 (incf low-pages-index)))))
984 finally (assert (< low-pages-index (ash 1 15)))
985 (when *load-print*
986 (print low-pages-index))))))
988 (defun output-decomposition-data ()
989 (with-output-dat-file (stream "decomp")
990 (loop for cp across *decompositions* do
991 (write-codepoint cp stream)))
992 (when *load-print*
993 (print (length *decompositions*))))
995 (defun output-composition-data ()
996 (with-output-lisp-expr-file (stream "comp")
997 (let (comp)
998 (maphash (lambda (k v) (push (cons k v) comp)) *compositions*)
999 (format stream "#(~%")
1000 (loop for (k . v) in (sort comp #'< :key #'cdr)
1001 do (format stream "(~D . ~D) ; #x~X + #x~X~%"
1002 (logior (ash (car k) 21) (cdr k)) v (car k) (cdr k)))
1003 (format stream ")~%"))))
1005 (defun output-case-data ()
1006 (let (casing-pages points-with-case)
1007 (with-output-dat-file (stream "case")
1008 (loop for cp being the hash-keys in *case-mapping*
1009 do (push cp points-with-case))
1010 (setf points-with-case (sort points-with-case #'<))
1011 (loop for cp in points-with-case
1012 for (upper . lower) = (gethash cp *case-mapping*) do
1013 (pushnew (ash cp -6) casing-pages)
1014 (write-codepoint cp stream)
1015 (write-byte (if (atom upper) 0 (length upper)) stream)
1016 (if (atom upper) (write-codepoint upper stream)
1017 (map 'nil (lambda (c) (write-codepoint c stream)) upper))
1018 (write-byte (if (atom lower) 0 (length lower)) stream)
1019 (if (atom lower) (write-codepoint lower stream)
1020 (map 'nil (lambda (c) (write-codepoint c stream)) lower))))
1021 (setf casing-pages (sort casing-pages #'<))
1022 (assert (< (length casing-pages) 256))
1023 (let* ((size (1+ (reduce #'max casing-pages)))
1024 (array (make-array size :initial-element 255))
1025 (page -1))
1026 (dolist (entry casing-pages)
1027 (setf (aref array entry) (incf page)))
1028 (with-output-dat-file (stream "casepages")
1029 (dotimes (i size)
1030 (write-byte (aref array i) stream))))
1031 (with-output-lisp-expr-file (stream "casepages")
1032 (print casing-pages stream))))
1034 (defun output-collation-data ()
1035 (with-output-lisp-expr-file (output "collation")
1036 (format output ";;;; This is 'allkeys.txt' converted to Lisp syntax~%")
1037 (format output ";;;; Assume *READ-BASE* is 16.~%#(~%")
1038 (with-input-txt-file (input "allkeys")
1039 (loop for line = (read-line input nil nil) while line
1040 unless (or (string= line "")
1041 (eql 0 (position #\@ line)) (eql 0 (position #\# line)))
1043 (multiple-value-bind (codepoints keys) (parse-collation-line line)
1044 (format output "(~x . ~x)~%"
1045 ;; Pack up to 3 codepoints, rightmost in the highest bits
1046 (destructuring-bind (first &optional second third) codepoints
1047 (cond (third (logior (ash third 42) (ash second 21) first))
1048 (second (logior (ash second 21) first))
1049 (t first)))
1050 ;; Pack the blob of "keys" using 32 bits each
1051 (let ((sum 0))
1052 ;; reversal here is magic. Don't worry about getting it wrong-
1053 ;; unicode-collation.pure checks that.
1054 (dolist (part (reverse keys) sum)
1055 (setq sum (logior (ash sum 32) part))))))))
1056 (format output ")~%"))
1057 (with-output-lisp-expr-file (*standard-output* "other-collation-info")
1058 (write-string ";;; The highest primary variable collation index")
1059 (terpri)
1060 (prin1 *maximum-variable-key*) (terpri)))
1062 (defun output (&optional (*output-directory* *output-directory*))
1063 (ensure-directories-exist *output-directory*)
1064 (output-misc-data)
1065 (output-ucd-data)
1066 (output-decomposition-data)
1067 (output-composition-data)
1068 (output-case-data)
1069 (output-collation-data)
1070 (with-output-lisp-expr-file (*standard-output* "misc-properties")
1071 (prin1 **proplist-properties**))
1073 (loop for (filename . data) in '(("ucd-names" . *unicode-names*)
1074 ("ucd1-names" . *unicode-1-names*))
1076 (with-output-lisp-expr-file (f filename)
1077 (format f ";;; Do not edit by hand: generated by ucd.lisp~%")
1078 (loop for (code . name) in (nreverse (symbol-value data))
1079 do (format f "#x~X ~S~%" code name))))
1080 (setf *unicode-names* nil)
1081 (setf *unicode-1-names* nil)
1083 (with-output-lisp-expr-file (*standard-output* "numerics")
1084 (prin1 (mapcar (lambda (x) (cons (car x) (read-from-string (cdr x))))
1085 ;; Print it low to high. It was collected with PUSH
1086 (reverse *different-numerics*))))
1087 (with-output-lisp-expr-file (*standard-output* "titlecases")
1088 (prin1 *different-titlecases*))
1089 (with-output-lisp-expr-file (*standard-output* "foldcases")
1090 (prin1 *different-casefolds*))
1091 (with-output-lisp-expr-file (*standard-output* "confusables")
1092 (prin1 *confusables*))
1093 (with-output-lisp-expr-file (*standard-output* "bidi-mirrors")
1094 (prin1 *bidi-mirroring-glyphs*))
1095 (with-output-lisp-expr-file (*standard-output* "block-ranges")
1096 (prin1 (car *blocks*)))
1097 (with-output-lisp-expr-file (*standard-output* "block-names")
1098 (format t "#(~{:~A~^~% ~})" (cdr *blocks*)))
1099 (values))