3 (declaim (optimize debug
))
7 (defvar *output-directory
*
9 (make-pathname :directory
'(:relative
:up
"output" "ucd"))
10 (make-pathname :directory
(pathname-directory *load-truename
*))))
12 (defparameter *unicode-character-database
*
13 (make-pathname :directory
(pathname-directory *load-pathname
*)))
15 (defmacro with-input-txt-file
((s name
) &body body
)
16 `(with-open-file (,s
(make-pathname :name
,name
:type
"txt"
17 :defaults
*unicode-character-database
*))
18 (setf (gethash (format nil
"tools-for-build/~A.txt" ,name
) *ucd-inputs
*) 'used
)
21 (defmacro with-input-utf8-file
22 ((s name
&key
(eszets 0) (registereds 1) (copyrights 1)) &body body
)
23 ;; KLUDGE: Unicode data files in general have registered and
24 ;; copyright marks (non-ASCII characters) in the header;
25 ;; additionally, CaseFolding.txt as distributed by Unicode contains
26 ;; a non-ASCII character, an eszet, within a comment to act as an
27 ;; example. We can't in general assume that our host lisp will let
28 ;; us read those, and we can't portably write that we don't care
29 ;; about the text content of anything on a line after a hash because
30 ;; text decoding happens at a lower level. So here we rewrite data
31 ;; files to exclude the UTF-8 sequences corresponding to those
32 ;; characters (and error if we see any other UTF-8 sequence).
33 (let ((in (gensym "IN"))
35 `(let ((filename (format nil
"~A.txt" ,name
)))
36 (with-open-file (,in
(make-pathname :name
,name
:type
"txt"
37 :defaults
*unicode-character-database
*)
38 :element-type
'(unsigned-byte 8))
39 (setf (gethash (format nil
"tools-for-build/~A.txt" ,name
) *ucd-inputs
*) 'used
)
40 (with-open-file (,out
(make-pathname :name
,name
:type
"txt"
41 :defaults
*output-directory
*)
42 :element-type
'(unsigned-byte 8)
45 :if-does-not-exist
:create
)
46 (setf (gethash (format nil
"output/ucd/~A.txt" ,name
) *ucd-outputs
*) 'made
)
47 (do ((inbyte (read-byte ,in nil nil
) (read-byte ,in nil nil
))
48 (eszet (map '(vector (unsigned-byte 8)) 'char-code
"<eszet>"))
50 (copyright (map '(vector (unsigned-byte 8)) 'char-code
"<copyright>"))
52 (registered (map '(vector (unsigned-byte 8)) 'char-code
"<registered>"))
55 (unless (= eszet-count
,eszets
)
56 (error "Unexpected number of eszets in ~A: ~D (expected ~D)"
57 filename eszet-count
,eszets
))
58 (unless (= copyright-count
,copyrights
)
59 (error "Unexpected number of copyright symbols in ~A: ~D (expected ~D)"
60 filename copyright-count
,copyrights
))
61 (unless (= registered-count
,registereds
)
62 (error "Unexpected number of registered symbols in ~A: ~D (expected ~D)"
63 filename registered-count
,registereds
)))
66 (let ((second (read-byte ,in nil nil
)))
69 (error "No continuation after #xc3 in ~A" filename
))
70 ((= second
#x9f
) (incf eszet-count
) (write-sequence eszet
,out
))
71 (t (error "Unexpected continuation after #xc3 in ~A: #x~X"
74 (let ((second (read-byte ,in nil nil
)))
77 (error "No continuation after #xc2 in ~A" filename
))
78 ((= second
#xa9
) (incf copyright-count
) (write-sequence copyright
,out
))
79 ((= second
#xae
) (incf registered-count
) (write-sequence registered
,out
)))))
81 (error "Unexpected octet in ~A: #x~X" filename inbyte
))
82 (t (write-byte inbyte
,out
))))))
83 (with-open-file (,s
(make-pathname :name
,name
:type
"txt"
84 :defaults
*output-directory
*))
88 (unless (eql (peek-char nil
,s nil nil
) nil
)
89 (error "Unread data in data file: ~S" ,name
)))))))
91 (defmacro with-input-arbitrary-utf8-file
((s name
) &body body
)
92 ;; KLUDGE: the emoji data includes literal emoji; likewise the
93 ;; confusables. We just remove all high-bit stuff, on the
94 ;; assumption that the actual data is still ASCII.
95 (let ((in (gensym "IN"))
98 (with-open-file (,in
(make-pathname :name
,name
:type
"txt"
99 :defaults
*unicode-character-database
*)
100 :element-type
'(unsigned-byte 8))
101 (setf (gethash (format nil
"tools-for-build/~A.txt" ,name
) *ucd-inputs
*) 'used
)
102 (with-open-file (,out
(make-pathname :name
,name
:type
"txt"
103 :defaults
*output-directory
*)
104 :element-type
'(unsigned-byte 8)
106 :if-exists
:supersede
107 :if-does-not-exist
:create
)
108 (setf (gethash (format nil
"output/ucd/~A.txt" ,name
) *ucd-outputs
*) 'made
)
109 (do ((inbyte (read-byte ,in nil nil
) (read-byte ,in nil nil
)))
112 ((>= inbyte
#x7f
) nil
)
113 (t (write-byte inbyte
,out
))))))
114 (with-open-file (,s
(make-pathname :name
,name
:type
"txt"
115 :defaults
*output-directory
*))
116 (multiple-value-prog1
119 (unless (eql (peek-char nil
,s nil nil
) nil
)
120 (error "Unread data in data file: ~S" ,name
)))))))
122 (defmacro with-output-dat-file
((s name
) &body body
)
123 `(with-open-file (,s
(make-pathname :name
,name
:type
"dat"
124 :defaults
*output-directory
*)
125 :direction
:output
:element-type
'(unsigned-byte 8)
126 :if-exists
:supersede
:if-does-not-exist
:create
)
127 (setf (gethash (format nil
"output/ucd/~A.dat" ,name
) *ucd-outputs
*) 'made
)
130 (defmacro with-ucd-output-syntax
(&body body
)
131 `(with-standard-io-syntax
132 (let ((*readtable
* (copy-readtable))
133 (*print-readably
* nil
)
137 (defmacro with-output-lisp-expr-file
((s name
) &body body
)
138 `(with-open-file (,s
(make-pathname :name
,name
:type
"lisp-expr"
139 :defaults
*output-directory
*)
140 :direction
:output
:element-type
'character
141 :if-exists
:supersede
:if-does-not-exist
:create
)
142 (setf (gethash (format nil
"output/ucd/~A.lisp-expr" ,name
) *ucd-outputs
*) 'made
)
143 (with-ucd-output-syntax
146 (defun split-string (line character
)
147 (loop for prev-position
= 0 then
(1+ position
)
148 for position
= (position character line
:start prev-position
)
149 collect
(subseq line prev-position position
)
153 (defun parse-codepoints (string &key
(singleton-list t
))
154 "Gets a list of codepoints out of 'aaaa bbbb cccc', stripping surrounding space"
156 (lambda (s) (parse-integer s
:radix
16))
157 (remove "" (split-string string
#\Space
) :test
#'string
=))))
158 (if (not (or (cdr list
) singleton-list
)) (car list
) list
)))
161 (defun parse-codepoint-range (string)
162 "Parse the Unicode syntax DDDD|DDDD..DDDD into an inclusive range (start end)"
163 (destructuring-bind (start &optional empty end
) (split-string string
#\.
)
164 (declare (ignore empty
))
165 (let* ((head (parse-integer start
:radix
16))
167 (parse-integer end
:radix
16 :end
(position #\Space end
))
171 (defvar *slurped-random-constants
*
172 (with-open-file (f (make-pathname :name
"more-ucd-consts" :type
"lisp-expr"
173 :defaults
*unicode-character-database
*))
174 (setf (gethash "tools-for-build/more-ucd-consts.lisp-expr" *ucd-inputs
*) 'used
)
177 (defun init-indices (symbol &aux
(strings
178 (or (cadr (assoc symbol
*slurped-random-constants
*))
179 (error "Missing entry in more-ucd-consts for ~S"
181 (let ((hash (make-hash-table :test
#'equal
)))
182 (loop for string in strings
184 do
(setf (gethash string hash
) index
))
187 (defun index-or-lose (key table kind
)
188 (or (gethash key table
)
189 (error "unknown ~A: ~S" kind key
)))
191 (defun clear-flag (bit integer
)
192 (logandc2 integer
(ash 1 bit
)))
195 ;;; Output storage globals
196 (defstruct ucd misc decomp
)
198 (defparameter *unicode-names
* nil
)
199 (defparameter *unicode-1-names
* nil
)
201 (defparameter *decompositions
*
202 (make-array 10000 :element-type
'(unsigned-byte 24) :fill-pointer
0
203 :adjustable t
)) ; 10000 is not a significant number
205 (defparameter *decomposition-corrections
*
206 (with-input-utf8-file (s "NormalizationCorrections")
207 (loop with result
= nil
208 for line
= (read-line s nil nil
) while line
209 do
(when (position #\
; line)
210 (destructuring-bind (cp old-decomp correction version
)
211 (split-string line
#\
;)
212 (declare (ignore old-decomp version
))
213 (push (cons (parse-integer cp
:radix
16)
214 (parse-integer correction
:radix
16))
216 finally
(return result
)))
217 "List of decompsotions that were amended in Unicode corrigenda")
219 (defparameter *compositions
* (make-hash-table :test
#'equal
))
220 (defparameter *composition-exclusions
*
221 (with-input-utf8-file (s "CompositionExclusions")
222 (loop with result
= nil
223 for line
= (read-line s nil nil
) while line
224 when
(and (> (length line
) 0) (char/= (char line
0) #\
#))
225 do
(push (parse-integer line
:end
(position #\Space line
) :radix
16)
226 result
) finally
(return result
)))
227 "Characters that are excluded from composition according to UAX#15")
229 (defparameter *different-titlecases
* nil
)
230 (defparameter *different-casefolds
* nil
)
232 (defparameter *case-mapping
*
233 (with-input-utf8-file (s "SpecialCasing")
234 (loop with hash
= (make-hash-table)
235 for line
= (read-line s nil nil
) while line
236 unless
(or (not (position #\
# line
)) (= 0 (position #\
# line
)))
237 do
(destructuring-bind (%cp %lower %title %upper
&optional context comment
)
238 (split-string line
#\
;)
239 (unless (and context comment
)
240 (let ((cp (parse-integer %cp
:radix
16))
241 (lower (parse-codepoints %lower
:singleton-list nil
))
242 (title (parse-codepoints %title
:singleton-list nil
))
243 (upper (parse-codepoints %upper
:singleton-list nil
)))
244 (setf (gethash cp hash
) (cons upper lower
))
245 (unless (equal title upper
) (push (cons cp title
) *different-titlecases
*)))))
246 finally
(return hash
)))
247 "Maps cp -> (cons uppercase|(uppercase ...) lowercase|(lowercase ...))")
249 (defparameter *misc-table
* (make-array 3000 :fill-pointer
0)
250 "Holds the entries in the Unicode database's miscellanious array, stored as lists.
251 These lists have the form (gc-index bidi-index ccc digit decomposition-info
252 flags script line-break age). Flags is a bit-bashed integer containing
253 cl-both-case-p, has-case-p, and bidi-mirrored-p, and an east asian width.
254 Length should be adjusted when the standard changes.")
255 (defparameter *misc-hash
* (make-hash-table :test
#'equal
)
256 "Maps a misc list to its position in the misc table.")
258 (defparameter *different-numerics
* nil
)
260 (defparameter *ucd-entries
* (make-hash-table))
262 (defparameter *general-categories
* (init-indices '*general-categories
*))
263 (defparameter *bidi-classes
* (init-indices '*bidi-classes
*))
264 (defparameter *east-asian-widths
* (init-indices '*east-asian-widths
*))
265 (defparameter *scripts
* (init-indices '*scripts
*))
266 (defparameter *line-break-classes
* (init-indices '*line-break-classes
*))
268 (defparameter *east-asian-width-table
*
269 (with-input-utf8-file (s "EastAsianWidth")
270 (loop with hash
= (make-hash-table)
271 for line
= (read-line s nil nil
) while line
272 unless
(or (not (position #\
# line
)) (= 0 (position #\
# line
)))
273 do
(destructuring-bind (codepoints value
)
275 (string-right-trim " " (subseq line
0 (position #\
# line
))) #\
;)
276 (let ((range (parse-codepoint-range codepoints
))
277 (index (index-or-lose value
*east-asian-widths
* "East Asian width")))
278 (loop for i from
(car range
) to
(cadr range
)
279 do
(setf (gethash i hash
) index
))))
280 finally
(return hash
)))
281 "Table of East Asian Widths. Used in the creation of misc entries.")
283 (defparameter *script-table
*
284 (with-input-utf8-file (s "Scripts")
285 (loop with hash
= (make-hash-table)
286 for line
= (read-line s nil nil
) while line
287 unless
(or (not (position #\
# line
)) (= 0 (position #\
# line
)))
288 do
(destructuring-bind (codepoints value
)
290 (string-right-trim " " (subseq line
0 (position #\
# line
))) #\
;)
291 (let ((range (parse-codepoint-range codepoints
))
292 (index (index-or-lose (subseq value
1) *scripts
* "script")))
293 (loop for i from
(car range
) to
(cadr range
)
294 do
(setf (gethash i hash
) index
))))
295 finally
(return hash
)))
296 "Table of scripts. Used in the creation of misc entries.")
298 (defparameter *line-break-class-table
*
299 (with-input-utf8-file (s "LineBreak")
300 (loop with hash
= (make-hash-table)
301 for line
= (read-line s nil nil
) while line
302 unless
(or (not (position #\
# line
)) (= 0 (position #\
# line
)))
303 do
(destructuring-bind (codepoints value
)
305 (string-right-trim " " (subseq line
0 (position #\
# line
))) #\
;)
306 (let* ((range (parse-codepoint-range codepoints
))
307 ;; Hangul syllables are marked as "Unknown", and programmatically
308 ;; handled in SB-UNICODE:LINE-BREAK-CLASS
310 (or (and (member value
'("JL" "JV" "JT" "H2" "H3") :test
'string
=) "XX")
312 (index (index-or-lose value
*line-break-classes
* "line break")))
313 (loop for i from
(car range
) to
(cadr range
)
314 do
(setf (gethash i hash
) index
))))
315 finally
(return hash
)))
316 "Table of line break classes. Used in the creation of misc entries.")
318 (defparameter *age-table
*
319 (with-input-utf8-file (s "DerivedAge")
320 (loop with hash
= (make-hash-table)
321 for line
= (read-line s nil nil
) while line
322 unless
(or (not (position #\
# line
)) (= 0 (position #\
# line
)))
323 do
(destructuring-bind (codepoints value
)
325 (string-right-trim " " (subseq line
0 (position #\
# line
))) #\
;)
326 (let* ((range (parse-codepoint-range codepoints
))
327 (age-parts (mapcar #'parse-integer
(split-string value
#\.
)))
328 (age (logior (ash (car age-parts
) 3) (cadr age-parts
))))
329 (loop for i from
(car range
) to
(cadr range
)
330 do
(setf (gethash i hash
) age
))))
331 finally
(return hash
)))
332 "Table of character ages. Used in the creation of misc entries.")
334 (defvar *block-first
* nil
)
337 ;;; Unicode data file parsing
338 (defun hash-misc (gc-index bidi-index ccc digit decomposition-info flags
339 script line-break age
)
340 (let* ((list (list gc-index bidi-index ccc digit decomposition-info flags
341 script line-break age
))
342 (index (gethash list
*misc-hash
*)))
345 (setf (gethash list
*misc-hash
*)
346 (fill-pointer *misc-table
*))
347 (when (eql nil
(vector-push list
*misc-table
*))
348 (error "Misc table too small."))
349 (gethash list
*misc-hash
*)))))
351 (defun ordered-ranges-member (item vector
)
352 (labels ((recurse (start end
)
354 (let* ((i (+ start
(truncate (- end start
) 2)))
356 (elt1 (svref vector index
))
357 (elt2 (svref vector
(1+ index
))))
361 (recurse (+ 1 i
) end
))
364 (recurse 0 (/ (length vector
) 2))))
366 (defun unallocated-bidi-class (code-point)
367 ;; See tests/data/DerivedBidiClass.txt for more information
368 (flet ((in (vector class
)
369 (when (ordered-ranges-member code-point vector
)
370 (gethash class
*bidi-classes
*))))
396 ((in #(#x20A0
#x20CF
) "ET"))
397 ;; BN is non-characters and default-ignorable.
398 ;; Default-ignorable will be dealt with elsewhere
399 ((in #(#xFDD0
#xFDEF
#xFFFE
#xFFFF
#x1FFFE
#x1FFFF
#x2FFFE
#x2FFFF
400 #x3FFFE
#x3FFFF
#x4FFFE
#x4FFFF
#x5FFFE
#x5FFFF
#x6FFFE
#x6FFFF
401 #x7FFFE
#x7FFFF
#x8FFFE
#x8FFFF
#x9FFFE
#x9FFFF
#xAFFFE
#xAFFFF
402 #xBFFFE
#xBFFFF
#xCFFFE
#xCFFFF
#xDFFFE
#xDFFFF
#xEFFFE
#xEFFFF
403 #xFFFFE
#xFFFFF
#x10FFFE
#x10FFFF
)
405 ((in #(#x0
#x10FFFF
) "L"))
406 (t (error "Somehow we've gone too far in unallocated bidi determination")))))
408 (defun complete-misc-table ()
409 (loop for code-point from
0 to
#x10FFFF do
; Flood-fill unallocated codepoints
410 (unless (second (multiple-value-list (gethash code-point
*ucd-entries
*)))
411 (let* ((unallocated-misc
412 ;; unallocated characters have a GC of "Cn", aren't digits
413 ;; (digit = 128), have a bidi that depends on their block, and
414 ;; don't decompose, combine, or have case. They have an East
415 ;; Asian Width (eaw) of "N" (0), and a script, line breaking
416 ;; class, and age of 0 ("Unknown"), unless some of those
417 ;; properties are otherwise assigned.
418 `(,(gethash "Cn" *general-categories
*)
419 ,(unallocated-bidi-class code-point
) 0 128 0
420 ,(gethash code-point
*east-asian-width-table
* 0)
421 0 ,(gethash code-point
*line-break-class-table
* 0)
422 ,(gethash code-point
*age-table
* 0)))
423 (unallocated-index (apply #'hash-misc unallocated-misc
))
424 (unallocated-ucd (make-ucd :misc unallocated-index
)))
425 (setf (gethash code-point
*ucd-entries
*) unallocated-ucd
)))))
427 (defun expand-decomposition (decomposition)
428 (loop for cp in decomposition
429 for ucd
= (gethash cp
*ucd-entries
*)
430 for length
= (elt (aref *misc-table
* (ucd-misc ucd
)) 4)
431 if
(and (not (logbitp 7 length
))
433 append
(expand-decomposition (ucd-decomp ucd
))
437 ;;; Recursively expand canonical decompositions
438 (defun fixup-decompositions ()
439 (loop for did-something
= nil
441 (loop for ucd being each hash-value of
*ucd-entries
*
442 when
(and (ucd-decomp ucd
)
443 (not (logbitp 7 (elt (aref *misc-table
* (ucd-misc ucd
)) 4))))
445 (let ((expanded (expand-decomposition (ucd-decomp ucd
))))
446 (unless (equal expanded
(ucd-decomp ucd
))
447 (setf (ucd-decomp ucd
) expanded
450 (loop for i below
(hash-table-count *ucd-entries
*)
451 for ucd
= (gethash i
*ucd-entries
*)
452 for decomp
= (ucd-decomp ucd
)
454 (setf (ucd-decomp ucd
)
455 (cond ((not (consp decomp
)) 0)
456 ((logbitp 7 (elt (aref *misc-table
* (ucd-misc ucd
)) 4))
457 (prog1 (length *decompositions
*)
458 (loop for cp in decomp
459 do
(vector-push-extend cp
*decompositions
*))))
461 (let ((misc-entry (copy-list (aref *misc-table
* (ucd-misc ucd
)))))
462 (setf (elt misc-entry
4) (length decomp
)
463 (ucd-misc ucd
) (apply #'hash-misc misc-entry
))
464 (prog1 (length *decompositions
*)
465 (loop for cp in decomp
466 do
(vector-push-extend cp
*decompositions
*)))))))))
468 (defun fixup-compositions ()
472 (ucd (gethash cp
*ucd-entries
*))
473 (misc (aref *misc-table
* (ucd-misc ucd
)))
475 ;; we can do everything in the first pass except for
476 ;; accounting for decompositions where the first
477 ;; character of the decomposition is not a starter.
479 (remhash k
*compositions
*)))))
480 (maphash #'fixup
*compositions
*)))
482 (defun add-jamo-information (line table
)
483 (let* ((split (split-string line
#\
;))
484 (code (parse-integer (first split
) :radix
16))
485 (syllable (string-trim
487 (subseq (second split
) 0 (position #\
# (second split
))))))
488 (setf (gethash code table
) syllable
)))
490 (defun fixup-hangul-syllables ()
491 ;; "Hangul Syllable Composition, Unicode 5.1 section 3-12"
492 (let* ((sbase #xac00
)
500 (ncount (* vcount tcount
))
501 (table (make-hash-table)))
502 (declare (ignore lcount
))
503 (with-input-utf8-file (*standard-input
* "Jamo")
504 (loop for line
= (read-line nil nil
)
506 if
(position #\
; line)
507 do
(add-jamo-information line table
)))
508 (dotimes (sindex scount
)
509 (let* ((l (+ lbase
(floor sindex ncount
)))
510 (v (+ vbase
(floor (mod sindex ncount
) tcount
)))
511 (tee (+ tbase
(mod sindex tcount
)))
512 (code-point (+ sbase sindex
))
513 (name (format nil
"HANGUL_SYLLABLE_~A~A~:[~A~;~]"
514 (gethash l table
) (gethash v table
)
515 (= tee tbase
) (gethash tee table
))))
516 (push `(,code-point .
,name
) *unicode-names
*)))))
518 (defun normalize-character-name (name)
519 (when (find #\_ name
)
520 (error "Bad name for a character: ~A" name
))
521 ;; U+1F5CF (PAGE)'s name conflicts with the ANSI CL-assigned
522 ;; name for form feed (^L, U+000C). To avoid a case where
523 ;; more than one character has a particular name while remaining
524 ;; standards-compliant, we remove U+1F5CF's name here.
525 (when (string= name
"PAGE")
526 (return-from normalize-character-name
"UNICODE_PAGE"))
527 (unless (or (zerop (length name
)) (find #\
< name
) (find #\
> name
))
528 (substitute #\_
#\Space name
)))
530 ;;; 3400 -- 4DB5 : cjk ideograph extension a ;Lo;0;L;;;;;N;;;;;
531 ;;; AC00 -- D7A3 : hangul syllables ;Lo;0;L;;;;;N;;;;;
532 ;;; D800 -- F8FF : surrogates and private use
533 ;;; 20000 -- 2A6D6 : cjk ideograph extension b ;Lo;0;L;;;;;N;;;;;
534 ;;; F0000 -- FFFFD : private use
535 ;;; 100000 -- 10FFFD: private use
536 (defun encode-ucd-line (line code-point
)
537 (destructuring-bind (name general-category canonical-combining-class
538 bidi-class decomposition-type-and-mapping
539 decimal-digit digit numeric bidi-mirrored
540 unicode-1-name iso-10646-comment simple-uppercase
541 simple-lowercase simple-titlecase
)
543 (declare (ignore iso-10646-comment
))
544 (if (and (> (length name
) 8)
545 (string= ", First>" name
:start2
(- (length name
) 8)))
547 (setf *block-first
* code-point
)
550 (index-or-lose general-category
*general-categories
* "general category"))
551 (bidi-index (index-or-lose bidi-class
*bidi-classes
* "bidirectional class"))
552 (ccc (parse-integer canonical-combining-class
))
553 (digit-index (if (string= "" digit
) 128 ; non-digits have high bit
554 (let ((%digit
(parse-integer digit
)))
555 (if (string= digit decimal-digit
)
556 ;; decimal-digit-p is in bit 6
557 (logior (ash 1 6) %digit
) %digit
))))
558 (upper-index (unless (string= "" simple-uppercase
)
559 (parse-integer simple-uppercase
:radix
16)))
560 (lower-index (unless (string= "" simple-lowercase
)
561 (parse-integer simple-lowercase
:radix
16)))
562 (title-index (unless (string= "" simple-titlecase
)
563 (parse-integer simple-titlecase
:radix
16)))
564 (cl-both-case-p (or (and (= gc-index
0) lower-index
)
565 (and (= gc-index
1) upper-index
)))
566 (bidi-mirrored-p (string= bidi-mirrored
"Y"))
567 (decomposition-info 0)
568 (eaw-index (gethash code-point
*east-asian-width-table
*))
569 (script-index (gethash code-point
*script-table
* 0))
570 (line-break-index (gethash code-point
*line-break-class-table
* 0))
571 (age-index (gethash code-point
*age-table
* 0))
574 (when (and (not cl-both-case-p
)
576 (format t
"~A~%" name
))
578 (when (string/= "" decomposition-type-and-mapping
)
579 (let* ((compatibility-p (position #\
> decomposition-type-and-mapping
)))
582 (subseq decomposition-type-and-mapping
583 (if compatibility-p
(1+ compatibility-p
) 0))))
584 (when (assoc code-point
*decomposition-corrections
*)
586 (list (cdr (assoc code-point
*decomposition-corrections
*)))))
587 (setf decomposition-info
588 (logior (length decomposition
) (if compatibility-p
128 0)))
589 (unless compatibility-p
590 ;; Primary composition excludes:
591 ;; * singleton decompositions;
592 ;; * decompositions of non-starters;
593 ;; * script-specific decompositions;
594 ;; * later-version decompositions;
595 ;; * decompositions whose first character is a
597 ;; All but the last case can be handled here;
598 ;; for the fixup, see FIXUP-COMPOSITIONS
599 (when (and (> decomposition-info
1)
601 (not (member code-point
*composition-exclusions
*)))
602 (unless (= decomposition-info
2)
603 (error "canonical decomposition unexpectedly long"))
604 (setf (gethash (cons (first decomposition
)
605 (second decomposition
))
608 ;; Hangul decomposition; see Unicode 6.2 section 3-12
609 (when (= code-point
#xd7a3
)
610 ;; KLUDGE: The decomposition-length for Hangul syllables in the
611 ;; misc database will be a bit of a lie. It doesn't really matter
612 ;; since the only purpose of the length is to index into the
613 ;; decompositions array (which Hangul decomposition doesn't use).
614 ;; The decomposition index is 0 because we won't be going into the
616 (setf decomposition-info
3))
618 (unless (gethash code-point
*case-mapping
*) ; Exclude codepoints from SpecialCasing
619 (when (string/= simple-uppercase simple-titlecase
)
620 (push (cons code-point title-index
) *different-titlecases
*))
621 (and (or upper-index lower-index
)
622 (setf (gethash code-point
*case-mapping
*)
624 (or upper-index code-point
)
625 (or lower-index code-point
)))))
627 (when (string/= digit numeric
)
628 (push (cons code-point numeric
) *different-numerics
*))
631 (error "canonical combining class too large ~A" ccc
))
632 (let* ((flags (logior
633 (if cl-both-case-p
(ash 1 7) 0)
634 (if (gethash code-point
*case-mapping
*) (ash 1 6) 0)
635 (if bidi-mirrored-p
(ash 1 5) 0)
637 (misc-index (hash-misc gc-index bidi-index ccc digit-index
638 decomposition-info flags script-index
639 line-break-index age-index
))
640 (result (make-ucd :misc misc-index
641 :decomp decomposition
)))
642 (when (and (> (length name
) 7)
643 (string= ", Last>" name
:start2
(- (length name
) 7)))
644 ;; We can still do this despite East Asian Width being in the
645 ;; databasce since each of the UCD <First><Last> blocks
646 ;; has a consistent East Asian Width
647 (loop for point from
*block-first
* to code-point do
648 (setf (gethash point
*ucd-entries
*) result
)))
649 (values result
(normalize-character-name name
)
650 (normalize-character-name unicode-1-name
)))))))
652 (defun slurp-ucd-line (line line-number
)
653 (declare (ignorable line-number
))
654 (let* ((split-line (split-string line
#\
;))
655 (code-point (parse-integer (first split-line
) :radix
16)))
656 (multiple-value-bind (encoding name unicode-1-name
)
657 (encode-ucd-line (cdr split-line
) code-point
)
658 ;; Dup code-point exist - maybe to do with composition/decomposition? no idea -
659 ;; but we need to ensure that there are no dups in the name tables.
660 ;; Since those tables are not hash-tables any more, we have to inefficiently
661 ;; scan to assert not-existsp. But since scanning an alist is not optimal,
662 ;; don't enable this warning unless debugging.
664 (when (gethash code-point
*ucd-entries
*)
665 (assert (not (assoc code-point
*unicode-names
*)))
666 (assert (not (assoc code-point
*unicode-1-names
*)))
667 (warn "line ~D: dup for code-point ~x, already have ~S"
668 line-number code-point
(gethash code-point
*ucd-entries
*)))
669 (setf (gethash code-point
*ucd-entries
*) encoding
)
671 (push `(,code-point .
,name
) *unicode-names
*))
673 (push `(,code-point .
,unicode-1-name
) *unicode-1-names
*)))))
675 ;;; this fixes up the case conversion discrepancy between CL and
676 ;;; Unicode: CL operators depend on char-downcase / char-upcase being
677 ;;; inverses, which is not true in general in Unicode even for
678 ;;; characters which change case to single characters.
679 ;;; Also, fix misassigned age values, which are not constant across blocks
680 (defun second-pass ()
682 (sort (loop for code-point being the hash-keys in
*case-mapping
*
683 using
(hash-value value
)
684 collect
(cons code-point value
))
686 (loop for
(code-point upper . lower
) in case-mapping
687 for misc-index
= (ucd-misc (gethash code-point
*ucd-entries
*))
688 for
(gc bidi ccc digit decomp flags script lb age
) = (aref *misc-table
* misc-index
)
689 when
(logbitp 7 flags
) do
690 (when (or (not (atom upper
)) (not (atom lower
))
692 (not (equal (car (gethash lower
*case-mapping
*)) code-point
)))
694 (not (equal (cdr (gethash upper
*case-mapping
*)) code-point
))))
695 (let* ((new-flags (clear-flag 7 flags
))
696 (new-misc (hash-misc gc bidi ccc digit decomp new-flags script lb age
)))
697 (setf (ucd-misc (gethash code-point
*ucd-entries
*)) new-misc
))))))
699 (defun fixup-casefolding ()
700 (with-input-utf8-file (s "CaseFolding" :eszets
1)
701 (loop for line
= (read-line s nil nil
)
703 unless
(or (not (position #\
; line)) (equal (position #\# line) 0))
704 do
(destructuring-bind (original type mapping comment
)
705 (split-string line
#\
;)
706 (declare (ignore comment
))
707 (let ((cp (parse-integer original
:radix
16))
708 (fold (parse-codepoints mapping
:singleton-list nil
)))
709 (unless (or (string= type
" S") (string= type
" T"))
710 (when (not (equal (cdr (gethash cp
*case-mapping
*)) fold
))
711 (push (cons cp fold
) *different-casefolds
*))))))))
714 (let ((age-data (sort
715 (loop for code-point being the hash-keys in
*age-table
*
716 using
(hash-value true-age
)
717 collect
(cons code-point true-age
))
719 (loop for
(code-point . true-age
) in age-data
720 for misc-index
= (ucd-misc (gethash code-point
*ucd-entries
*))
721 for
(gc bidi ccc digit decomp flags script lb age
) = (aref *misc-table
* misc-index
)
722 unless
(= age true-age
) do
723 (let* ((new-misc (hash-misc gc bidi ccc digit decomp flags script lb true-age
))
726 :decomp
(ucd-decomp (gethash code-point
*ucd-entries
*)))))
727 (setf (gethash code-point
*ucd-entries
*) new-ucd
)))))
730 (with-input-txt-file (*standard-input
* "UnicodeData")
732 (format t
"~%//slurp-ucd~%"))
733 (loop for line
= (read-line nil nil
)
736 do
(slurp-ucd-line line number
)))
739 (fixup-hangul-syllables)
740 (complete-misc-table)
743 (fixup-decompositions)
748 (defparameter **proplist-properties
** nil
749 "A list of properties extracted from PropList.txt")
751 (defun parse-property (stream &optional name
)
752 (let ((result (make-array 1 :fill-pointer
0 :adjustable t
)))
753 (loop for line
= (read-line stream nil nil
)
754 for entry
= (subseq line
0 (position #\
# line
))
755 ;; Deal with Blah=Blah in DerivedNormalizationProps.txt
756 while
(and line
(not (position #\
= (substitute #\Space
#\
= line
:count
1))))
757 when
(and entry
(string/= entry
""))
759 (destructuring-bind (start end
)
760 (parse-codepoint-range (car (split-string entry
#\
;)))
761 (vector-push-extend start result
)
762 (vector-push-extend end result
)))
764 (push name
**proplist-properties
**)
765 (push result
**proplist-properties
**))))
767 (defun slurp-proplist ()
768 (with-input-utf8-file (s "PropList")
769 (parse-property s
) ;; Initial comments
770 (parse-property s
:white-space
)
771 (parse-property s
:bidi-control
)
772 (parse-property s
:join-control
)
773 (parse-property s
:dash
)
774 (parse-property s
:hyphen
)
775 (parse-property s
:quotation-mark
)
776 (parse-property s
:terminal-punctuation
)
777 (parse-property s
:other-math
)
778 (parse-property s
:hex-digit
)
779 (parse-property s
:ascii-hex-digit
)
780 (parse-property s
:other-alphabetic
)
781 (parse-property s
:ideographic
)
782 (parse-property s
:diacritic
)
783 (parse-property s
:extender
)
784 (parse-property s
:other-lowercase
)
785 (parse-property s
:other-uppercase
)
786 (parse-property s
:noncharacter-code-point
)
787 (parse-property s
:other-grapheme-extend
)
788 (parse-property s
:ids-binary-operator
)
789 (parse-property s
:ids-trinary-operator
)
790 (parse-property s
:radical
)
791 (parse-property s
:unified-ideograph
)
792 (parse-property s
:other-default-ignorable-code-point
)
793 (parse-property s
:deprecated
)
794 (parse-property s
:soft-dotted
)
795 (parse-property s
:logical-order-exception
)
796 (parse-property s
:other-id-start
)
797 (parse-property s
:other-id-continue
)
798 (parse-property s
:sentence-terminal
)
799 (parse-property s
:variation-selector
)
800 (parse-property s
:pattern-white-space
)
801 (parse-property s
:pattern-syntax
)
802 (parse-property s
:prepended-concatenation-mark
)
803 (parse-property s
:regional-indicator
))
805 (with-input-utf8-file (s "DerivedNormalizationProps")
806 (parse-property s
) ;; Initial comments
807 (parse-property s
) ;; FC_NFKC_Closure
808 (parse-property s
) ;; FC_NFKC_Closure
809 (parse-property s
) ;; Full_Composition_Exclusion
810 (parse-property s
) ;; NFD_QC Comments
811 (parse-property s
:nfd-qc
)
812 (parse-property s
) ;; NFC_QC Comments
813 (parse-property s
:nfc-qc
)
814 (parse-property s
:nfc-qc-maybe
)
815 (parse-property s
) ;; NFKD_QC Comments
816 (parse-property s
:nfkd-qc
)
817 (parse-property s
) ;; NFKC_QC Comments
818 (parse-property s
:nfkc-qc
)
819 (parse-property s
:nfkc-qc-maybe
)
820 (parse-property s
) ;; Expands_On_NFD
821 (parse-property s
) ;; Expands_On_NFC
822 (parse-property s
) ;; Expands_On_NFKD
823 (parse-property s
) ;; Expands_On_NFKC
824 (parse-property s
) ;; NFKC_CF
825 (parse-property s
)) ;; Changes_When_NFKC_Casefolded
827 (with-input-arbitrary-utf8-file (s "emoji-data")
828 (parse-property s
) ;; Initial comments
829 (parse-property s
:emoji
)
830 (parse-property s
:emoji-presentation
)
831 (parse-property s
:emoji-modifier
)
832 (parse-property s
:emoji-modifier-base
)
833 (parse-property s
:emoji-component
)
834 (parse-property s
:extended-pictographic
))
836 (setf **proplist-properties
** (nreverse **proplist-properties
**))
841 (defvar *maximum-variable-key
* 1)
843 (defun bitpack-collation-key (primary secondary tertiary
)
844 ;; 0 <= primary <= #xFFFD (default table)
845 ;; 0 <= secondary <= #x10C [9 bits]
846 ;; 0 <= tertiary <= #x1E (#x1F allowed) [5 bits]
847 ;; Because of this, the bit packs don't overlap
848 (logior (ash primary
16) (ash secondary
5) tertiary
))
850 (defun parse-collation-line (line)
851 (destructuring-bind (%code-points %keys
) (split-string line
#\
;)
852 (let* ((code-points (parse-codepoints %code-points
))
853 (%keys
(subseq %keys
0 (search " #" %keys
)))
857 (split-string (remove #\
[ (remove #\Space %keys
)) #\
]) :test
#'string
=))
859 (loop for key in keys
860 for variable-p
= (position #\
* key
)
862 ;; Don't need first value, it's always just ""
863 (cdr (mapcar (lambda (x) (parse-integer x
:radix
16 :junk-allowed t
))
864 (split-string (substitute #\.
#\
* key
) #\.
)))
866 (destructuring-bind (primary secondary tertiary
) parsed
867 (when variable-p
(setf *maximum-variable-key
*
868 (max primary
*maximum-variable-key
*)))
869 (bitpack-collation-key primary secondary tertiary
)))))
870 (values code-points ret
))))
874 (defparameter *confusables
*
875 (with-input-arbitrary-utf8-file (stream "confusables")
877 (loop for line
= (read-line stream nil nil
)
879 when
(and (not (equal line
""))
880 (char/= (char line
0) #\
#))
884 (parse-integer x
:radix
16))
885 (split-string x
#\Space
))))
886 (let* ((semicolon (position #\
; line))
887 (semicolon2 (position #\
; line :start (1+ semicolon)))
888 (from (parse (subseq line
0 (1- semicolon
))))
889 (to (parse (subseq line
(+ semicolon
2) (1- semicolon2
)))))
890 (assert (= (length from
) 1))
891 (list* (car from
) to
)))))
892 "List of confusable codepoint sets")
894 (defparameter *bidi-mirroring-glyphs
*
895 (with-input-utf8-file (s "BidiMirroring")
896 (loop for line
= (read-line s nil nil
) while line
897 when
(and (plusp (length line
))
898 (char/= (char line
0) #\
#))
901 #'(lambda (c) (parse-codepoints c
:singleton-list nil
))
902 (split-string (subseq line
0 (position #\
# line
)) #\
;))))
903 "List of BIDI mirroring glyph pairs")
905 (defparameter *blocks
*
907 (with-input-utf8-file (stream "Blocks")
909 (let ((line (read-line stream nil nil
)))
910 (cond ((not line
) (return))
911 ((or (string= line
"") (position #\
# line
))) ; ignore
912 (t (let* ((split (split-string line
#\
;))
913 (range (parse-codepoint-range (car split
))))
914 (setq ranges
(list* (cadr range
) (car range
) ranges
))
915 (push (nsubstitute #\-
#\Space
916 (string-left-trim " " (cadr split
)))
918 (cons (nreverse (coerce ranges
'vector
)) (nreverse names
)))
919 "Vector of block starts and ends in a form acceptable to `ordered-ranges-position`.
920 Used to look up block data.")
923 (defun write-codepoint (code-point stream
)
924 (declare (type (unsigned-byte 32) code-point
))
925 (write-byte (ldb (byte 8 16) code-point
) stream
)
926 (write-byte (ldb (byte 8 8) code-point
) stream
)
927 (write-byte (ldb (byte 8 0) code-point
) stream
))
929 (defun output-misc-data ()
930 (with-output-dat-file (stream "ucdmisc")
931 (loop for
(gc-index bidi-index ccc digit decomposition-info flags
932 script line-break age
)
934 ;; three bits spare here
935 do
(write-byte gc-index stream
)
936 ;; three bits spare here
937 (write-byte bidi-index stream
)
938 (write-byte ccc stream
)
939 ;; bits 0-3 encode [0,9], bit 7 is for non-digit status,
940 ;; bit 6 is the decimal-digit flag. Two bits spare
941 (write-byte digit stream
)
942 (write-byte decomposition-info stream
)
943 (write-byte flags stream
) ; includes EAW in bits 0-3, bit 4 is free
944 (write-byte script stream
)
945 (write-byte line-break stream
)
946 (write-byte age stream
))))
948 (defun output-ucd-data ()
949 (with-output-dat-file (high-pages "ucdhigh")
950 (with-output-dat-file (low-pages "ucdlow")
951 ;; Output either the index into the misc array (if all the points in the
952 ;; high-page have the same misc value) or an index into the law-pages
953 ;; array / 256. For indexes into the misc array, set bit 15 (high bit).
954 ;; We should never have that many misc entries, so that's not a problem.
956 ;; If Unicode ever allocates an all-decomposing <First>/<Last> block (the
957 ;; only way to get a high page that outputs as the same and has a
958 ;; non-zero decomposition-index, which there's nowhere to store now),
959 ;; find me, slap me with a fish, and have fun fixing this mess.
960 (loop with low-pages-index
= 0
961 for high-page from
0 to
(ash #x10FFFF -
8)
962 for uniq-ucd-entries
= nil do
963 (loop for low-page from
0 to
#xFF do
965 (gethash (logior low-page
(ash high-page
8)) *ucd-entries
*)
966 uniq-ucd-entries
:test
#'equalp
))
967 (flet ((write-2-byte (int stream
)
968 (declare (type (unsigned-byte 16) int
))
969 (write-byte (ldb (byte 8 8) int
) stream
)
970 (write-byte (ldb (byte 8 0) int
) stream
)))
971 (case (length uniq-ucd-entries
)
972 (0 (error "Somehow, a high page has no codepoints in it."))
973 (1 (write-2-byte (logior
975 (ucd-misc (car uniq-ucd-entries
)))
977 (t (loop for low-page from
0 to
#xFF
978 for cp
= (logior low-page
(ash high-page
8))
979 for entry
= (gethash cp
*ucd-entries
*) do
980 (write-2-byte (ucd-misc entry
) low-pages
)
981 (write-2-byte (ucd-decomp entry
) low-pages
)
982 finally
(write-2-byte low-pages-index high-pages
)
983 (incf low-pages-index
)))))
984 finally
(assert (< low-pages-index
(ash 1 15)))
986 (print low-pages-index
))))))
988 (defun output-decomposition-data ()
989 (with-output-dat-file (stream "decomp")
990 (loop for cp across
*decompositions
* do
991 (write-codepoint cp stream
)))
993 (print (length *decompositions
*))))
995 (defun output-composition-data ()
996 (with-output-lisp-expr-file (stream "comp")
998 (maphash (lambda (k v
) (push (cons k v
) comp
)) *compositions
*)
999 (format stream
"#(~%")
1000 (loop for
(k . v
) in
(sort comp
#'< :key
#'cdr
)
1001 do
(format stream
"(~D . ~D) ; #x~X + #x~X~%"
1002 (logior (ash (car k
) 21) (cdr k
)) v
(car k
) (cdr k
)))
1003 (format stream
")~%"))))
1005 (defun output-case-data ()
1006 (let (casing-pages points-with-case
)
1007 (with-output-dat-file (stream "case")
1008 (loop for cp being the hash-keys in
*case-mapping
*
1009 do
(push cp points-with-case
))
1010 (setf points-with-case
(sort points-with-case
#'<))
1011 (loop for cp in points-with-case
1012 for
(upper . lower
) = (gethash cp
*case-mapping
*) do
1013 (pushnew (ash cp -
6) casing-pages
)
1014 (write-codepoint cp stream
)
1015 (write-byte (if (atom upper
) 0 (length upper
)) stream
)
1016 (if (atom upper
) (write-codepoint upper stream
)
1017 (map 'nil
(lambda (c) (write-codepoint c stream
)) upper
))
1018 (write-byte (if (atom lower
) 0 (length lower
)) stream
)
1019 (if (atom lower
) (write-codepoint lower stream
)
1020 (map 'nil
(lambda (c) (write-codepoint c stream
)) lower
))))
1021 (setf casing-pages
(sort casing-pages
#'<))
1022 (assert (< (length casing-pages
) 256))
1023 (let* ((size (1+ (reduce #'max casing-pages
)))
1024 (array (make-array size
:initial-element
255))
1026 (dolist (entry casing-pages
)
1027 (setf (aref array entry
) (incf page
)))
1028 (with-output-dat-file (stream "casepages")
1030 (write-byte (aref array i
) stream
))))
1031 (with-output-lisp-expr-file (stream "casepages")
1032 (print casing-pages stream
))))
1034 (defun output-collation-data ()
1035 (with-output-lisp-expr-file (output "collation")
1036 (format output
";;;; This is 'allkeys.txt' converted to Lisp syntax~%")
1037 (format output
";;;; Assume *READ-BASE* is 16.~%#(~%")
1038 (with-input-txt-file (input "allkeys")
1039 (loop for line
= (read-line input nil nil
) while line
1040 unless
(or (string= line
"")
1041 (eql 0 (position #\
@ line
)) (eql 0 (position #\
# line
)))
1043 (multiple-value-bind (codepoints keys
) (parse-collation-line line
)
1044 (format output
"(~x . ~x)~%"
1045 ;; Pack up to 3 codepoints, rightmost in the highest bits
1046 (destructuring-bind (first &optional second third
) codepoints
1047 (cond (third (logior (ash third
42) (ash second
21) first
))
1048 (second (logior (ash second
21) first
))
1050 ;; Pack the blob of "keys" using 32 bits each
1052 ;; reversal here is magic. Don't worry about getting it wrong-
1053 ;; unicode-collation.pure checks that.
1054 (dolist (part (reverse keys
) sum
)
1055 (setq sum
(logior (ash sum
32) part
))))))))
1056 (format output
")~%"))
1057 (with-output-lisp-expr-file (*standard-output
* "other-collation-info")
1058 (write-string ";;; The highest primary variable collation index")
1060 (prin1 *maximum-variable-key
*) (terpri)))
1062 (defun output (&optional
(*output-directory
* *output-directory
*))
1063 (ensure-directories-exist *output-directory
*)
1066 (output-decomposition-data)
1067 (output-composition-data)
1069 (output-collation-data)
1070 (with-output-lisp-expr-file (*standard-output
* "misc-properties")
1071 (prin1 **proplist-properties
**))
1073 (loop for
(filename . data
) in
'(("ucd-names" .
*unicode-names
*)
1074 ("ucd1-names" .
*unicode-1-names
*))
1076 (with-output-lisp-expr-file (f filename
)
1077 (format f
";;; Do not edit by hand: generated by ucd.lisp~%")
1078 (loop for
(code . name
) in
(nreverse (symbol-value data
))
1079 do
(format f
"#x~X ~S~%" code name
))))
1080 (setf *unicode-names
* nil
)
1081 (setf *unicode-1-names
* nil
)
1083 (with-output-lisp-expr-file (*standard-output
* "numerics")
1084 (prin1 (mapcar (lambda (x) (cons (car x
) (read-from-string (cdr x
))))
1085 ;; Print it low to high. It was collected with PUSH
1086 (reverse *different-numerics
*))))
1087 (with-output-lisp-expr-file (*standard-output
* "titlecases")
1088 (prin1 *different-titlecases
*))
1089 (with-output-lisp-expr-file (*standard-output
* "foldcases")
1090 (prin1 *different-casefolds
*))
1091 (with-output-lisp-expr-file (*standard-output
* "confusables")
1092 (prin1 *confusables
*))
1093 (with-output-lisp-expr-file (*standard-output
* "bidi-mirrors")
1094 (prin1 *bidi-mirroring-glyphs
*))
1095 (with-output-lisp-expr-file (*standard-output
* "block-ranges")
1096 (prin1 (car *blocks
*)))
1097 (with-output-lisp-expr-file (*standard-output
* "block-names")
1098 (format t
"#(~{:~A~^~% ~})" (cdr *blocks
*)))