tools-for-build/ucd.lisp

   1 (in-package "SB-COLD")
   2
   3 (declaim (optimize debug))
   4
   5 ;;; Common functions
   6
   7 (defvar *output-directory*
   8   (merge-pathnames
   9    (make-pathname :directory '(:relative :up "output" "ucd"))
  10    (make-pathname :directory (pathname-directory *load-truename*))))
  11
  12 (defparameter *unicode-character-database*
  13   (make-pathname :directory (pathname-directory *load-pathname*)))
  14
  15 (defmacro with-input-txt-file ((s name) &body body)
  16   `(with-open-file (,s (make-pathname :name ,name :type "txt"
  17                                       :defaults *unicode-character-database*))
  18      (setf (gethash (format nil "tools-for-build/~A.txt" ,name) *ucd-inputs*) 'used)
  19      ,@body))
  20
  21 (defmacro with-input-utf8-file
  22     ((s name &key (eszets 0) (registereds 1) (copyrights 1)) &body body)
  23   ;; KLUDGE: Unicode data files in general have registered and
  24   ;; copyright marks (non-ASCII characters) in the header;
  25   ;; additionally, CaseFolding.txt as distributed by Unicode contains
  26   ;; a non-ASCII character, an eszet, within a comment to act as an
  27   ;; example.  We can't in general assume that our host lisp will let
  28   ;; us read those, and we can't portably write that we don't care
  29   ;; about the text content of anything on a line after a hash because
  30   ;; text decoding happens at a lower level.  So here we rewrite data
  31   ;; files to exclude the UTF-8 sequences corresponding to those
  32   ;; characters (and error if we see any other UTF-8 sequence).
  33   (let ((in (gensym "IN"))
  34         (out (gensym "OUT")))
  35     `(let ((filename (format nil "~A.txt" ,name)))
  36        (with-open-file (,in (make-pathname :name ,name :type "txt"
  37                                            :defaults *unicode-character-database*)
  38                             :element-type '(unsigned-byte 8))
  39          (setf (gethash (format nil "tools-for-build/~A.txt" ,name) *ucd-inputs*) 'used)
  40          (with-open-file (,out (make-pathname :name ,name :type "txt"
  41                                               :defaults *output-directory*)
  42                                :element-type '(unsigned-byte 8)
  43                                :direction :output
  44                                :if-exists :supersede
  45                                :if-does-not-exist :create)
  46            (setf (gethash (format nil "output/ucd/~A.txt" ,name) *ucd-outputs*) 'made)
  47            (do ((inbyte (read-byte ,in nil nil) (read-byte ,in nil nil))
  48                 (eszet (map '(vector (unsigned-byte 8)) 'char-code "<eszet>"))
  49                 (eszet-count 0)
  50                 (copyright (map '(vector (unsigned-byte 8)) 'char-code "<copyright>"))
  51                 (copyright-count 0)
  52                 (registered (map '(vector (unsigned-byte 8)) 'char-code "<registered>"))
  53                 (registered-count 0))
  54                ((null inbyte)
  55                 (unless (= eszet-count ,eszets)
  56                   (error "Unexpected number of eszets in ~A: ~D (expected ~D)"
  57                          filename eszet-count ,eszets))
  58                 (unless (= copyright-count ,copyrights)
  59                   (error "Unexpected number of copyright symbols in ~A: ~D (expected ~D)"
  60                          filename copyright-count ,copyrights))
  61                 (unless (= registered-count ,registereds)
  62                   (error "Unexpected number of registered symbols in ~A: ~D (expected ~D)"
  63                          filename registered-count ,registereds)))
  64              (cond
  65                ((= inbyte #xc3)
  66                 (let ((second (read-byte ,in nil nil)))
  67                   (cond
  68                     ((null second)
  69                      (error "No continuation after #xc3 in ~A" filename))
  70                     ((= second #x9f) (incf eszet-count) (write-sequence eszet ,out))
  71                     (t (error "Unexpected continuation after #xc3 in ~A: #x~X"
  72                               filename second)))))
  73                ((= inbyte #xc2)
  74                 (let ((second (read-byte ,in nil nil)))
  75                   (cond
  76                     ((null second)
  77                      (error "No continuation after #xc2 in ~A" filename))
  78                     ((= second #xa9) (incf copyright-count) (write-sequence copyright ,out))
  79                     ((= second #xae) (incf registered-count) (write-sequence registered ,out)))))
  80                ((>= inbyte #x7f)
  81                 (error "Unexpected octet in ~A: #x~X" filename inbyte))
  82                (t (write-byte inbyte ,out))))))
  83        (with-open-file (,s (make-pathname :name ,name :type "txt"
  84                                           :defaults *output-directory*))
  85          (multiple-value-prog1
  86              (progn
  87                ,@body)
  88            (unless (eql (peek-char nil ,s nil nil) nil)
  89              (error "Unread data in data file: ~S" ,name)))))))
  90
  91 (defmacro with-input-arbitrary-utf8-file ((s name) &body body)
  92   ;; KLUDGE: the emoji data includes literal emoji; likewise the
  93   ;; confusables.  We just remove all high-bit stuff, on the
  94   ;; assumption that the actual data is still ASCII.
  95   (let ((in (gensym "IN"))
  96         (out (gensym "OUT")))
  97     `(progn
  98        (with-open-file (,in (make-pathname :name ,name :type "txt"
  99                                            :defaults *unicode-character-database*)
 100                             :element-type '(unsigned-byte 8))
 101          (setf (gethash (format nil "tools-for-build/~A.txt" ,name) *ucd-inputs*) 'used)
 102          (with-open-file (,out (make-pathname :name ,name :type "txt"
 103                                               :defaults *output-directory*)
 104                                :element-type '(unsigned-byte 8)
 105                                :direction :output
 106                                :if-exists :supersede
 107                                :if-does-not-exist :create)
 108            (setf (gethash (format nil "output/ucd/~A.txt" ,name) *ucd-outputs*) 'made)
 109            (do ((inbyte (read-byte ,in nil nil) (read-byte ,in nil nil)))
 110                ((null inbyte) nil)
 111              (cond
 112                ((>= inbyte #x7f) nil)
 113                (t (write-byte inbyte ,out))))))
 114        (with-open-file (,s (make-pathname :name ,name :type "txt"
 115                                           :defaults *output-directory*))
 116          (multiple-value-prog1
 117              (progn
 118                ,@body)
 119            (unless (eql (peek-char nil ,s nil nil) nil)
 120              (error "Unread data in data file: ~S" ,name)))))))
 121
 122 (defmacro with-output-dat-file ((s name) &body body)
 123   `(with-open-file (,s (make-pathname :name ,name :type "dat"
 124                                       :defaults *output-directory*)
 125                        :direction :output :element-type '(unsigned-byte 8)
 126                        :if-exists :supersede :if-does-not-exist :create)
 127      (setf (gethash (format nil "output/ucd/~A.dat" ,name) *ucd-outputs*) 'made)
 128      ,@body))
 129
 130 (defmacro with-ucd-output-syntax (&body body)
 131   `(with-standard-io-syntax
 132      (let ((*readtable* (copy-readtable))
 133            (*print-readably* nil)
 134            (*print-pretty* t))
 135        ,@body)))
 136
 137 (defmacro with-output-lisp-expr-file ((s name) &body body)
 138   `(with-open-file (,s (make-pathname :name ,name :type "lisp-expr"
 139                                       :defaults *output-directory*)
 140                        :direction :output :element-type 'character
 141                        :if-exists :supersede :if-does-not-exist :create)
 142      (setf (gethash (format nil "output/ucd/~A.lisp-expr" ,name) *ucd-outputs*) 'made)
 143      (with-ucd-output-syntax
 144          ,@body)))
 145
 146 (defun split-string (line character)
 147   (loop for prev-position = 0 then (1+ position)
 148      for position = (position character line :start prev-position)
 149      collect (subseq line prev-position position)
 150      do (unless position
 151           (loop-finish))))
 152
 153 (defun parse-codepoints (string &key (singleton-list t))
 154   "Gets a list of codepoints out of 'aaaa bbbb cccc', stripping surrounding space"
 155   (let ((list (mapcar
 156               (lambda (s) (parse-integer s :radix 16))
 157               (remove "" (split-string string #\Space) :test #'string=))))
 158     (if (not (or (cdr list) singleton-list)) (car list) list)))
 159
 160
 161 (defun parse-codepoint-range (string)
 162   "Parse the Unicode syntax DDDD|DDDD..DDDD into an inclusive range (start end)"
 163   (destructuring-bind (start &optional empty end) (split-string string #\.)
 164     (declare (ignore empty))
 165     (let* ((head (parse-integer start :radix 16))
 166            (tail (if end
 167                      (parse-integer end :radix 16 :end (position #\Space end))
 168              head)))
 169       (list head tail))))
 170
 171 (defvar *slurped-random-constants*
 172   (with-open-file (f (make-pathname :name "more-ucd-consts" :type "lisp-expr"
 173                                     :defaults *unicode-character-database*))
 174     (setf (gethash "tools-for-build/more-ucd-consts.lisp-expr" *ucd-inputs*) 'used)
 175     (read f)))
 176
 177 (defun init-indices (symbol &aux (strings
 178                                   (or (cadr (assoc symbol *slurped-random-constants*))
 179                                       (error "Missing entry in more-ucd-consts for ~S"
 180                                              symbol))))
 181   (let ((hash (make-hash-table :test #'equal)))
 182     (loop for string in strings
 183        for index from 0
 184        do (setf (gethash string hash) index))
 185     hash))
 186
 187 (defun index-or-lose (key table kind)
 188   (or (gethash key table)
 189       (error "unknown ~A: ~S" kind key)))
 190
 191 (defun clear-flag (bit integer)
 192   (logandc2 integer (ash 1 bit)))
 193
 194 \f
 195 ;;; Output storage globals
 196 (defstruct ucd misc decomp)
 197
 198 (defparameter *unicode-names* nil)
 199 (defparameter *unicode-1-names* nil)
 200
 201 (defparameter *decompositions*
 202   (make-array 10000 :element-type '(unsigned-byte 24) :fill-pointer 0
 203               :adjustable t)) ; 10000 is not a significant number
 204
 205 (defparameter *decomposition-corrections*
 206   (with-input-utf8-file (s "NormalizationCorrections")
 207     (loop with result = nil
 208        for line = (read-line s nil nil) while line
 209        do (when (position #\; line)
 210             (destructuring-bind (cp old-decomp correction version)
 211                 (split-string line #\;)
 212               (declare (ignore old-decomp version))
 213               (push (cons (parse-integer cp :radix 16)
 214                           (parse-integer correction :radix 16))
 215                     result)))
 216        finally (return result)))
 217   "List of decompsotions that were amended in Unicode corrigenda")
 218
 219 (defparameter *compositions* (make-hash-table :test #'equal))
 220 (defparameter *composition-exclusions*
 221   (with-input-utf8-file (s "CompositionExclusions")
 222     (loop with result = nil
 223        for line = (read-line s nil nil) while line
 224        when (and (> (length line) 0) (char/= (char line 0) #\#))
 225        do (push (parse-integer line :end (position #\Space line) :radix 16)
 226                 result) finally (return result)))
 227   "Characters that are excluded from composition according to UAX#15")
 228
 229 (defparameter *different-titlecases* nil)
 230 (defparameter *different-casefolds* nil)
 231
 232 (defparameter *case-mapping*
 233   (with-input-utf8-file (s "SpecialCasing")
 234     (loop with hash = (make-hash-table)
 235        for line = (read-line s nil nil) while line
 236        unless (or (not (position #\# line)) (= 0 (position #\# line)))
 237        do (destructuring-bind (%cp %lower %title %upper &optional context comment)
 238               (split-string line #\;)
 239             (unless (and context comment)
 240               (let ((cp (parse-integer %cp :radix 16))
 241                     (lower (parse-codepoints %lower :singleton-list nil))
 242                     (title (parse-codepoints %title :singleton-list nil))
 243                     (upper (parse-codepoints %upper :singleton-list nil)))
 244                 (setf (gethash cp hash) (cons upper lower))
 245                 (unless (equal title upper) (push (cons cp title) *different-titlecases*)))))
 246          finally (return hash)))
 247   "Maps cp -> (cons uppercase|(uppercase ...) lowercase|(lowercase ...))")
 248
 249 (defparameter *misc-table* (make-array 3000 :fill-pointer 0)
 250 "Holds the entries in the Unicode database's miscellanious array, stored as lists.
 251 These lists have the form (gc-index bidi-index ccc digit decomposition-info
 252 flags script line-break age). Flags is a bit-bashed integer containing
 253 cl-both-case-p, has-case-p, and bidi-mirrored-p, and an east asian width.
 254 Length should be adjusted when the standard changes.")
 255 (defparameter *misc-hash* (make-hash-table :test #'equal)
 256 "Maps a misc list to its position in the misc table.")
 257
 258 (defparameter *different-numerics* nil)
 259
 260 (defparameter *ucd-entries* (make-hash-table))
 261
 262 (defparameter *general-categories* (init-indices '*general-categories*))
 263 (defparameter *bidi-classes* (init-indices '*bidi-classes*))
 264 (defparameter *east-asian-widths* (init-indices '*east-asian-widths*))
 265 (defparameter *scripts* (init-indices '*scripts*))
 266 (defparameter *line-break-classes* (init-indices '*line-break-classes*))
 267
 268 (defparameter *east-asian-width-table*
 269   (with-input-utf8-file (s "EastAsianWidth")
 270     (loop with hash = (make-hash-table)
 271        for line = (read-line s nil nil) while line
 272        unless (or (not (position #\# line)) (= 0 (position #\# line)))
 273        do (destructuring-bind (codepoints value)
 274               (split-string
 275                (string-right-trim " " (subseq line 0 (position #\# line))) #\;)
 276             (let ((range (parse-codepoint-range codepoints))
 277                   (index (index-or-lose value *east-asian-widths* "East Asian width")))
 278               (loop for i from (car range) to (cadr range)
 279                  do (setf (gethash i hash) index))))
 280        finally (return hash)))
 281   "Table of East Asian Widths. Used in the creation of misc entries.")
 282
 283 (defparameter *script-table*
 284   (with-input-utf8-file (s "Scripts")
 285     (loop with hash = (make-hash-table)
 286        for line = (read-line s nil nil) while line
 287        unless (or (not (position #\# line)) (= 0 (position #\# line)))
 288        do (destructuring-bind (codepoints value)
 289               (split-string
 290                (string-right-trim " " (subseq line 0 (position #\# line))) #\;)
 291             (let ((range (parse-codepoint-range codepoints))
 292                   (index (index-or-lose (subseq value 1) *scripts* "script")))
 293               (loop for i from (car range) to (cadr range)
 294                  do (setf (gethash i hash) index))))
 295        finally (return hash)))
 296 "Table of scripts. Used in the creation of misc entries.")
 297
 298 (defparameter *line-break-class-table*
 299   (with-input-utf8-file (s "LineBreak")
 300     (loop with hash = (make-hash-table)
 301        for line = (read-line s nil nil) while line
 302        unless (or (not (position #\# line)) (= 0 (position #\# line)))
 303        do (destructuring-bind (codepoints value)
 304               (split-string
 305                (string-right-trim " " (subseq line 0 (position #\# line))) #\;)
 306             (let* ((range (parse-codepoint-range codepoints))
 307                    ;; Hangul syllables are marked as "Unknown", and programmatically
 308                    ;; handled in SB-UNICODE:LINE-BREAK-CLASS
 309                    (value
 310                      (or (and (member value '("JL" "JV" "JT" "H2" "H3") :test 'string=) "XX")
 311                          value))
 312                    (index (index-or-lose value *line-break-classes* "line break")))
 313               (loop for i from (car range) to (cadr range)
 314                  do (setf (gethash i hash) index))))
 315        finally (return hash)))
 316 "Table of line break classes. Used in the creation of misc entries.")
 317
 318 (defparameter *age-table*
 319   (with-input-utf8-file (s "DerivedAge")
 320     (loop with hash = (make-hash-table)
 321        for line = (read-line s nil nil) while line
 322        unless (or (not (position #\# line)) (= 0 (position #\# line)))
 323        do (destructuring-bind (codepoints value)
 324               (split-string
 325                (string-right-trim " " (subseq line 0 (position #\# line))) #\;)
 326             (let* ((range (parse-codepoint-range codepoints))
 327                    (age-parts (mapcar #'parse-integer (split-string value #\.)))
 328                    (age (logior (ash (car age-parts) 3) (cadr age-parts))))
 329               (loop for i from (car range) to (cadr range)
 330                  do (setf (gethash i hash) age))))
 331        finally (return hash)))
 332 "Table of character ages. Used in the creation of misc entries.")
 333
 334 (defvar *block-first* nil)
 335
 336 \f
 337 ;;; Unicode data file parsing
 338 (defun hash-misc (gc-index bidi-index ccc digit decomposition-info flags
 339                   script line-break age)
 340   (let* ((list (list gc-index bidi-index ccc digit decomposition-info flags
 341                      script line-break age))
 342          (index (gethash list *misc-hash*)))
 343     (or index
 344         (progn
 345           (setf (gethash list *misc-hash*)
 346                 (fill-pointer *misc-table*))
 347           (when (eql nil (vector-push list *misc-table*))
 348             (error "Misc table too small."))
 349           (gethash list *misc-hash*)))))
 350
 351 (defun ordered-ranges-member (item vector)
 352   (labels ((recurse (start end)
 353              (when (< start end)
 354                (let* ((i (+ start (truncate (- end start) 2)))
 355                       (index (* 2 i))
 356                       (elt1 (svref vector index))
 357                       (elt2 (svref vector (1+ index))))
 358                  (cond ((< item elt1)
 359                         (recurse start i))
 360                        ((> item elt2)
 361                         (recurse (+ 1 i) end))
 362                        (t
 363                         item))))))
 364     (recurse 0 (/ (length vector) 2))))
 365
 366 (defun unallocated-bidi-class (code-point)
 367   ;; See tests/data/DerivedBidiClass.txt for more information
 368   (flet ((in (vector class)
 369            (when (ordered-ranges-member code-point vector)
 370              (gethash class *bidi-classes*))))
 371     (cond
 372       ((in #(#x0600 #x07BF
 373              #x0860 #x08FF
 374              #xFB50 #xFDCF
 375              #xFDF0 #xFDFF
 376              #xFE70 #xFEFF
 377              #x10D00 #x10D3F
 378              #x10EC0 #x10EFF
 379              #x10F30 #x10F6F
 380              #x1EC70 #x1ECBF
 381              #x1ED00 #x1ED4F
 382              #x1EE00 #x1EEFF)
 383            "AL"))
 384       ((in #(#x0590 #x05FF
 385              #x07C0 #x085F
 386              #xFB1D #xFB4F
 387              #x10800 #x10CFF
 388              #x10D40 #x10EBF
 389              #x10F00 #x10F2F
 390              #x10F70 #x10FFF
 391              #x1E800 #x1EC6F
 392              #x1ECC0 #x1ECFF
 393              #x1ED50 #x1EDFF
 394              #x1EF00 #x1EFFF)
 395            "R"))
 396       ((in #(#x20A0 #x20CF) "ET"))
 397       ;; BN is non-characters and default-ignorable.
 398       ;; Default-ignorable will be dealt with elsewhere
 399       ((in #(#xFDD0 #xFDEF #xFFFE #xFFFF #x1FFFE #x1FFFF #x2FFFE #x2FFFF
 400              #x3FFFE #x3FFFF #x4FFFE #x4FFFF #x5FFFE #x5FFFF #x6FFFE #x6FFFF
 401              #x7FFFE #x7FFFF #x8FFFE #x8FFFF #x9FFFE #x9FFFF #xAFFFE #xAFFFF
 402              #xBFFFE #xBFFFF #xCFFFE #xCFFFF #xDFFFE #xDFFFF #xEFFFE #xEFFFF
 403              #xFFFFE #xFFFFF #x10FFFE #x10FFFF)
 404             "BN"))
 405       ((in #(#x0 #x10FFFF) "L"))
 406       (t (error "Somehow we've gone too far in unallocated bidi determination")))))
 407
 408 (defun complete-misc-table ()
 409   (loop for code-point from 0 to #x10FFFF do ; Flood-fill unallocated codepoints
 410        (unless (second (multiple-value-list (gethash code-point *ucd-entries*)))
 411          (let* ((unallocated-misc
 412                  ;; unallocated characters have a GC of "Cn", aren't digits
 413                  ;; (digit = 128), have a bidi that depends on their block, and
 414                  ;; don't decompose, combine, or have case. They have an East
 415                  ;; Asian Width (eaw) of "N" (0), and a script, line breaking
 416                  ;; class, and age of 0 ("Unknown"), unless some of those
 417                  ;; properties are otherwise assigned.
 418                  `(,(gethash "Cn" *general-categories*)
 419                     ,(unallocated-bidi-class code-point) 0 128 0
 420                     ,(gethash code-point *east-asian-width-table* 0)
 421                     0 ,(gethash code-point *line-break-class-table* 0)
 422                     ,(gethash code-point *age-table* 0)))
 423                 (unallocated-index (apply #'hash-misc unallocated-misc))
 424                 (unallocated-ucd (make-ucd :misc unallocated-index)))
 425            (setf (gethash code-point *ucd-entries*) unallocated-ucd)))))
 426
 427 (defun expand-decomposition (decomposition)
 428   (loop for cp in decomposition
 429         for ucd = (gethash cp *ucd-entries*)
 430         for length = (elt (aref *misc-table* (ucd-misc ucd)) 4)
 431         if (and (not (logbitp 7 length))
 432                 (plusp length))
 433         append (expand-decomposition (ucd-decomp ucd))
 434         else
 435         collect cp))
 436
 437 ;;; Recursively expand canonical decompositions
 438 (defun fixup-decompositions ()
 439   (loop for did-something = nil
 440         do
 441         (loop for ucd being each hash-value of *ucd-entries*
 442               when (and (ucd-decomp ucd)
 443                         (not (logbitp 7 (elt (aref *misc-table* (ucd-misc ucd)) 4))))
 444               do
 445               (let ((expanded (expand-decomposition (ucd-decomp ucd))))
 446                 (unless (equal expanded (ucd-decomp ucd))
 447                   (setf (ucd-decomp ucd) expanded
 448                         did-something t))))
 449         while did-something)
 450   (loop for i below (hash-table-count *ucd-entries*)
 451         for ucd = (gethash i *ucd-entries*)
 452         for decomp = (ucd-decomp ucd)
 453         do
 454         (setf (ucd-decomp ucd)
 455               (cond ((not (consp decomp)) 0)
 456                     ((logbitp 7 (elt (aref *misc-table* (ucd-misc ucd)) 4))
 457                      (prog1 (length *decompositions*)
 458                        (loop for cp in decomp
 459                              do (vector-push-extend cp *decompositions*))))
 460                     (t
 461                      (let ((misc-entry (copy-list (aref *misc-table* (ucd-misc ucd)))))
 462                        (setf (elt misc-entry 4) (length decomp)
 463                              (ucd-misc ucd) (apply #'hash-misc misc-entry))
 464                        (prog1 (length *decompositions*)
 465                          (loop for cp in decomp
 466                                do (vector-push-extend cp *decompositions*)))))))))
 467
 468 (defun fixup-compositions ()
 469   (flet ((fixup (k v)
 470            (declare (ignore v))
 471            (let* ((cp (car k))
 472                   (ucd (gethash cp *ucd-entries*))
 473                   (misc (aref *misc-table* (ucd-misc ucd)))
 474                   (ccc (third misc)))
 475              ;; we can do everything in the first pass except for
 476              ;; accounting for decompositions where the first
 477              ;; character of the decomposition is not a starter.
 478              (when (/= ccc 0)
 479                (remhash k *compositions*)))))
 480     (maphash #'fixup *compositions*)))
 481
 482 (defun add-jamo-information (line table)
 483   (let* ((split (split-string line #\;))
 484          (code (parse-integer (first split) :radix 16))
 485          (syllable (string-trim
 486                     " "
 487                     (subseq (second split) 0 (position #\# (second split))))))
 488     (setf (gethash code table) syllable)))
 489
 490 (defun fixup-hangul-syllables ()
 491   ;; "Hangul Syllable Composition, Unicode 5.1 section 3-12"
 492   (let* ((sbase #xac00)
 493          (lbase #x1100)
 494          (vbase #x1161)
 495          (tbase #x11a7)
 496          (scount 11172)
 497          (lcount 19)
 498          (vcount 21)
 499          (tcount 28)
 500          (ncount (* vcount tcount))
 501          (table (make-hash-table)))
 502     (declare (ignore lcount))
 503     (with-input-utf8-file (*standard-input* "Jamo")
 504       (loop for line = (read-line nil nil)
 505             while line
 506             if (position #\; line)
 507             do (add-jamo-information line table)))
 508     (dotimes (sindex scount)
 509       (let* ((l (+ lbase (floor sindex ncount)))
 510              (v (+ vbase (floor (mod sindex ncount) tcount)))
 511              (tee (+ tbase (mod sindex tcount)))
 512              (code-point (+ sbase sindex))
 513              (name (format nil "HANGUL_SYLLABLE_~A~A~:[~A~;~]"
 514                            (gethash l table) (gethash v table)
 515                            (= tee tbase) (gethash tee table))))
 516         (push `(,code-point . ,name) *unicode-names*)))))
 517
 518 (defun normalize-character-name (name)
 519   (when (find #\_ name)
 520     (error "Bad name for a character: ~A" name))
 521   ;; U+1F5CF (PAGE)'s name conflicts with the ANSI CL-assigned
 522   ;; name for form feed (^L, U+000C). To avoid a case where
 523   ;; more than one character has a particular name while remaining
 524   ;; standards-compliant, we remove U+1F5CF's name here.
 525   (when (string= name "PAGE")
 526     (return-from normalize-character-name "UNICODE_PAGE"))
 527   (unless (or (zerop (length name)) (find #\< name) (find #\> name))
 528     (substitute #\_ #\Space name)))
 529
 530 ;;;   3400  --  4DB5  : cjk ideograph extension a ;Lo;0;L;;;;;N;;;;;
 531 ;;;   AC00  --  D7A3  : hangul syllables ;Lo;0;L;;;;;N;;;;;
 532 ;;;   D800  --  F8FF  : surrogates and private use
 533 ;;;  20000  --  2A6D6 : cjk ideograph extension b ;Lo;0;L;;;;;N;;;;;
 534 ;;;  F0000  --  FFFFD : private use
 535 ;;; 100000  --  10FFFD: private use
 536 (defun encode-ucd-line (line code-point)
 537   (destructuring-bind (name general-category canonical-combining-class
 538                             bidi-class decomposition-type-and-mapping
 539                             decimal-digit digit numeric bidi-mirrored
 540                             unicode-1-name iso-10646-comment simple-uppercase
 541                             simple-lowercase simple-titlecase)
 542       line
 543     (declare (ignore iso-10646-comment))
 544     (if (and (> (length name) 8)
 545              (string= ", First>" name :start2 (- (length name) 8)))
 546         (progn
 547           (setf *block-first* code-point)
 548           nil)
 549         (let* ((gc-index
 550                  (index-or-lose general-category *general-categories* "general category"))
 551                (bidi-index (index-or-lose bidi-class *bidi-classes* "bidirectional class"))
 552                (ccc (parse-integer canonical-combining-class))
 553                (digit-index (if (string= "" digit) 128 ; non-digits have high bit
 554                                 (let ((%digit (parse-integer digit)))
 555                                   (if (string= digit decimal-digit)
 556                                       ;; decimal-digit-p is in bit 6
 557                                       (logior (ash 1 6) %digit) %digit))))
 558                (upper-index (unless (string= "" simple-uppercase)
 559                               (parse-integer simple-uppercase :radix 16)))
 560                (lower-index (unless (string= "" simple-lowercase)
 561                               (parse-integer simple-lowercase :radix 16)))
 562                (title-index (unless (string= "" simple-titlecase)
 563                               (parse-integer simple-titlecase :radix 16)))
 564                (cl-both-case-p (or (and (= gc-index 0) lower-index)
 565                                    (and (= gc-index 1) upper-index)))
 566                (bidi-mirrored-p (string= bidi-mirrored "Y"))
 567                (decomposition-info 0)
 568                (eaw-index (gethash code-point *east-asian-width-table*))
 569                (script-index (gethash code-point *script-table* 0))
 570                (line-break-index (gethash code-point *line-break-class-table* 0))
 571                (age-index (gethash code-point *age-table* 0))
 572                decomposition)
 573           #+nil
 574           (when (and (not cl-both-case-p)
 575                      (< gc-index 2))
 576             (format t "~A~%" name))
 577
 578           (when (string/= "" decomposition-type-and-mapping)
 579             (let* ((compatibility-p (position #\> decomposition-type-and-mapping)))
 580               (setf decomposition
 581                     (parse-codepoints
 582                      (subseq decomposition-type-and-mapping
 583                              (if compatibility-p (1+ compatibility-p) 0))))
 584               (when (assoc code-point *decomposition-corrections*)
 585                 (setf decomposition
 586                       (list (cdr (assoc code-point *decomposition-corrections*)))))
 587               (setf decomposition-info
 588                     (logior (length decomposition) (if compatibility-p 128 0)))
 589               (unless compatibility-p
 590                 ;; Primary composition excludes:
 591                 ;; * singleton decompositions;
 592                 ;; * decompositions of non-starters;
 593                 ;; * script-specific decompositions;
 594                 ;; * later-version decompositions;
 595                 ;; * decompositions whose first character is a
 596                 ;;   non-starter.
 597                 ;; All but the last case can be handled here;
 598                 ;; for the fixup, see FIXUP-COMPOSITIONS
 599                 (when (and (> decomposition-info 1)
 600                            (= ccc 0)
 601                            (not (member code-point *composition-exclusions*)))
 602                   (unless (= decomposition-info 2)
 603                     (error "canonical decomposition unexpectedly long"))
 604                   (setf (gethash (cons (first decomposition)
 605                                        (second decomposition))
 606                                  *compositions*)
 607                         code-point)))))
 608           ;; Hangul decomposition; see Unicode 6.2 section 3-12
 609           (when (= code-point #xd7a3)
 610             ;; KLUDGE: The decomposition-length for Hangul syllables in the
 611             ;; misc database will be a bit of a lie. It doesn't really matter
 612             ;; since the only purpose of the length is to index into the
 613             ;; decompositions array (which Hangul decomposition doesn't use).
 614             ;; The decomposition index is 0 because we won't be going into the
 615             ;; array
 616             (setf decomposition-info 3))
 617
 618           (unless (gethash code-point *case-mapping*) ; Exclude codepoints from SpecialCasing
 619             (when (string/= simple-uppercase simple-titlecase)
 620               (push (cons code-point title-index) *different-titlecases*))
 621             (and (or upper-index lower-index)
 622                  (setf (gethash code-point *case-mapping*)
 623                        (cons
 624                         (or upper-index code-point)
 625                         (or lower-index code-point)))))
 626
 627           (when (string/= digit numeric)
 628             (push (cons code-point numeric) *different-numerics*))
 629
 630           (when (> ccc 255)
 631             (error "canonical combining class too large ~A" ccc))
 632           (let* ((flags (logior
 633                          (if cl-both-case-p (ash 1 7) 0)
 634                          (if (gethash code-point *case-mapping*) (ash 1 6) 0)
 635                          (if bidi-mirrored-p (ash 1 5) 0)
 636                          eaw-index))
 637                  (misc-index (hash-misc gc-index bidi-index ccc digit-index
 638                                         decomposition-info flags script-index
 639                                         line-break-index age-index))
 640                  (result (make-ucd :misc misc-index
 641                                    :decomp decomposition)))
 642             (when (and (> (length name) 7)
 643                        (string= ", Last>" name :start2 (- (length name) 7)))
 644               ;; We can still do this despite East Asian Width being in the
 645               ;; databasce since each of the UCD <First><Last> blocks
 646               ;; has a consistent East Asian Width
 647               (loop for point from *block-first* to code-point do
 648                    (setf (gethash point *ucd-entries*) result)))
 649             (values result (normalize-character-name name)
 650                     (normalize-character-name unicode-1-name)))))))
 651
 652 (defun slurp-ucd-line (line line-number)
 653   (declare (ignorable line-number))
 654   (let* ((split-line (split-string line #\;))
 655          (code-point (parse-integer (first split-line) :radix 16)))
 656     (multiple-value-bind (encoding name unicode-1-name)
 657         (encode-ucd-line (cdr split-line) code-point)
 658       ;; Dup code-point exist - maybe to do with composition/decomposition? no idea -
 659       ;; but we need to ensure that there are no dups in the name tables.
 660       ;; Since those tables are not hash-tables any more, we have to inefficiently
 661       ;; scan to assert not-existsp. But since scanning an alist is not optimal,
 662       ;; don't enable this warning unless debugging.
 663       #+nil
 664       (when (gethash code-point *ucd-entries*)
 665         (assert (not (assoc code-point *unicode-names*)))
 666         (assert (not (assoc code-point *unicode-1-names*)))
 667         (warn "line ~D: dup for code-point ~x, already have ~S"
 668                line-number code-point (gethash code-point *ucd-entries*)))
 669       (setf (gethash code-point *ucd-entries*) encoding)
 670       (when name
 671         (push `(,code-point . ,name) *unicode-names*))
 672       (when unicode-1-name
 673         (push `(,code-point . ,unicode-1-name) *unicode-1-names*)))))
 674
 675 ;;; this fixes up the case conversion discrepancy between CL and
 676 ;;; Unicode: CL operators depend on char-downcase / char-upcase being
 677 ;;; inverses, which is not true in general in Unicode even for
 678 ;;; characters which change case to single characters.
 679 ;;; Also, fix misassigned age values, which are not constant across blocks
 680 (defun second-pass ()
 681   (let ((case-mapping
 682          (sort (loop for code-point being the hash-keys in *case-mapping*
 683                     using (hash-value value)
 684                     collect (cons code-point value))
 685                #'< :key #'car)))
 686     (loop for (code-point upper . lower) in case-mapping
 687        for misc-index = (ucd-misc (gethash code-point *ucd-entries*))
 688        for (gc bidi ccc digit decomp flags script lb age) = (aref *misc-table* misc-index)
 689        when (logbitp 7 flags) do
 690          (when (or (not (atom upper)) (not (atom lower))
 691                    (and (= gc 0)
 692                         (not (equal (car (gethash lower *case-mapping*)) code-point)))
 693                    (and (= gc 1)
 694                         (not (equal (cdr (gethash upper *case-mapping*)) code-point))))
 695            (let* ((new-flags (clear-flag 7 flags))
 696                   (new-misc (hash-misc gc bidi ccc digit decomp new-flags script lb age)))
 697              (setf (ucd-misc (gethash code-point *ucd-entries*)) new-misc))))))
 698
 699 (defun fixup-casefolding ()
 700   (with-input-utf8-file (s "CaseFolding" :eszets 1)
 701     (loop for line = (read-line s nil nil)
 702           while line
 703           unless (or (not (position #\; line)) (equal (position #\# line) 0))
 704             do (destructuring-bind (original type mapping comment)
 705                    (split-string line #\;)
 706                  (declare (ignore comment))
 707                  (let ((cp (parse-integer original :radix 16))
 708                        (fold (parse-codepoints mapping :singleton-list nil)))
 709                    (unless (or (string= type " S") (string= type " T"))
 710                      (when (not (equal (cdr (gethash cp *case-mapping*)) fold))
 711                        (push (cons cp fold) *different-casefolds*))))))))
 712
 713 (defun fixup-ages ()
 714   (let ((age-data (sort
 715                    (loop for code-point being the hash-keys in *age-table*
 716                            using (hash-value true-age)
 717                          collect (cons code-point true-age))
 718                    #'< :key #'car)))
 719     (loop for (code-point . true-age) in age-data
 720        for misc-index = (ucd-misc (gethash code-point *ucd-entries*))
 721        for (gc bidi ccc digit decomp flags script lb age) = (aref *misc-table* misc-index)
 722        unless (= age true-age) do
 723          (let* ((new-misc (hash-misc gc bidi ccc digit decomp flags script lb true-age))
 724                 (new-ucd (make-ucd
 725                           :misc new-misc
 726                           :decomp (ucd-decomp (gethash code-point *ucd-entries*)))))
 727            (setf (gethash code-point *ucd-entries*) new-ucd)))))
 728
 729 (defun slurp-ucd ()
 730   (with-input-txt-file (*standard-input* "UnicodeData")
 731     (when *load-verbose*
 732       (format t "~%//slurp-ucd~%"))
 733     (loop for line = (read-line nil nil)
 734           for number from 1
 735           while line
 736           do (slurp-ucd-line line number)))
 737   (second-pass)
 738   (fixup-compositions)
 739   (fixup-hangul-syllables)
 740   (complete-misc-table)
 741   (fixup-casefolding)
 742   (fixup-ages)
 743   (fixup-decompositions)
 744   nil)
 745
 746 \f
 747 ;;; PropList.txt
 748 (defparameter **proplist-properties** nil
 749   "A list of properties extracted from PropList.txt")
 750
 751 (defun parse-property (stream &optional name)
 752   (let ((result (make-array 1 :fill-pointer 0 :adjustable t)))
 753     (loop for line = (read-line stream nil nil)
 754        for entry = (subseq line 0 (position #\# line))
 755        ;; Deal with Blah=Blah in DerivedNormalizationProps.txt
 756        while (and line (not (position #\= (substitute #\Space #\= line :count 1))))
 757        when (and entry (string/= entry ""))
 758        do
 759          (destructuring-bind (start end)
 760              (parse-codepoint-range (car (split-string entry #\;)))
 761            (vector-push-extend start result)
 762            (vector-push-extend end result)))
 763     (when name
 764       (push name **proplist-properties**)
 765       (push result **proplist-properties**))))
 766
 767 (defun slurp-proplist ()
 768   (with-input-utf8-file (s "PropList")
 769     (parse-property s) ;; Initial comments
 770     (parse-property s :white-space)
 771     (parse-property s :bidi-control)
 772     (parse-property s :join-control)
 773     (parse-property s :dash)
 774     (parse-property s :hyphen)
 775     (parse-property s :quotation-mark)
 776     (parse-property s :terminal-punctuation)
 777     (parse-property s :other-math)
 778     (parse-property s :hex-digit)
 779     (parse-property s :ascii-hex-digit)
 780     (parse-property s :other-alphabetic)
 781     (parse-property s :ideographic)
 782     (parse-property s :diacritic)
 783     (parse-property s :extender)
 784     (parse-property s :other-lowercase)
 785     (parse-property s :other-uppercase)
 786     (parse-property s :noncharacter-code-point)
 787     (parse-property s :other-grapheme-extend)
 788     (parse-property s :ids-binary-operator)
 789     (parse-property s :ids-trinary-operator)
 790     (parse-property s :radical)
 791     (parse-property s :unified-ideograph)
 792     (parse-property s :other-default-ignorable-code-point)
 793     (parse-property s :deprecated)
 794     (parse-property s :soft-dotted)
 795     (parse-property s :logical-order-exception)
 796     (parse-property s :other-id-start)
 797     (parse-property s :other-id-continue)
 798     (parse-property s :sentence-terminal)
 799     (parse-property s :variation-selector)
 800     (parse-property s :pattern-white-space)
 801     (parse-property s :pattern-syntax)
 802     (parse-property s :prepended-concatenation-mark)
 803     (parse-property s :regional-indicator))
 804
 805   (with-input-utf8-file (s "DerivedNormalizationProps")
 806     (parse-property s) ;; Initial comments
 807     (parse-property s) ;; FC_NFKC_Closure
 808     (parse-property s) ;; FC_NFKC_Closure
 809     (parse-property s) ;; Full_Composition_Exclusion
 810     (parse-property s) ;; NFD_QC Comments
 811     (parse-property s :nfd-qc)
 812     (parse-property s) ;; NFC_QC Comments
 813     (parse-property s :nfc-qc)
 814     (parse-property s :nfc-qc-maybe)
 815     (parse-property s) ;; NFKD_QC Comments
 816     (parse-property s :nfkd-qc)
 817     (parse-property s) ;; NFKC_QC Comments
 818     (parse-property s :nfkc-qc)
 819     (parse-property s :nfkc-qc-maybe)
 820     (parse-property s) ;; Expands_On_NFD
 821     (parse-property s) ;; Expands_On_NFC
 822     (parse-property s) ;; Expands_On_NFKD
 823     (parse-property s) ;; Expands_On_NFKC
 824     (parse-property s) ;; NFKC_CF
 825     (parse-property s)) ;; Changes_When_NFKC_Casefolded
 826
 827   (with-input-arbitrary-utf8-file (s "emoji-data")
 828     (parse-property s) ;; Initial comments
 829     (parse-property s :emoji)
 830     (parse-property s :emoji-presentation)
 831     (parse-property s :emoji-modifier)
 832     (parse-property s :emoji-modifier-base)
 833     (parse-property s :emoji-component)
 834     (parse-property s :extended-pictographic))
 835
 836   (setf **proplist-properties** (nreverse **proplist-properties**))
 837   (values))
 838
 839 \f
 840 ;;; Collation keys
 841 (defvar *maximum-variable-key* 1)
 842
 843 (defun bitpack-collation-key (primary secondary tertiary)
 844   ;; 0 <= primary <= #xFFFD (default table)
 845   ;; 0 <= secondary <= #x10C [9 bits]
 846   ;; 0 <= tertiary <= #x1E (#x1F allowed) [5 bits]
 847   ;; Because of this, the bit packs don't overlap
 848   (logior (ash primary 16) (ash secondary 5) tertiary))
 849
 850 (defun parse-collation-line (line)
 851   (destructuring-bind (%code-points %keys) (split-string line #\;)
 852     (let* ((code-points (parse-codepoints %code-points))
 853            (%keys (subseq %keys 0 (search " #" %keys)))
 854            (keys
 855              (remove
 856               ""
 857               (split-string (remove #\[ (remove #\Space %keys)) #\]) :test #'string=))
 858            (ret
 859             (loop for key in keys
 860                for variable-p = (position #\* key)
 861                for parsed =
 862                  ;; Don't need first value, it's always just ""
 863                  (cdr (mapcar (lambda (x) (parse-integer x :radix 16 :junk-allowed t))
 864                               (split-string (substitute #\. #\* key) #\.)))
 865                collect
 866                  (destructuring-bind (primary secondary tertiary) parsed
 867                    (when variable-p (setf *maximum-variable-key*
 868                                           (max primary *maximum-variable-key*)))
 869                    (bitpack-collation-key primary secondary tertiary)))))
 870     (values code-points ret))))
 871
 872 \f
 873 ;;; Other properties
 874 (defparameter *confusables*
 875   (with-input-arbitrary-utf8-file (stream "confusables")
 876     (read-line stream)
 877     (loop for line = (read-line stream nil nil)
 878           while line
 879           when (and (not (equal line ""))
 880                     (char/= (char line 0) #\#))
 881           collect
 882           (flet ((parse (x)
 883                    (mapcar (lambda (x)
 884                              (parse-integer x :radix 16))
 885                            (split-string x #\Space))))
 886             (let* ((semicolon (position #\; line))
 887                    (semicolon2 (position #\; line :start (1+ semicolon)))
 888                    (from (parse (subseq line 0 (1- semicolon))))
 889                    (to (parse (subseq line (+ semicolon 2) (1- semicolon2)))))
 890               (assert (= (length from) 1))
 891               (list* (car from) to)))))
 892   "List of confusable codepoint sets")
 893
 894 (defparameter *bidi-mirroring-glyphs*
 895   (with-input-utf8-file (s "BidiMirroring")
 896     (loop for line = (read-line s nil nil) while line
 897           when (and (plusp (length line))
 898                     (char/= (char line 0) #\#))
 899           collect
 900           (mapcar
 901            #'(lambda (c) (parse-codepoints c :singleton-list nil))
 902            (split-string (subseq line 0 (position #\# line)) #\;))))
 903   "List of BIDI mirroring glyph pairs")
 904
 905 (defparameter *blocks*
 906   (let (ranges names)
 907     (with-input-utf8-file (stream "Blocks")
 908       (loop
 909        (let ((line (read-line stream nil nil)))
 910          (cond ((not line) (return))
 911                ((or (string= line "") (position #\# line))) ; ignore
 912                (t (let* ((split (split-string line #\;))
 913                          (range (parse-codepoint-range (car split))))
 914                     (setq ranges (list* (cadr range) (car range) ranges))
 915                     (push (nsubstitute #\- #\Space
 916                                        (string-left-trim " " (cadr split)))
 917                           names)))))))
 918     (cons (nreverse (coerce ranges 'vector)) (nreverse names)))
 919   "Vector of block starts and ends in a form acceptable to `ordered-ranges-position`.
 920 Used to look up block data.")
 921 \f
 922 ;;; Output code
 923 (defun write-codepoint (code-point stream)
 924   (declare (type (unsigned-byte 32) code-point))
 925   (write-byte (ldb (byte 8 16) code-point) stream)
 926   (write-byte (ldb (byte 8 8) code-point) stream)
 927   (write-byte (ldb (byte 8 0) code-point) stream))
 928
 929 (defun output-misc-data ()
 930   (with-output-dat-file (stream "ucdmisc")
 931     (loop for (gc-index bidi-index ccc digit decomposition-info flags
 932                         script line-break age)
 933        across *misc-table*
 934        ;; three bits spare here
 935        do (write-byte gc-index stream)
 936        ;; three bits spare here
 937          (write-byte bidi-index stream)
 938          (write-byte ccc stream)
 939        ;; bits 0-3 encode [0,9], bit 7 is for non-digit status,
 940        ;; bit 6 is the decimal-digit flag. Two bits spare
 941          (write-byte digit stream)
 942          (write-byte decomposition-info stream)
 943          (write-byte flags stream) ; includes EAW in bits 0-3, bit 4 is free
 944          (write-byte script stream)
 945          (write-byte line-break stream)
 946          (write-byte age stream))))
 947
 948 (defun output-ucd-data ()
 949   (with-output-dat-file (high-pages "ucdhigh")
 950     (with-output-dat-file (low-pages "ucdlow")
 951       ;; Output either the index into the misc array (if all the points in the
 952       ;; high-page have the same misc value) or an index into the law-pages
 953       ;; array / 256. For indexes into the misc array, set bit 15 (high bit).
 954       ;; We should never have that many misc entries, so that's not a problem.
 955
 956       ;; If Unicode ever allocates an all-decomposing <First>/<Last> block (the
 957       ;; only way to get a high page that outputs as the same and has a
 958       ;; non-zero decomposition-index, which there's nowhere to store now),
 959       ;; find me, slap me with a fish, and have fun fixing this mess.
 960       (loop with low-pages-index = 0
 961          for high-page from 0 to (ash #x10FFFF -8)
 962          for uniq-ucd-entries = nil do
 963            (loop for low-page from 0 to #xFF do
 964                 (pushnew
 965                  (gethash (logior low-page (ash high-page 8)) *ucd-entries*)
 966                  uniq-ucd-entries :test #'equalp))
 967            (flet ((write-2-byte (int stream)
 968                     (declare (type (unsigned-byte 16) int))
 969                     (write-byte (ldb (byte 8 8) int) stream)
 970                     (write-byte (ldb (byte 8 0) int) stream)))
 971              (case (length uniq-ucd-entries)
 972                (0 (error "Somehow, a high page has no codepoints in it."))
 973                (1 (write-2-byte (logior
 974                                  (ash 1 15)
 975                                  (ucd-misc (car uniq-ucd-entries)))
 976                                 high-pages))
 977                (t (loop for low-page from 0 to #xFF
 978                      for cp = (logior low-page (ash high-page 8))
 979                      for entry = (gethash cp *ucd-entries*) do
 980                        (write-2-byte (ucd-misc entry) low-pages)
 981                        (write-2-byte (ucd-decomp entry) low-pages)
 982                      finally (write-2-byte low-pages-index high-pages)
 983                        (incf low-pages-index)))))
 984          finally (assert (< low-pages-index (ash 1 15)))
 985                  (when *load-print*
 986                    (print low-pages-index))))))
 987
 988 (defun output-decomposition-data ()
 989   (with-output-dat-file (stream "decomp")
 990     (loop for cp across *decompositions* do
 991          (write-codepoint cp stream)))
 992   (when *load-print*
 993     (print (length *decompositions*))))
 994
 995 (defun output-composition-data ()
 996   (with-output-lisp-expr-file (stream "comp")
 997     (let (comp)
 998       (maphash (lambda (k v) (push (cons k v) comp)) *compositions*)
 999       (format stream "#(~%")
1000       (loop for (k . v) in (sort comp #'< :key #'cdr)
1001          do (format stream "(~D . ~D) ; #x~X + #x~X~%"
1002                     (logior (ash (car k) 21) (cdr k)) v (car k) (cdr k)))
1003       (format stream ")~%"))))
1004
1005 (defun output-case-data ()
1006   (let (casing-pages points-with-case)
1007     (with-output-dat-file (stream "case")
1008       (loop for cp being the hash-keys in *case-mapping*
1009            do (push cp points-with-case))
1010       (setf points-with-case (sort points-with-case #'<))
1011       (loop for cp in points-with-case
1012          for (upper . lower) = (gethash cp *case-mapping*) do
1013            (pushnew (ash cp -6) casing-pages)
1014            (write-codepoint cp stream)
1015            (write-byte (if (atom upper) 0 (length upper)) stream)
1016            (if (atom upper) (write-codepoint upper stream)
1017                (map 'nil (lambda (c) (write-codepoint c stream)) upper))
1018            (write-byte (if (atom lower) 0 (length lower)) stream)
1019            (if (atom lower) (write-codepoint lower stream)
1020                (map 'nil (lambda (c) (write-codepoint c stream)) lower))))
1021     (setf casing-pages (sort casing-pages #'<))
1022     (assert (< (length casing-pages) 256))
1023     (let* ((size (1+ (reduce #'max casing-pages)))
1024            (array (make-array size :initial-element 255))
1025            (page -1))
1026       (dolist (entry casing-pages)
1027         (setf (aref array entry) (incf page)))
1028       (with-output-dat-file (stream "casepages")
1029         (dotimes (i size)
1030           (write-byte (aref array i) stream))))
1031     (with-output-lisp-expr-file (stream "casepages")
1032       (print casing-pages stream))))
1033
1034 (defun output-collation-data ()
1035   (with-output-lisp-expr-file (output "collation")
1036     (format output ";;;; This is 'allkeys.txt' converted to Lisp syntax~%")
1037     (format output ";;;; Assume *READ-BASE* is 16.~%#(~%")
1038     (with-input-txt-file (input "allkeys")
1039       (loop for line = (read-line input nil nil) while line
1040             unless (or (string= line "")
1041                        (eql 0 (position #\@ line)) (eql 0 (position #\# line)))
1042             do
1043          (multiple-value-bind (codepoints keys) (parse-collation-line line)
1044            (format output "(~x . ~x)~%"
1045                    ;; Pack up to 3 codepoints, rightmost in the highest bits
1046                    (destructuring-bind (first &optional second third) codepoints
1047                      (cond (third (logior (ash third 42) (ash second 21) first))
1048                            (second (logior (ash second 21) first))
1049                            (t first)))
1050                    ;; Pack the blob of "keys" using 32 bits each
1051                    (let ((sum 0))
1052                      ;; reversal here is magic. Don't worry about getting it wrong-
1053                      ;; unicode-collation.pure checks that.
1054                      (dolist (part (reverse keys) sum)
1055                        (setq sum (logior (ash sum 32) part))))))))
1056     (format output ")~%"))
1057   (with-output-lisp-expr-file (*standard-output* "other-collation-info")
1058     (write-string ";;; The highest primary variable collation index")
1059     (terpri)
1060     (prin1 *maximum-variable-key*) (terpri)))
1061
1062 (defun output (&optional (*output-directory* *output-directory*))
1063   (ensure-directories-exist *output-directory*)
1064   (output-misc-data)
1065   (output-ucd-data)
1066   (output-decomposition-data)
1067   (output-composition-data)
1068   (output-case-data)
1069   (output-collation-data)
1070   (with-output-lisp-expr-file (*standard-output* "misc-properties")
1071     (prin1 **proplist-properties**))
1072
1073   (loop for (filename . data) in '(("ucd-names" . *unicode-names*)
1074                                    ("ucd1-names" . *unicode-1-names*))
1075         do
1076     (with-output-lisp-expr-file (f filename)
1077       (format f ";;; Do not edit by hand: generated by ucd.lisp~%")
1078       (loop for (code . name) in (nreverse (symbol-value data))
1079             do (format f "#x~X ~S~%" code name))))
1080   (setf *unicode-names* nil)
1081   (setf *unicode-1-names* nil)
1082
1083   (with-output-lisp-expr-file (*standard-output* "numerics")
1084     (prin1 (mapcar (lambda (x) (cons (car x) (read-from-string (cdr x))))
1085                    ;; Print it low to high. It was collected with PUSH
1086                    (reverse *different-numerics*))))
1087   (with-output-lisp-expr-file (*standard-output* "titlecases")
1088     (prin1 *different-titlecases*))
1089   (with-output-lisp-expr-file (*standard-output* "foldcases")
1090     (prin1 *different-casefolds*))
1091   (with-output-lisp-expr-file (*standard-output* "confusables")
1092     (prin1 *confusables*))
1093   (with-output-lisp-expr-file (*standard-output* "bidi-mirrors")
1094     (prin1 *bidi-mirroring-glyphs*))
1095   (with-output-lisp-expr-file (*standard-output* "block-ranges")
1096     (prin1 (car *blocks*)))
1097   (with-output-lisp-expr-file (*standard-output* "block-names")
1098     (format t "#(~{:~A~^~%  ~})" (cdr *blocks*)))
1099   (values))