5 (defparameter *output-directory
*
7 (make-pathname :directory
'(:relative
:up
"output"))
8 (make-pathname :directory
(pathname-directory *load-truename
*))))
10 (defparameter *page-size-exponent
* 8)
13 (ash cp
(- *page-size-exponent
*)))
16 (ldb (byte *page-size-exponent
* 0) cp
))
20 (defstruct ucd misc transform
)
22 (defparameter *unicode-character-database
*
23 (make-pathname :directory
(pathname-directory *load-truename
*)))
25 (defparameter *ucd-base
* nil
)
26 (defparameter *unicode-names
* (make-hash-table))
28 (defparameter *last-uppercase
* nil
)
29 (defparameter *uppercase-transition-count
* 0)
30 (defparameter *different-titlecases
* nil
)
31 (defparameter *different-numerics
* nil
)
32 (defparameter *name-size
* 0)
33 (defparameter *misc-hash
* (make-hash-table :test
#'equal
))
34 (defparameter *misc-index
* -
1)
35 (defparameter *misc-table
* nil
)
36 (defparameter *misc-mapping
* nil
)
37 (defparameter *both-cases
* nil
)
38 (defparameter *decompositions
* nil
)
39 (defparameter *decomposition-length-max
* nil
)
40 (defparameter *decomposition-types
* nil
)
41 (defparameter *decomposition-base
* nil
)
43 (defun hash-misc (gc-index bidi-index ccc-index decimal-digit digit
44 bidi-mirrored cl-both-case-p
)
45 (let* ((list (list gc-index bidi-index ccc-index decimal-digit digit
46 bidi-mirrored cl-both-case-p
))
47 (index (gethash list
*misc-hash
*)))
50 (vector-push list
*misc-table
*)
51 (setf (gethash list
*misc-hash
*)
52 (incf *misc-index
*))))))
54 (defun compare-misc-entry (left right
)
55 (destructuring-bind (left-gc-index left-bidi-index left-ccc-index
56 left-decimal-digit left-digit left-bidi-mirrored
59 (destructuring-bind (right-gc-index right-bidi-index right-ccc-index
60 right-decimal-digit right-digit right-bidi-mirrored
63 (or (and left-cl-both-case-p
(not right-cl-both-case-p
))
64 (and (or left-cl-both-case-p
(not right-cl-both-case-p
))
65 (or (< left-gc-index right-gc-index
)
66 (and (= left-gc-index right-gc-index
)
67 (or (< left-bidi-index right-bidi-index
)
68 (and (= left-bidi-index right-bidi-index
)
69 (or (< left-ccc-index right-ccc-index
)
70 (and (= left-ccc-index right-ccc-index
)
71 (or (string< left-decimal-digit
73 (and (string= left-decimal-digit
75 (or (string< left-digit right-digit
)
76 (and (string= left-digit
78 (string< left-bidi-mirrored
79 right-bidi-mirrored
))))))))))))))))
81 (defun build-misc-table ()
82 (sort *misc-table
* #'compare-misc-entry
)
83 (setq *misc-mapping
* (make-array (1+ *misc-index
*)))
84 (loop for i from
0 to
*misc-index
*
85 do
(setf (aref *misc-mapping
*
86 (gethash (aref *misc-table
* i
) *misc-hash
*))
90 (setq *last-uppercase
* nil
)
91 (setq *uppercase-transition-count
* 0)
92 (setq *different-titlecases
* nil
)
93 (setq *different-numerics
* nil
)
95 (setq *misc-hash
* (make-hash-table :test
#'equal
))
96 (setq *misc-index
* -
1)
97 (setq *misc-table
* (make-array 256 :fill-pointer
0))
98 (setq *both-cases
* nil
)
99 (setq *decompositions
* 0)
100 (setq *decomposition-types
* (make-hash-table :test
#'equal
))
101 (setq *decomposition-length-max
* 0)
102 (setq *decomposition-base
* (make-array (ash #x110000
103 (- *page-size-exponent
*))
104 :initial-element nil
))
105 (setq *ucd-base
* (make-array (ash #x110000
(- *page-size-exponent
*))
106 :initial-element nil
))
107 (with-open-file (*standard-input
*
108 (make-pathname :name
"UnicodeData"
110 :defaults
*unicode-character-database
*)
112 (loop for line
= (read-line nil nil
)
114 do
(slurp-ucd-line line
)))
117 (fixup-hangul-syllables)
120 (defun fixup-hangul-syllables ()
121 ;; "Hangul Syllable Composition, Unicode 5.1 section 3-12"
122 (let* ((sbase #xac00
)
130 (ncount (* vcount tcount
))
131 (table (make-hash-table)))
132 (with-open-file (*standard-input
*
133 (make-pathname :name
"Jamo" :type
"txt"
134 :defaults
*unicode-character-database
*))
135 (loop for line
= (read-line nil nil
)
137 if
(position #\
; line)
138 do
(add-jamo-information line table
)))
139 (dotimes (sindex scount
)
140 (let* ((l (+ lbase
(floor sindex ncount
)))
141 (v (+ vbase
(floor (mod sindex ncount
) tcount
)))
142 (tee (+ tbase
(mod sindex tcount
)))
143 (name (format nil
"HANGUL_SYLLABLE_~A~A~:[~A~;~]"
144 (gethash l table
) (gethash v table
)
145 (= tee tbase
) (gethash tee table
))))
146 (setf (gethash (+ sbase sindex
) *unicode-names
*) name
)))))
148 (defun add-jamo-information (line table
)
149 (let* ((split (split-string line
#\
;))
150 (code (parse-integer (first split
) :radix
16))
151 (syllable (string-trim '(#\Space
)
152 (subseq (second split
) 0 (position #\
# (second split
))))))
153 (setf (gethash code table
) syllable
)))
155 (defun split-string (line character
)
156 (loop for prev-position
= 0 then
(1+ position
)
157 for position
= (position character line
:start prev-position
)
158 collect
(subseq line prev-position position
)
162 (defun init-indices (strings)
163 (let ((hash (make-hash-table :test
#'equal
)))
164 (loop for string in strings
166 do
(setf (gethash string hash
) index
))
169 (defparameter *general-categories
*
170 (init-indices '("Lu" "Ll" "Lt" "Lm" "Lo" "Cc" "Cf" "Co" "Cs" "Mc"
171 "Me" "Mn" "Nd" "Nl" "No" "Pc" "Pd" "Pe" "Pf" "Pi"
172 "Po" "Ps" "Sc" "Sk" "Sm" "So" "Zl" "Zp" "Zs")))
173 (defparameter *bidi-classes
*
174 (init-indices '("AL" "AN" "B" "BN" "CS" "EN" "ES" "ET" "L" "LRE" "LRO"
175 "NSM" "ON" "PDF" "R" "RLE" "RLO" "S" "WS")))
178 (defparameter *block-first
* nil
)
180 (defun normalize-character-name (name)
181 (when (find #\_ name
)
182 (error "Bad name for a character: ~A" name
))
183 (unless (or (zerop (length name
)) (find #\
< name
) (find #\
> name
))
184 (substitute #\_
#\Space name
)))
186 ;;; 3400 -- 4DB5 : cjk ideograph extension a ;Lo;0;L;;;;;N;;;;;
187 ;;; AC00 -- D7A3 : hangul syllables ;Lo;0;L;;;;;N;;;;;
188 ;;; D800 -- F8FF : surrogates and private use
189 ;;; 20000 -- 2A6D6 : cjk ideograph extension b ;Lo;0;L;;;;;N;;;;;
190 ;;; F0000 -- FFFFD : private use
191 ;;; 100000 -- 10FFFD: private use
192 (defun encode-ucd-line (line code-point
)
193 (destructuring-bind (name general-category canonical-combining-class
194 bidi-class decomposition-type-and-mapping
195 decimal-digit digit numeric bidi-mirrored
196 unicode-1-name iso-10646-comment simple-uppercase
197 simple-lowercase simple-titlecase
)
199 (declare (ignore unicode-1-name iso-10646-comment
))
200 (if (and (> (length name
) 8)
201 (string= ", First>" name
:start2
(- (length name
) 8)))
203 (setq *block-first
* code-point
)
205 (let* ((gc-index (or (gethash general-category
*general-categories
*)
206 (error "unknown general category ~A"
208 (bidi-index (or (gethash bidi-class
*bidi-classes
*)
209 (error "unknown bidirectional class ~A"
211 (ccc-index (parse-integer canonical-combining-class
))
212 (digit-index (unless (string= "" decimal-digit
)
213 (parse-integer decimal-digit
)))
214 (upper-index (unless (string= "" simple-uppercase
)
215 (parse-integer simple-uppercase
:radix
16)))
216 (lower-index (unless (string= "" simple-lowercase
)
217 (parse-integer simple-lowercase
:radix
16)))
218 (title-index (unless (string= "" simple-titlecase
)
219 (parse-integer simple-titlecase
:radix
16)))
221 (not (null (or (and (= gc-index
0) lower-index
)
222 (and (= gc-index
1) upper-index
)))))
223 (misc-index (hash-misc gc-index bidi-index ccc-index
224 decimal-digit digit bidi-mirrored
226 (declare (ignore digit-index
))
227 (when (and (not cl-both-case-p
)
229 (format t
"~A~%" name
))
230 (incf *name-size
* (length name
))
231 (when (string/= "" decomposition-type-and-mapping
)
232 (let ((split (split-string decomposition-type-and-mapping
234 (when (char= #\
< (aref (first split
) 0))
235 (setf (gethash (pop split
) *decomposition-types
*) t
))
236 (unless (aref *decomposition-base
* (cp-high code-point
))
237 (setf (aref *decomposition-base
* (cp-high code-point
))
238 (make-array (ash 1 *page-size-exponent
*)
239 :initial-element nil
)))
240 (setf (aref (aref *decomposition-base
* (cp-high code-point
))
242 (mapcar #'(lambda (string)
243 (parse-integer string
:radix
16))
245 (setq *decomposition-length-max
*
246 (max *decomposition-length-max
* (length split
)))
247 (incf *decompositions
* (length split
))))
248 (when (and (string/= "" simple-uppercase
)
249 (string/= "" simple-lowercase
))
250 (push (list code-point upper-index lower-index
) *both-cases
*))
251 (when (string/= simple-uppercase simple-titlecase
)
252 (push (cons code-point title-index
) *different-titlecases
*))
253 (when (string/= digit numeric
)
254 (push (cons code-point numeric
) *different-numerics
*))
257 (unless *last-uppercase
*
258 (incf *uppercase-transition-count
*))
259 (setq *last-uppercase
* t
))
261 (when *last-uppercase
*
262 (incf *uppercase-transition-count
*))
263 (setq *last-uppercase
* nil
)))
264 (when (> ccc-index
255)
265 (error "canonical combining class too large ~A" ccc-index
))
266 (let ((result (make-ucd :misc misc-index
267 :transform
(or upper-index lower-index
0))))
268 (when (and (> (length name
) 7)
269 (string= ", Last>" name
:start2
(- (length name
) 7)))
270 (let ((page-start (ash (+ *block-first
*
271 (ash 1 *page-size-exponent
*)
273 (- *page-size-exponent
*)))
274 (page-end (ash code-point
(- *page-size-exponent
*))))
275 (loop for point from
*block-first
*
276 below
(ash page-start
*page-size-exponent
*)
277 do
(setf (aref (aref *ucd-base
* (cp-high point
))
280 (loop for page from page-start below page-end
281 do
(setf (aref *ucd-base
* page
)
282 (make-array (ash 1 *page-size-exponent
*)
283 :initial-element result
)))
284 (loop for point from
(ash page-end
*page-size-exponent
*)
286 do
(setf (aref (aref *ucd-base
* (cp-high point
))
289 (values result
(normalize-character-name name
)))))))
291 (defun slurp-ucd-line (line)
292 (let* ((split-line (split-string line
#\
;))
293 (code-point (parse-integer (first split-line
) :radix
16))
294 (code-high (ash code-point
(- *page-size-exponent
*)))
295 (code-low (ldb (byte *page-size-exponent
* 0) code-point
)))
296 (unless (aref *ucd-base
* code-high
)
297 (setf (aref *ucd-base
* code-high
)
298 (make-array (ash 1 *page-size-exponent
*)
299 :initial-element nil
)))
300 (multiple-value-bind (encoding name
)
301 (encode-ucd-line (cdr split-line
) code-point
)
302 (setf (aref (aref *ucd-base
* code-high
) code-low
) encoding
303 (gethash code-point
*unicode-names
*) name
))))
305 (defun second-pass ()
306 (loop for i from
0 below
(length *ucd-base
*)
307 when
(aref *ucd-base
* i
)
308 do
(loop for j from
0 below
(length (aref *ucd-base
* i
))
309 for result
= (aref (aref *ucd-base
* i
) j
)
311 when
(let* ((transform-point (ucd-transform result
))
312 (transform-high (ash transform-point
313 (- *page-size-exponent
*)))
314 (transform-low (ldb (byte *page-size-exponent
* 0)
316 (and (plusp transform-point
)
318 (aref (aref *ucd-base
* transform-high
)
320 (+ (ash i
*page-size-exponent
*) j
))))
321 do
(destructuring-bind (gc-index bidi-index ccc-index
322 decimal-digit digit bidi-mirrored
324 (aref *misc-table
* (ucd-misc result
))
325 (declare (ignore cl-both-case-p
))
326 (format t
"~A~%" (+ (ash i
*page-size-exponent
*) j
))
327 (setf (ucd-misc result
)
328 (hash-misc gc-index bidi-index ccc-index
329 decimal-digit digit bidi-mirrored
332 (defun write-3-byte (triplet stream
)
333 (write-byte (ldb (byte 8 0) triplet
) stream
)
334 (write-byte (ldb (byte 8 8) triplet
) stream
)
335 (write-byte (ldb (byte 8 16) triplet
) stream
))
337 (defun digit-to-byte (digit)
338 (if (string= "" digit
)
340 (parse-integer digit
)))
343 (let ((hash (make-hash-table :test
#'equalp
))
345 (loop for page across
*ucd-base
*
347 (unless (gethash page hash
)
348 (setf (gethash page hash
)
350 (let ((array (make-array (1+ index
))))
351 (maphash #'(lambda (key value
)
352 (setf (aref array value
) key
))
355 (make-array (ash 1 *page-size-exponent
*) :initial-element nil
))
356 (with-open-file (stream (make-pathname :name
"ucd"
358 :defaults
*output-directory
*)
360 :element-type
'(unsigned-byte 8)
361 :if-exists
:supersede
362 :if-does-not-exist
:create
)
363 (loop for
(gc-index bidi-index ccc-index decimal-digit digit
366 do
(write-byte gc-index stream
)
367 do
(write-byte bidi-index stream
)
368 do
(write-byte ccc-index stream
)
369 do
(write-byte (digit-to-byte decimal-digit
) stream
)
370 do
(write-byte (digit-to-byte digit
) stream
)
371 do
(write-byte (if (string= "N" bidi-mirrored
) 0 1) stream
)
372 do
(write-byte 0 stream
)
373 do
(write-byte 0 stream
))
374 (loop for page across
*ucd-base
*
375 do
(write-byte (if page
(gethash page hash
) 0) stream
))
376 (loop for page across array
377 do
(loop for entry across page
378 do
(write-byte (if entry
379 (aref *misc-mapping
* (ucd-misc entry
))
382 do
(write-3-byte (if entry
(ucd-transform entry
) 0)
384 (with-open-file (f (make-pathname :name
"ucd-names" :type
"lisp-expr"
385 :defaults
*output-directory
*)
387 :if-exists
:supersede
388 :if-does-not-exist
:create
)
389 (with-standard-io-syntax
390 (write-string ";;; Do not edit by hand: generated by ucd.lisp" f
)
391 (maphash (lambda (code name
)
396 (setf *unicode-names
* nil
))
397 (with-open-file (*standard-output
*
398 (make-pathname :name
"numerics"
400 :defaults
*output-directory
*)
402 :if-exists
:supersede
403 :if-does-not-exist
:create
)
404 (with-standard-io-syntax
405 (let ((*print-pretty
* t
))
406 (prin1 (mapcar #'(lambda (x) (cons (car x
) (read-from-string (cdr x
))))
407 *different-numerics
*)))))
408 (with-open-file (*standard-output
*
409 (make-pathname :name
"titlecases"
411 :defaults
*output-directory
*)
413 :if-exists
:supersede
414 :if-does-not-exist
:create
)
415 (with-standard-io-syntax
416 (let ((*print-pretty
* t
))
417 (prin1 *different-titlecases
*))))
418 (with-open-file (*standard-output
*
419 (make-pathname :name
"misc"
421 :defaults
*output-directory
*)
423 :if-exists
:supersede
424 :if-does-not-exist
:create
)
425 (with-standard-io-syntax
426 (let ((*print-pretty
* t
))
427 (prin1 `(:length
,(length *misc-table
*)
428 :uppercase
,(loop for
(gc-index) across
*misc-table
*
432 :lowercase
,(loop for
(gc-index) across
*misc-table
*
436 :titlecase
,(loop for
(gc-index) across
*misc-table
*
442 ;;; Use of the generated files
444 (defparameter *compiled-ucd
* nil
)
446 (defun read-compiled-ucd ()
447 (with-open-file (stream (make-pathname :name
"ucd"
449 :defaults
*output-directory
*)
451 :element-type
'(unsigned-byte 8))
452 (let ((length (file-length stream
)))
454 (make-array length
:element-type
'(unsigned-byte 8)))
455 (read-sequence *compiled-ucd
* stream
)))
458 ;;; The stuff below is dependent on misc.lisp-expr being
459 ;;; (:LENGTH 215 :UPPERCASE (0 2) :LOWERCASE (1 3) :TITLECASE (4)).
461 ;;; There are two entries for UPPERCASE and LOWERCASE because some
462 ;;; characters have case (by Unicode standards) but are not
463 ;;; transformable character-by-character in a locale-independent way
464 ;;; (as CL requires for its standard operators).
466 ;;; for more details on these debugging functions, see the description
467 ;;; of the character database format in src/code/target-char.lisp
469 (defparameter *length
* 215)
472 (let* ((cp-high (cp-high cp
))
473 (page (aref *compiled-ucd
* (+ (* 8 *length
*) cp-high
))))
475 (ash #x110000
(- *page-size-exponent
*))
476 (* (ash 4 *page-size-exponent
*) page
)
479 (defun cp-value-0 (cp)
480 (aref *compiled-ucd
* (cp-index cp
)))
482 (defun cp-value-1 (cp)
483 (let ((index (cp-index cp
)))
484 (dpb (aref *compiled-ucd
* (+ index
3)) (byte 8 16)
485 (dpb (aref *compiled-ucd
* (+ index
2)) (byte 8 8)
486 (aref *compiled-ucd
* (1+ index
))))))
488 (defun cp-general-category (cp)
489 (aref *compiled-ucd
* (* 8 (cp-value-0 cp
))))
491 (defun cp-decimal-digit (cp)
492 (let ((decimal-digit (aref *compiled-ucd
* (+ 3 (* 8 (cp-value-0 cp
))))))
493 (and (< decimal-digit
10)
496 (defun cp-alpha-char-p (cp)
497 (< (cp-general-category cp
) 5))
499 (defun cp-alphanumericp (cp)
500 (let ((gc (cp-general-category cp
)))
504 (defun cp-digit-char-p (cp &optional
(radix 10))
505 (let ((number (or (cp-decimal-digit cp
)
510 (when (and number
(< number radix
))
513 (defun cp-graphic-char-p (cp)
517 (defun cp-char-upcase (cp)
518 (if (= (cp-value-0 cp
) 1)
522 (defun cp-char-downcase (cp)
523 (if (= (cp-value-0 cp
) 0)
527 (defun cp-upper-case-p (cp)
528 (= (cp-value-0 cp
) 0))
530 (defun cp-lower-case-p (cp)
531 (= (cp-value-0 cp
) 1))
533 (defun cp-both-case-p (cp)
534 (< (cp-value-0 cp
) 2))