5 (defparameter *output-directory
*
7 (make-pathname :directory
'(:relative
:up
"output"))
8 (make-pathname :directory
(pathname-directory *load-truename
*))))
10 (defparameter *page-size-exponent
* 8)
13 (ash cp
(- *page-size-exponent
*)))
16 (ldb (byte *page-size-exponent
* 0) cp
))
20 (defstruct ucd misc transform
)
22 (defparameter *unicode-character-database
*
23 (make-pathname :directory
(pathname-directory *load-truename
*)))
25 (defparameter *ucd-base
* nil
)
26 (defparameter *unicode-names
* (make-hash-table))
28 (defparameter *last-uppercase
* nil
)
29 (defparameter *uppercase-transition-count
* 0)
30 (defparameter *different-titlecases
* nil
)
31 (defparameter *different-numerics
* nil
)
32 (defparameter *name-size
* 0)
33 (defparameter *misc-hash
* (make-hash-table :test
#'equal
))
34 (defparameter *misc-index
* -
1)
35 (defparameter *misc-table
* nil
)
36 (defparameter *misc-mapping
* nil
)
37 (defparameter *both-cases
* nil
)
38 (defparameter *decompositions
* nil
)
39 (defparameter *decomposition-length-max
* nil
)
40 (defparameter *decomposition-types
* nil
)
41 (defparameter *decomposition-base
* nil
)
43 (defun hash-misc (gc-index bidi-index ccc-index decimal-digit digit
44 bidi-mirrored cl-both-case-p
)
45 (let* ((list (list gc-index bidi-index ccc-index decimal-digit digit
46 bidi-mirrored cl-both-case-p
))
47 (index (gethash list
*misc-hash
*)))
50 (vector-push list
*misc-table
*)
51 (setf (gethash list
*misc-hash
*)
52 (incf *misc-index
*))))))
54 (defun compare-misc-entry (left right
)
55 (destructuring-bind (left-gc-index left-bidi-index left-ccc-index
56 left-decimal-digit left-digit left-bidi-mirrored
59 (destructuring-bind (right-gc-index right-bidi-index right-ccc-index
60 right-decimal-digit right-digit right-bidi-mirrored
63 (or (and left-cl-both-case-p
(not right-cl-both-case-p
))
64 (and (or left-cl-both-case-p
(not right-cl-both-case-p
))
65 (or (< left-gc-index right-gc-index
)
66 (and (= left-gc-index right-gc-index
)
67 (or (< left-bidi-index right-bidi-index
)
68 (and (= left-bidi-index right-bidi-index
)
69 (or (< left-ccc-index right-ccc-index
)
70 (and (= left-ccc-index right-ccc-index
)
71 (or (string< left-decimal-digit
73 (and (string= left-decimal-digit
75 (or (string< left-digit right-digit
)
76 (and (string= left-digit
78 (string< left-bidi-mirrored
79 right-bidi-mirrored
))))))))))))))))
81 (defun build-misc-table ()
82 (sort *misc-table
* #'compare-misc-entry
)
83 (setq *misc-mapping
* (make-array (1+ *misc-index
*)))
84 (loop for i from
0 to
*misc-index
*
85 do
(setf (aref *misc-mapping
*
86 (gethash (aref *misc-table
* i
) *misc-hash
*))
90 (setq *last-uppercase
* nil
)
91 (setq *uppercase-transition-count
* 0)
92 (setq *different-titlecases
* nil
)
93 (setq *different-numerics
* nil
)
95 (setq *misc-hash
* (make-hash-table :test
#'equal
))
96 (setq *misc-index
* -
1)
97 (setq *misc-table
* (make-array 256 :fill-pointer
0))
98 (setq *both-cases
* nil
)
99 (setq *decompositions
* 0)
100 (setq *decomposition-types
* (make-hash-table :test
#'equal
))
101 (setq *decomposition-length-max
* 0)
102 (setq *decomposition-base
* (make-array (ash #x110000
103 (- *page-size-exponent
*))
104 :initial-element nil
))
105 (setq *ucd-base
* (make-array (ash #x110000
(- *page-size-exponent
*))
106 :initial-element nil
))
107 (with-open-file (*standard-input
*
108 (make-pathname :name
"UnicodeData"
110 :defaults
*unicode-character-database
*)
112 (loop for line
= (read-line nil nil
)
114 do
(slurp-ucd-line line
)))
119 (defun split-string (line character
)
120 (loop for prev-position
= 0 then
(1+ position
)
121 for position
= (position character line
:start prev-position
)
122 collect
(subseq line prev-position position
)
126 (defun init-indices (strings)
127 (let ((hash (make-hash-table :test
#'equal
)))
128 (loop for string in strings
130 do
(setf (gethash string hash
) index
))
133 (defparameter *general-categories
*
134 (init-indices '("Lu" "Ll" "Lt" "Lm" "Lo" "Cc" "Cf" "Co" "Cs" "Mc"
135 "Me" "Mn" "Nd" "Nl" "No" "Pc" "Pd" "Pe" "Pf" "Pi"
136 "Po" "Ps" "Sc" "Sk" "Sm" "So" "Zl" "Zp" "Zs")))
137 (defparameter *bidi-classes
*
138 (init-indices '("AL" "AN" "B" "BN" "CS" "EN" "ES" "ET" "L" "LRE" "LRO"
139 "NSM" "ON" "PDF" "R" "RLE" "RLO" "S" "WS")))
142 (defparameter *block-first
* nil
)
144 (defun normalize-character-name (name)
145 (when (find #\_ name
)
146 (error "Bad name for a character: ~A" name
))
147 (unless (or (zerop (length name
)) (find #\
< name
) (find #\
> name
))
148 (substitute #\_
#\Space name
)))
150 ;;; 3400 -- 4DB5 : cjk ideograph extension a ;Lo;0;L;;;;;N;;;;;
151 ;;; AC00 -- D7A3 : hangul syllables ;Lo;0;L;;;;;N;;;;;
152 ;;; D800 -- F8FF : surrogates and private use
153 ;;; 20000 -- 2A6D6 : cjk ideograph extension b ;Lo;0;L;;;;;N;;;;;
154 ;;; F0000 -- FFFFD : private use
155 ;;; 100000 -- 10FFFD: private use
156 (defun encode-ucd-line (line code-point
)
157 (destructuring-bind (name general-category canonical-combining-class
158 bidi-class decomposition-type-and-mapping
159 decimal-digit digit numeric bidi-mirrored
160 unicode-1-name iso-10646-comment simple-uppercase
161 simple-lowercase simple-titlecase
)
163 (declare (ignore unicode-1-name iso-10646-comment
))
164 (if (and (> (length name
) 8)
165 (string= ", First>" name
:start2
(- (length name
) 8)))
167 (setq *block-first
* code-point
)
169 (let* ((gc-index (or (gethash general-category
*general-categories
*)
170 (error "unknown general category ~A"
172 (bidi-index (or (gethash bidi-class
*bidi-classes
*)
173 (error "unknown bidirectional class ~A"
175 (ccc-index (parse-integer canonical-combining-class
))
176 (digit-index (unless (string= "" decimal-digit
)
177 (parse-integer decimal-digit
)))
178 (upper-index (unless (string= "" simple-uppercase
)
179 (parse-integer simple-uppercase
:radix
16)))
180 (lower-index (unless (string= "" simple-lowercase
)
181 (parse-integer simple-lowercase
:radix
16)))
182 (title-index (unless (string= "" simple-titlecase
)
183 (parse-integer simple-titlecase
:radix
16)))
185 (not (null (or (and (= gc-index
0) lower-index
)
186 (and (= gc-index
1) upper-index
)))))
187 (misc-index (hash-misc gc-index bidi-index ccc-index
188 decimal-digit digit bidi-mirrored
190 (declare (ignore digit-index
))
191 (incf *name-size
* (length name
))
192 (when (string/= "" decomposition-type-and-mapping
)
193 (let ((split (split-string decomposition-type-and-mapping
195 (when (char= #\
< (aref (first split
) 0))
196 (setf (gethash (pop split
) *decomposition-types
*) t
))
197 (unless (aref *decomposition-base
* (cp-high code-point
))
198 (setf (aref *decomposition-base
* (cp-high code-point
))
199 (make-array (ash 1 *page-size-exponent
*)
200 :initial-element nil
)))
201 (setf (aref (aref *decomposition-base
* (cp-high code-point
))
203 (mapcar #'(lambda (string)
204 (parse-integer string
:radix
16))
206 (setq *decomposition-length-max
*
207 (max *decomposition-length-max
* (length split
)))
208 (incf *decompositions
* (length split
))))
209 (when (and (string/= "" simple-uppercase
)
210 (string/= "" simple-lowercase
))
211 (push (list code-point upper-index lower-index
) *both-cases
*))
212 (when (string/= simple-uppercase simple-titlecase
)
213 (push (cons code-point title-index
) *different-titlecases
*))
214 (when (string/= digit numeric
)
215 (push (cons code-point numeric
) *different-numerics
*))
218 (unless *last-uppercase
*
219 (incf *uppercase-transition-count
*))
220 (setq *last-uppercase
* t
))
222 (when *last-uppercase
*
223 (incf *uppercase-transition-count
*))
224 (setq *last-uppercase
* nil
)))
225 (when (> ccc-index
255)
226 (error "canonical combining class too large ~A" ccc-index
))
227 (let ((result (make-ucd :misc misc-index
228 :transform
(or upper-index lower-index
0))))
229 (when (and (> (length name
) 7)
230 (string= ", Last>" name
:start2
(- (length name
) 7)))
231 (let ((page-start (ash (+ *block-first
*
232 (ash 1 *page-size-exponent
*)
234 (- *page-size-exponent
*)))
235 (page-end (ash code-point
(- *page-size-exponent
*))))
236 (loop for point from
*block-first
*
237 below
(ash page-start
*page-size-exponent
*)
238 do
(setf (aref (aref *ucd-base
* (cp-high point
))
241 (loop for page from page-start below page-end
242 do
(setf (aref *ucd-base
* page
)
243 (make-array (ash 1 *page-size-exponent
*)
244 :initial-element result
)))
245 (loop for point from
(ash page-end
*page-size-exponent
*)
247 do
(setf (aref (aref *ucd-base
* (cp-high point
))
250 (values result
(normalize-character-name name
)))))))
252 (defun slurp-ucd-line (line)
253 (let* ((split-line (split-string line
#\
;))
254 (code-point (parse-integer (first split-line
) :radix
16))
255 (code-high (ash code-point
(- *page-size-exponent
*)))
256 (code-low (ldb (byte *page-size-exponent
* 0) code-point
)))
257 (unless (aref *ucd-base
* code-high
)
258 (setf (aref *ucd-base
* code-high
)
259 (make-array (ash 1 *page-size-exponent
*)
260 :initial-element nil
)))
261 (multiple-value-bind (encoding name
)
262 (encode-ucd-line (cdr split-line
) code-point
)
263 (setf (aref (aref *ucd-base
* code-high
) code-low
) encoding
264 (gethash code-point
*unicode-names
*) name
))))
266 (defun second-pass ()
267 (loop for i from
0 below
(length *ucd-base
*)
268 when
(aref *ucd-base
* i
)
269 do
(loop for j from
0 below
(length (aref *ucd-base
* i
))
270 for result
= (aref (aref *ucd-base
* i
) j
)
272 when
(let* ((transform-point (ucd-transform result
))
273 (transform-high (ash transform-point
274 (- *page-size-exponent
*)))
275 (transform-low (ldb (byte *page-size-exponent
* 0)
277 (and (plusp transform-point
)
279 (aref (aref *ucd-base
* transform-high
)
281 (+ (ash i
*page-size-exponent
*) j
))))
282 do
(destructuring-bind (gc-index bidi-index ccc-index
283 decimal-digit digit bidi-mirrored
285 (aref *misc-table
* (ucd-misc result
))
286 (declare (ignore cl-both-case-p
))
287 (format t
"~A~%" (+ (ash i
*page-size-exponent
*) j
))
288 (setf (ucd-misc result
)
289 (hash-misc gc-index bidi-index ccc-index
290 decimal-digit digit bidi-mirrored
293 (defun write-3-byte (triplet stream
)
294 (write-byte (ldb (byte 8 0) triplet
) stream
)
295 (write-byte (ldb (byte 8 8) triplet
) stream
)
296 (write-byte (ldb (byte 8 16) triplet
) stream
))
298 (defun digit-to-byte (digit)
299 (if (string= "" digit
)
301 (parse-integer digit
)))
304 (let ((hash (make-hash-table :test
#'equalp
))
306 (loop for page across
*ucd-base
*
308 (unless (gethash page hash
)
309 (setf (gethash page hash
)
311 (let ((array (make-array (1+ index
))))
312 (maphash #'(lambda (key value
)
313 (setf (aref array value
) key
))
316 (make-array (ash 1 *page-size-exponent
*) :initial-element nil
))
317 (with-open-file (stream (make-pathname :name
"ucd"
319 :defaults
*output-directory
*)
321 :element-type
'(unsigned-byte 8)
322 :if-exists
:supersede
323 :if-does-not-exist
:create
)
324 (loop for
(gc-index bidi-index ccc-index decimal-digit digit
327 do
(write-byte gc-index stream
)
328 do
(write-byte bidi-index stream
)
329 do
(write-byte ccc-index stream
)
330 do
(write-byte (digit-to-byte decimal-digit
) stream
)
331 do
(write-byte (digit-to-byte digit
) stream
)
332 do
(write-byte (if (string= "N" bidi-mirrored
) 0 1) stream
)
333 do
(write-byte 0 stream
)
334 do
(write-byte 0 stream
))
335 (loop for page across
*ucd-base
*
336 do
(write-byte (if page
(gethash page hash
) 0) stream
))
337 (loop for page across array
338 do
(loop for entry across page
339 do
(write-byte (if entry
340 (aref *misc-mapping
* (ucd-misc entry
))
343 do
(write-3-byte (if entry
(ucd-transform entry
) 0)
345 (with-open-file (f (make-pathname :name
"ucd-names" :type
"lisp-expr"
346 :defaults
*output-directory
*)
348 :if-exists
:supersede
349 :if-does-not-exist
:create
)
350 (with-standard-io-syntax
351 (write-string ";;; Do not edit by hand: generated by ucd.lisp" f
)
352 (maphash (lambda (code name
)
357 (setf *unicode-names
* nil
))
358 (with-open-file (*standard-output
*
359 (make-pathname :name
"numerics"
361 :defaults
*output-directory
*)
363 :if-exists
:supersede
364 :if-does-not-exist
:create
)
365 (let ((*print-pretty
* t
))
366 (prin1 (mapcar #'(lambda (x) (cons (car x
) (read-from-string (cdr x
))))
367 *different-numerics
*))))
368 (with-open-file (*standard-output
*
369 (make-pathname :name
"titlecases"
371 :defaults
*output-directory
*)
373 :if-exists
:supersede
374 :if-does-not-exist
:create
)
375 (let ((*print-pretty
* t
))
376 (prin1 *different-titlecases
*)))
377 (with-open-file (*standard-output
*
378 (make-pathname :name
"misc"
380 :defaults
*output-directory
*)
382 :if-exists
:supersede
383 :if-does-not-exist
:create
)
384 (let ((*print-pretty
* t
))
385 (prin1 `(:length
,(length *misc-table
*)
386 :uppercase
,(loop for
(gc-index) across
*misc-table
*
390 :lowercase
,(loop for
(gc-index) across
*misc-table
*
394 :titlecase
,(loop for
(gc-index) across
*misc-table
*
400 ;;; Use of the generated files
402 (defparameter *compiled-ucd
* nil
)
404 (defun read-compiled-ucd ()
405 (with-open-file (stream (make-pathname :name
"ucd"
407 :defaults
*output-directory
*)
409 :element-type
'(unsigned-byte 8))
410 (let ((length (file-length stream
)))
412 (make-array length
:element-type
'(unsigned-byte 8)))
413 (read-sequence *compiled-ucd
* stream
)))
416 ;;; The stuff below is dependent on misc.lisp-expr being
417 ;;; (:LENGTH 186 :UPPERCASE (0) :LOWERCASE (1) :TITLECASE (2))
419 (defparameter *length
* 186)
422 (let* ((cp-high (cp-high cp
))
423 (page (aref *compiled-ucd
* (+ (* 8 *length
*) cp-high
))))
425 (ash #x110000
(- *page-size-exponent
*))
426 (* (ash 4 *page-size-exponent
*) page
)
429 (defun cp-value-0 (cp)
430 (aref *compiled-ucd
* (cp-index cp
)))
432 (defun cp-value-1 (cp)
433 (let ((index (cp-index cp
)))
434 (dpb (aref *compiled-ucd
* (+ index
3)) (byte 8 16)
435 (dpb (aref *compiled-ucd
* (+ index
2)) (byte 8 8)
436 (aref *compiled-ucd
* (1+ index
))))))
438 (defun cp-general-category (cp)
439 (aref *compiled-ucd
* (* 8 (cp-value-0 cp
))))
441 (defun cp-decimal-digit (cp)
442 (let ((decimal-digit (aref *compiled-ucd
* (+ 3 (* 8 (cp-value-0 cp
))))))
443 (and (< decimal-digit
10)
446 (defun cp-alpha-char-p (cp)
447 (< (cp-general-category cp
) 5))
449 (defun cp-alphanumericp (cp)
450 (let ((gc (cp-general-category cp
)))
454 (defun cp-digit-char-p (cp &optional
(radix 10))
455 (let ((number (or (cp-decimal-digit cp
)
460 (when (and number
(< number radix
))
463 (defun cp-graphic-char-p (cp)
467 (defun cp-char-upcase (cp)
468 (if (= (cp-value-0 cp
) 1)
472 (defun cp-char-downcase (cp)
473 (if (= (cp-value-0 cp
) 0)
477 (defun cp-upper-case-p (cp)
478 (= (cp-value-0 cp
) 0))
480 (defun cp-lower-case-p (cp)
481 (= (cp-value-0 cp
) 1))
483 (defun cp-both-case-p (cp)
484 (< (cp-value-0 cp
) 2))