0.9.2.10:
[sbcl/eslaughter.git] / tools-for-build / ucd.lisp
blob4ecfdbb40bc4fbe3de7e7291375629cad580ee1e
1 (in-package "SB-COLD")
3 ;;; Common
5 (defparameter *output-directory*
6 (merge-pathnames
7 (make-pathname :directory '(:relative :up "output"))
8 (make-pathname :directory (pathname-directory *load-truename*))))
10 (defparameter *page-size-exponent* 8)
12 (defun cp-high (cp)
13 (ash cp (- *page-size-exponent*)))
15 (defun cp-low (cp)
16 (ldb (byte *page-size-exponent* 0) cp))
18 ;;; Generator
20 (defparameter *unicode-character-database*
21 (make-pathname :directory (pathname-directory *load-truename*)))
23 (defparameter *ucd-base* nil)
25 (defparameter *last-uppercase* nil)
26 (defparameter *uppercase-transition-count* 0)
27 (defparameter *different-titlecases* nil)
28 (defparameter *different-numerics* nil)
29 (defparameter *name-size* 0)
30 (defparameter *misc-hash* (make-hash-table :test #'equal))
31 (defparameter *misc-index* -1)
32 (defparameter *misc-table* nil)
33 (defparameter *misc-mapping* nil)
34 (defparameter *both-cases* nil)
35 (defparameter *decompositions* nil)
36 (defparameter *decomposition-length-max* nil)
37 (defparameter *decomposition-types* nil)
38 (defparameter *decomposition-base* nil)
40 (defun hash-misc (gc-index bidi-index ccc-index decimal-digit digit
41 bidi-mirrored cl-both-case-p)
42 (let* ((list (list gc-index bidi-index ccc-index decimal-digit digit
43 bidi-mirrored cl-both-case-p))
44 (index (gethash list *misc-hash*)))
45 (or index
46 (progn
47 (vector-push list *misc-table*)
48 (setf (gethash list *misc-hash*)
49 (incf *misc-index*))))))
51 (defun compare-misc-entry (left right)
52 (destructuring-bind (left-gc-index left-bidi-index left-ccc-index
53 left-decimal-digit left-digit left-bidi-mirrored
54 left-cl-both-case-p)
55 left
56 (destructuring-bind (right-gc-index right-bidi-index right-ccc-index
57 right-decimal-digit right-digit right-bidi-mirrored
58 right-cl-both-case-p)
59 right
60 (or (and left-cl-both-case-p (not right-cl-both-case-p))
61 (and (or left-cl-both-case-p (not right-cl-both-case-p))
62 (or (< left-gc-index right-gc-index)
63 (and (= left-gc-index right-gc-index)
64 (or (< left-bidi-index right-bidi-index)
65 (and (= left-bidi-index right-bidi-index)
66 (or (< left-ccc-index right-ccc-index)
67 (and (= left-ccc-index right-ccc-index)
68 (or (string< left-decimal-digit
69 right-decimal-digit)
70 (and (string= left-decimal-digit
71 right-decimal-digit)
72 (or (string< left-digit right-digit)
73 (and (string= left-digit
74 right-digit)
75 (string< left-bidi-mirrored
76 right-bidi-mirrored))))))))))))))))
78 (defun build-misc-table ()
79 (sort *misc-table* #'compare-misc-entry)
80 (setq *misc-mapping* (make-array (1+ *misc-index*)))
81 (loop for i from 0 to *misc-index*
82 do (setf (aref *misc-mapping*
83 (gethash (aref *misc-table* i) *misc-hash*))
84 i)))
86 (defun slurp-ucd ()
87 (setq *last-uppercase* nil)
88 (setq *uppercase-transition-count* 0)
89 (setq *different-titlecases* nil)
90 (setq *different-numerics* nil)
91 (setq *name-size* 0)
92 (setq *misc-hash* (make-hash-table :test #'equal))
93 (setq *misc-index* -1)
94 (setq *misc-table* (make-array 256 :fill-pointer 0))
95 (setq *both-cases* nil)
96 (setq *decompositions* 0)
97 (setq *decomposition-types* (make-hash-table :test #'equal))
98 (setq *decomposition-length-max* 0)
99 (setq *decomposition-base* (make-array (ash #x110000
100 (- *page-size-exponent*))
101 :initial-element nil))
102 (setq *ucd-base* (make-array (ash #x110000 (- *page-size-exponent*))
103 :initial-element nil))
104 (with-open-file (*standard-input*
105 (make-pathname :name "UnicodeData"
106 :type "txt"
107 :defaults *unicode-character-database*)
108 :direction :input)
109 (loop for line = (read-line nil nil)
110 while line
111 do (slurp-ucd-line line)))
112 (second-pass)
113 (build-misc-table)
114 *decompositions*)
116 (defun split-string (line character)
117 (loop for prev-position = 0 then (1+ position)
118 for position = (position character line :start prev-position)
119 collect (subseq line prev-position position)
120 do (unless position
121 (loop-finish))))
123 (defun init-indices (strings)
124 (let ((hash (make-hash-table :test #'equal)))
125 (loop for string in strings
126 for index from 0
127 do (setf (gethash string hash) index))
128 hash))
130 (defparameter *general-categories*
131 (init-indices '("Lu" "Ll" "Lt" "Lm" "Lo" "Cc" "Cf" "Co" "Cs" "Mc"
132 "Me" "Mn" "Nd" "Nl" "No" "Pc" "Pd" "Pe" "Pf" "Pi"
133 "Po" "Ps" "Sc" "Sk" "Sm" "So" "Zl" "Zp" "Zs")))
134 (defparameter *bidi-classes*
135 (init-indices '("AL" "AN" "B" "BN" "CS" "EN" "ES" "ET" "L" "LRE" "LRO"
136 "NSM" "ON" "PDF" "R" "RLE" "RLO" "S" "WS")))
139 (defparameter *block-first* nil)
141 ;;; 3400 -- 4DB5 : cjk ideograph extension a ;Lo;0;L;;;;;N;;;;;
142 ;;; AC00 -- D7A3 : hangul syllables ;Lo;0;L;;;;;N;;;;;
143 ;;; D800 -- F8FF : surrogates and private use
144 ;;; 20000 -- 2A6D6 : cjk ideograph extension b ;Lo;0;L;;;;;N;;;;;
145 ;;; F0000 -- FFFFD : private use
146 ;;; 100000 -- 10FFFD: private use
147 (defun encode-ucd-line (line code-point)
148 (destructuring-bind (name general-category canonical-combining-class
149 bidi-class decomposition-type-and-mapping
150 decimal-digit digit numeric bidi-mirrored
151 unicode-1-name iso-10646-comment simple-uppercase
152 simple-lowercase simple-titlecase)
153 line
154 (declare (ignore unicode-1-name iso-10646-comment))
155 (if (and (> (length name) 8)
156 (string= ", First>" name :start2 (- (length name) 8)))
157 (progn
158 (setq *block-first* code-point)
159 nil)
160 (let* ((gc-index (or (gethash general-category *general-categories*)
161 (error "unknown general category ~A"
162 general-category)))
163 (bidi-index (or (gethash bidi-class *bidi-classes*)
164 (error "unknown bidirectional class ~A"
165 bidi-class)))
166 (ccc-index (parse-integer canonical-combining-class))
167 (digit-index (unless (string= "" decimal-digit)
168 (parse-integer decimal-digit)))
169 (upper-index (unless (string= "" simple-uppercase)
170 (parse-integer simple-uppercase :radix 16)))
171 (lower-index (unless (string= "" simple-lowercase)
172 (parse-integer simple-lowercase :radix 16)))
173 (title-index (unless (string= "" simple-titlecase)
174 (parse-integer simple-titlecase :radix 16)))
175 (cl-both-case-p
176 (not (null (or (and (= gc-index 0) lower-index)
177 (and (= gc-index 1) upper-index)))))
178 (misc-index (hash-misc gc-index bidi-index ccc-index
179 decimal-digit digit bidi-mirrored
180 cl-both-case-p)))
181 (declare (ignore digit-index))
182 (incf *name-size* (length name))
183 (when (string/= "" decomposition-type-and-mapping)
184 (let ((split (split-string decomposition-type-and-mapping
185 #\Space)))
186 (when (char= #\< (aref (first split) 0))
187 (setf (gethash (pop split) *decomposition-types*) t))
188 (unless (aref *decomposition-base* (cp-high code-point))
189 (setf (aref *decomposition-base* (cp-high code-point))
190 (make-array (ash 1 *page-size-exponent*)
191 :initial-element nil)))
192 (setf (aref (aref *decomposition-base* (cp-high code-point))
193 (cp-low code-point))
194 (mapcar #'(lambda (string)
195 (parse-integer string :radix 16))
196 split))
197 (setq *decomposition-length-max*
198 (max *decomposition-length-max* (length split)))
199 (incf *decompositions* (length split))))
200 (when (and (string/= "" simple-uppercase)
201 (string/= "" simple-lowercase))
202 (push (list code-point upper-index lower-index) *both-cases*))
203 (when (string/= simple-uppercase simple-titlecase)
204 (push (cons code-point title-index) *different-titlecases*))
205 (when (string/= digit numeric)
206 (push (cons code-point numeric) *different-numerics*))
207 (cond
208 ((= gc-index 8)
209 (unless *last-uppercase*
210 (incf *uppercase-transition-count*))
211 (setq *last-uppercase* t))
213 (when *last-uppercase*
214 (incf *uppercase-transition-count*))
215 (setq *last-uppercase* nil)))
216 (when (> ccc-index 255)
217 (error "canonical combining class too large ~A" ccc-index))
218 (let ((result (vector misc-index (or upper-index lower-index 0))))
219 (when (and (> (length name) 7)
220 (string= ", Last>" name :start2 (- (length name) 7)))
221 (let ((page-start (ash (+ *block-first*
222 (ash 1 *page-size-exponent*)
224 (- *page-size-exponent*)))
225 (page-end (ash code-point (- *page-size-exponent*))))
226 (loop for point from *block-first*
227 below (ash page-start *page-size-exponent*)
228 do (setf (aref (aref *ucd-base* (cp-high point))
229 (cp-low point))
230 result))
231 (loop for page from page-start below page-end
232 do (setf (aref *ucd-base* page)
233 (make-array (ash 1 *page-size-exponent*)
234 :initial-element result)))
235 (loop for point from (ash page-end *page-size-exponent*)
236 below code-point
237 do (setf (aref (aref *ucd-base* (cp-high point))
238 (cp-low point))
239 result))))
240 result)))))
242 (defun slurp-ucd-line (line)
243 (let* ((split-line (split-string line #\;))
244 (code-point (parse-integer (first split-line) :radix 16))
245 (code-high (ash code-point (- *page-size-exponent*)))
246 (code-low (ldb (byte *page-size-exponent* 0) code-point)))
247 (unless (aref *ucd-base* code-high)
248 (setf (aref *ucd-base* code-high)
249 (make-array (ash 1 *page-size-exponent*)
250 :initial-element nil)))
251 (setf (aref (aref *ucd-base* code-high) code-low)
252 (encode-ucd-line (cdr split-line) code-point))))
254 (defun second-pass ()
255 (loop for i from 0 below (length *ucd-base*)
256 when (aref *ucd-base* i)
257 do (loop for j from 0 below (length (aref *ucd-base* i))
258 for result = (aref (aref *ucd-base* i) j)
259 when result
260 when (let* ((transform-point (aref result 1))
261 (transform-high (ash transform-point
262 (- *page-size-exponent*)))
263 (transform-low (ldb (byte *page-size-exponent* 0)
264 transform-point)))
265 (and (plusp transform-point)
266 (/= (aref (aref (aref *ucd-base* transform-high)
267 transform-low)
269 (+ (ash i *page-size-exponent*) j))))
270 do (destructuring-bind (gc-index bidi-index ccc-index
271 decimal-digit digit bidi-mirrored
272 cl-both-case-p)
273 (aref *misc-table* (aref result 0))
274 (declare (ignore cl-both-case-p))
275 (format t "~A~%" (+ (ash i *page-size-exponent*) j))
276 (setf (aref result 0)
277 (hash-misc gc-index bidi-index ccc-index
278 decimal-digit digit bidi-mirrored
279 nil))))))
281 (defun write-3-byte (triplet stream)
282 (write-byte (ldb (byte 8 0) triplet) stream)
283 (write-byte (ldb (byte 8 8) triplet) stream)
284 (write-byte (ldb (byte 8 16) triplet) stream))
286 (defun digit-to-byte (digit)
287 (if (string= "" digit)
289 (parse-integer digit)))
291 (defun output ()
292 (let ((hash (make-hash-table :test #'equalp))
293 (index 0))
294 (loop for page across *ucd-base*
295 do (when page
296 (unless (gethash page hash)
297 (setf (gethash page hash)
298 (incf index)))))
299 (let ((array (make-array (1+ index))))
300 (maphash #'(lambda (key value)
301 (setf (aref array value) key))
302 hash)
303 (setf (aref array 0)
304 (make-array (ash 1 *page-size-exponent*) :initial-element nil))
305 (with-open-file (stream (make-pathname :name "ucd"
306 :type "dat"
307 :defaults *output-directory*)
308 :direction :output
309 :element-type '(unsigned-byte 8)
310 :if-exists :supersede
311 :if-does-not-exist :create)
312 (loop for (gc-index bidi-index ccc-index decimal-digit digit
313 bidi-mirrored)
314 across *misc-table*
315 do (write-byte gc-index stream)
316 do (write-byte bidi-index stream)
317 do (write-byte ccc-index stream)
318 do (write-byte (digit-to-byte decimal-digit) stream)
319 do (write-byte (digit-to-byte digit) stream)
320 do (write-byte (if (string= "N" bidi-mirrored) 0 1) stream)
321 do (write-byte 0 stream)
322 do (write-byte 0 stream))
323 (loop for page across *ucd-base*
324 do (write-byte (if page (gethash page hash) 0) stream))
325 (loop for page across array
326 do (loop for entry across page
327 do (write-byte (if entry
328 (aref *misc-mapping* (aref entry 0))
329 255)
330 stream)
331 do (write-3-byte (if entry (aref entry 1) 0)
332 stream))))))
333 (with-open-file (*standard-output*
334 (make-pathname :name "numerics"
335 :type "lisp-expr"
336 :defaults *output-directory*)
337 :direction :output
338 :if-exists :supersede
339 :if-does-not-exist :create)
340 (let ((*print-pretty* t))
341 (prin1 (mapcar #'(lambda (x) (cons (car x) (read-from-string (cdr x))))
342 *different-numerics*))))
343 (with-open-file (*standard-output*
344 (make-pathname :name "titlecases"
345 :type "lisp-expr"
346 :defaults *output-directory*)
347 :direction :output
348 :if-exists :supersede
349 :if-does-not-exist :create)
350 (let ((*print-pretty* t))
351 (prin1 *different-titlecases*)))
352 (with-open-file (*standard-output*
353 (make-pathname :name "misc"
354 :type "lisp-expr"
355 :defaults *output-directory*)
356 :direction :output
357 :if-exists :supersede
358 :if-does-not-exist :create)
359 (let ((*print-pretty* t))
360 (prin1 `(:length ,(length *misc-table*)
361 :uppercase ,(loop for (gc-index) across *misc-table*
362 for i from 0
363 when (= gc-index 0)
364 collect i)
365 :lowercase ,(loop for (gc-index) across *misc-table*
366 for i from 0
367 when (= gc-index 1)
368 collect i)
369 :titlecase ,(loop for (gc-index) across *misc-table*
370 for i from 0
371 when (= gc-index 2)
372 collect i)))))
373 (values))
375 ;;; Use of the generated files
377 (defparameter *compiled-ucd* nil)
379 (defun read-compiled-ucd ()
380 (with-open-file (stream (make-pathname :name "ucd"
381 :type "dat"
382 :defaults *output-directory*)
383 :direction :input
384 :element-type '(unsigned-byte 8))
385 (let ((length (file-length stream)))
386 (setq *compiled-ucd*
387 (make-array length :element-type '(unsigned-byte 8)))
388 (read-sequence *compiled-ucd* stream)))
389 (values))
391 ;;; The stuff below is dependent on misc.lisp-expr being
392 ;;; (:LENGTH 186 :UPPERCASE (0) :LOWERCASE (1) :TITLECASE (2))
394 (defparameter *length* 186)
396 (defun cp-index (cp)
397 (let* ((cp-high (cp-high cp))
398 (page (aref *compiled-ucd* (+ (* 8 *length*) cp-high))))
399 (+ (* 8 *length*)
400 (ash #x110000 (- *page-size-exponent*))
401 (* (ash 4 *page-size-exponent*) page)
402 (* 4 (cp-low cp)))))
404 (defun cp-value-0 (cp)
405 (aref *compiled-ucd* (cp-index cp)))
407 (defun cp-value-1 (cp)
408 (let ((index (cp-index cp)))
409 (dpb (aref *compiled-ucd* (+ index 3)) (byte 8 16)
410 (dpb (aref *compiled-ucd* (+ index 2)) (byte 8 8)
411 (aref *compiled-ucd* (1+ index))))))
413 (defun cp-general-category (cp)
414 (aref *compiled-ucd* (* 8 (cp-value-0 cp))))
416 (defun cp-decimal-digit (cp)
417 (let ((decimal-digit (aref *compiled-ucd* (+ 3 (* 8 (cp-value-0 cp))))))
418 (and (< decimal-digit 10)
419 decimal-digit)))
421 (defun cp-alpha-char-p (cp)
422 (< (cp-general-category cp) 5))
424 (defun cp-alphanumericp (cp)
425 (let ((gc (cp-general-category cp)))
426 (or (< gc 5)
427 (= gc 12))))
429 (defun cp-digit-char-p (cp &optional (radix 10))
430 (let ((number (or (cp-decimal-digit cp)
431 (and (<= 65 cp 90)
432 (- cp 55))
433 (and (<= 97 cp 122)
434 (- cp 87)))))
435 (when (and number (< number radix))
436 number)))
438 (defun cp-graphic-char-p (cp)
439 (or (<= 32 cp 127)
440 (<= 160 cp)))
442 (defun cp-char-upcase (cp)
443 (if (= (cp-value-0 cp) 1)
444 (cp-value-1 cp)
445 cp))
447 (defun cp-char-downcase (cp)
448 (if (= (cp-value-0 cp) 0)
449 (cp-value-1 cp)
450 cp))
452 (defun cp-upper-case-p (cp)
453 (= (cp-value-0 cp) 0))
455 (defun cp-lower-case-p (cp)
456 (= (cp-value-0 cp) 1))
458 (defun cp-both-case-p (cp)
459 (< (cp-value-0 cp) 2))