1.0.12.2: oops, test in wrong place
[sbcl.git] / tools-for-build / ucd.lisp
blob8ddcb6757fc65ff7eeccc7f8ad0fbb18f6a6c7c9
1 (in-package "SB-COLD")
3 ;;; Common
5 (defparameter *output-directory*
6 (merge-pathnames
7 (make-pathname :directory '(:relative :up "output"))
8 (make-pathname :directory (pathname-directory *load-truename*))))
10 (defparameter *page-size-exponent* 8)
12 (defun cp-high (cp)
13 (ash cp (- *page-size-exponent*)))
15 (defun cp-low (cp)
16 (ldb (byte *page-size-exponent* 0) cp))
18 ;;; Generator
20 (defstruct ucd misc transform)
22 (defparameter *unicode-character-database*
23 (make-pathname :directory (pathname-directory *load-truename*)))
25 (defparameter *ucd-base* nil)
26 (defparameter *unicode-names* (make-hash-table))
28 (defparameter *last-uppercase* nil)
29 (defparameter *uppercase-transition-count* 0)
30 (defparameter *different-titlecases* nil)
31 (defparameter *different-numerics* nil)
32 (defparameter *name-size* 0)
33 (defparameter *misc-hash* (make-hash-table :test #'equal))
34 (defparameter *misc-index* -1)
35 (defparameter *misc-table* nil)
36 (defparameter *misc-mapping* nil)
37 (defparameter *both-cases* nil)
38 (defparameter *decompositions* nil)
39 (defparameter *decomposition-length-max* nil)
40 (defparameter *decomposition-types* nil)
41 (defparameter *decomposition-base* nil)
43 (defun hash-misc (gc-index bidi-index ccc-index decimal-digit digit
44 bidi-mirrored cl-both-case-p)
45 (let* ((list (list gc-index bidi-index ccc-index decimal-digit digit
46 bidi-mirrored cl-both-case-p))
47 (index (gethash list *misc-hash*)))
48 (or index
49 (progn
50 (vector-push list *misc-table*)
51 (setf (gethash list *misc-hash*)
52 (incf *misc-index*))))))
54 (defun compare-misc-entry (left right)
55 (destructuring-bind (left-gc-index left-bidi-index left-ccc-index
56 left-decimal-digit left-digit left-bidi-mirrored
57 left-cl-both-case-p)
58 left
59 (destructuring-bind (right-gc-index right-bidi-index right-ccc-index
60 right-decimal-digit right-digit right-bidi-mirrored
61 right-cl-both-case-p)
62 right
63 (or (and left-cl-both-case-p (not right-cl-both-case-p))
64 (and (or left-cl-both-case-p (not right-cl-both-case-p))
65 (or (< left-gc-index right-gc-index)
66 (and (= left-gc-index right-gc-index)
67 (or (< left-bidi-index right-bidi-index)
68 (and (= left-bidi-index right-bidi-index)
69 (or (< left-ccc-index right-ccc-index)
70 (and (= left-ccc-index right-ccc-index)
71 (or (string< left-decimal-digit
72 right-decimal-digit)
73 (and (string= left-decimal-digit
74 right-decimal-digit)
75 (or (string< left-digit right-digit)
76 (and (string= left-digit
77 right-digit)
78 (string< left-bidi-mirrored
79 right-bidi-mirrored))))))))))))))))
81 (defun build-misc-table ()
82 (sort *misc-table* #'compare-misc-entry)
83 (setq *misc-mapping* (make-array (1+ *misc-index*)))
84 (loop for i from 0 to *misc-index*
85 do (setf (aref *misc-mapping*
86 (gethash (aref *misc-table* i) *misc-hash*))
87 i)))
89 (defun slurp-ucd ()
90 (setq *last-uppercase* nil)
91 (setq *uppercase-transition-count* 0)
92 (setq *different-titlecases* nil)
93 (setq *different-numerics* nil)
94 (setq *name-size* 0)
95 (setq *misc-hash* (make-hash-table :test #'equal))
96 (setq *misc-index* -1)
97 (setq *misc-table* (make-array 256 :fill-pointer 0))
98 (setq *both-cases* nil)
99 (setq *decompositions* 0)
100 (setq *decomposition-types* (make-hash-table :test #'equal))
101 (setq *decomposition-length-max* 0)
102 (setq *decomposition-base* (make-array (ash #x110000
103 (- *page-size-exponent*))
104 :initial-element nil))
105 (setq *ucd-base* (make-array (ash #x110000 (- *page-size-exponent*))
106 :initial-element nil))
107 (with-open-file (*standard-input*
108 (make-pathname :name "UnicodeData"
109 :type "txt"
110 :defaults *unicode-character-database*)
111 :direction :input)
112 (loop for line = (read-line nil nil)
113 while line
114 do (slurp-ucd-line line)))
115 (second-pass)
116 (build-misc-table)
117 *decompositions*)
119 (defun split-string (line character)
120 (loop for prev-position = 0 then (1+ position)
121 for position = (position character line :start prev-position)
122 collect (subseq line prev-position position)
123 do (unless position
124 (loop-finish))))
126 (defun init-indices (strings)
127 (let ((hash (make-hash-table :test #'equal)))
128 (loop for string in strings
129 for index from 0
130 do (setf (gethash string hash) index))
131 hash))
133 (defparameter *general-categories*
134 (init-indices '("Lu" "Ll" "Lt" "Lm" "Lo" "Cc" "Cf" "Co" "Cs" "Mc"
135 "Me" "Mn" "Nd" "Nl" "No" "Pc" "Pd" "Pe" "Pf" "Pi"
136 "Po" "Ps" "Sc" "Sk" "Sm" "So" "Zl" "Zp" "Zs")))
137 (defparameter *bidi-classes*
138 (init-indices '("AL" "AN" "B" "BN" "CS" "EN" "ES" "ET" "L" "LRE" "LRO"
139 "NSM" "ON" "PDF" "R" "RLE" "RLO" "S" "WS")))
142 (defparameter *block-first* nil)
144 (defun normalize-character-name (name)
145 (when (find #\_ name)
146 (error "Bad name for a character: ~A" name))
147 (unless (or (zerop (length name)) (find #\< name) (find #\> name))
148 (substitute #\_ #\Space name)))
150 ;;; 3400 -- 4DB5 : cjk ideograph extension a ;Lo;0;L;;;;;N;;;;;
151 ;;; AC00 -- D7A3 : hangul syllables ;Lo;0;L;;;;;N;;;;;
152 ;;; D800 -- F8FF : surrogates and private use
153 ;;; 20000 -- 2A6D6 : cjk ideograph extension b ;Lo;0;L;;;;;N;;;;;
154 ;;; F0000 -- FFFFD : private use
155 ;;; 100000 -- 10FFFD: private use
156 (defun encode-ucd-line (line code-point)
157 (destructuring-bind (name general-category canonical-combining-class
158 bidi-class decomposition-type-and-mapping
159 decimal-digit digit numeric bidi-mirrored
160 unicode-1-name iso-10646-comment simple-uppercase
161 simple-lowercase simple-titlecase)
162 line
163 (declare (ignore unicode-1-name iso-10646-comment))
164 (if (and (> (length name) 8)
165 (string= ", First>" name :start2 (- (length name) 8)))
166 (progn
167 (setq *block-first* code-point)
168 nil)
169 (let* ((gc-index (or (gethash general-category *general-categories*)
170 (error "unknown general category ~A"
171 general-category)))
172 (bidi-index (or (gethash bidi-class *bidi-classes*)
173 (error "unknown bidirectional class ~A"
174 bidi-class)))
175 (ccc-index (parse-integer canonical-combining-class))
176 (digit-index (unless (string= "" decimal-digit)
177 (parse-integer decimal-digit)))
178 (upper-index (unless (string= "" simple-uppercase)
179 (parse-integer simple-uppercase :radix 16)))
180 (lower-index (unless (string= "" simple-lowercase)
181 (parse-integer simple-lowercase :radix 16)))
182 (title-index (unless (string= "" simple-titlecase)
183 (parse-integer simple-titlecase :radix 16)))
184 (cl-both-case-p
185 (not (null (or (and (= gc-index 0) lower-index)
186 (and (= gc-index 1) upper-index)))))
187 (misc-index (hash-misc gc-index bidi-index ccc-index
188 decimal-digit digit bidi-mirrored
189 cl-both-case-p)))
190 (declare (ignore digit-index))
191 (incf *name-size* (length name))
192 (when (string/= "" decomposition-type-and-mapping)
193 (let ((split (split-string decomposition-type-and-mapping
194 #\Space)))
195 (when (char= #\< (aref (first split) 0))
196 (setf (gethash (pop split) *decomposition-types*) t))
197 (unless (aref *decomposition-base* (cp-high code-point))
198 (setf (aref *decomposition-base* (cp-high code-point))
199 (make-array (ash 1 *page-size-exponent*)
200 :initial-element nil)))
201 (setf (aref (aref *decomposition-base* (cp-high code-point))
202 (cp-low code-point))
203 (mapcar #'(lambda (string)
204 (parse-integer string :radix 16))
205 split))
206 (setq *decomposition-length-max*
207 (max *decomposition-length-max* (length split)))
208 (incf *decompositions* (length split))))
209 (when (and (string/= "" simple-uppercase)
210 (string/= "" simple-lowercase))
211 (push (list code-point upper-index lower-index) *both-cases*))
212 (when (string/= simple-uppercase simple-titlecase)
213 (push (cons code-point title-index) *different-titlecases*))
214 (when (string/= digit numeric)
215 (push (cons code-point numeric) *different-numerics*))
216 (cond
217 ((= gc-index 8)
218 (unless *last-uppercase*
219 (incf *uppercase-transition-count*))
220 (setq *last-uppercase* t))
222 (when *last-uppercase*
223 (incf *uppercase-transition-count*))
224 (setq *last-uppercase* nil)))
225 (when (> ccc-index 255)
226 (error "canonical combining class too large ~A" ccc-index))
227 (let ((result (make-ucd :misc misc-index
228 :transform (or upper-index lower-index 0))))
229 (when (and (> (length name) 7)
230 (string= ", Last>" name :start2 (- (length name) 7)))
231 (let ((page-start (ash (+ *block-first*
232 (ash 1 *page-size-exponent*)
234 (- *page-size-exponent*)))
235 (page-end (ash code-point (- *page-size-exponent*))))
236 (loop for point from *block-first*
237 below (ash page-start *page-size-exponent*)
238 do (setf (aref (aref *ucd-base* (cp-high point))
239 (cp-low point))
240 result))
241 (loop for page from page-start below page-end
242 do (setf (aref *ucd-base* page)
243 (make-array (ash 1 *page-size-exponent*)
244 :initial-element result)))
245 (loop for point from (ash page-end *page-size-exponent*)
246 below code-point
247 do (setf (aref (aref *ucd-base* (cp-high point))
248 (cp-low point))
249 result))))
250 (values result (normalize-character-name name)))))))
252 (defun slurp-ucd-line (line)
253 (let* ((split-line (split-string line #\;))
254 (code-point (parse-integer (first split-line) :radix 16))
255 (code-high (ash code-point (- *page-size-exponent*)))
256 (code-low (ldb (byte *page-size-exponent* 0) code-point)))
257 (unless (aref *ucd-base* code-high)
258 (setf (aref *ucd-base* code-high)
259 (make-array (ash 1 *page-size-exponent*)
260 :initial-element nil)))
261 (multiple-value-bind (encoding name)
262 (encode-ucd-line (cdr split-line) code-point)
263 (setf (aref (aref *ucd-base* code-high) code-low) encoding
264 (gethash code-point *unicode-names*) name))))
266 (defun second-pass ()
267 (loop for i from 0 below (length *ucd-base*)
268 when (aref *ucd-base* i)
269 do (loop for j from 0 below (length (aref *ucd-base* i))
270 for result = (aref (aref *ucd-base* i) j)
271 when result
272 when (let* ((transform-point (ucd-transform result))
273 (transform-high (ash transform-point
274 (- *page-size-exponent*)))
275 (transform-low (ldb (byte *page-size-exponent* 0)
276 transform-point)))
277 (and (plusp transform-point)
278 (/= (ucd-transform
279 (aref (aref *ucd-base* transform-high)
280 transform-low))
281 (+ (ash i *page-size-exponent*) j))))
282 do (destructuring-bind (gc-index bidi-index ccc-index
283 decimal-digit digit bidi-mirrored
284 cl-both-case-p)
285 (aref *misc-table* (ucd-misc result))
286 (declare (ignore cl-both-case-p))
287 (format t "~A~%" (+ (ash i *page-size-exponent*) j))
288 (setf (ucd-misc result)
289 (hash-misc gc-index bidi-index ccc-index
290 decimal-digit digit bidi-mirrored
291 nil))))))
293 (defun write-3-byte (triplet stream)
294 (write-byte (ldb (byte 8 0) triplet) stream)
295 (write-byte (ldb (byte 8 8) triplet) stream)
296 (write-byte (ldb (byte 8 16) triplet) stream))
298 (defun digit-to-byte (digit)
299 (if (string= "" digit)
301 (parse-integer digit)))
303 (defun output ()
304 (let ((hash (make-hash-table :test #'equalp))
305 (index 0))
306 (loop for page across *ucd-base*
307 do (when page
308 (unless (gethash page hash)
309 (setf (gethash page hash)
310 (incf index)))))
311 (let ((array (make-array (1+ index))))
312 (maphash #'(lambda (key value)
313 (setf (aref array value) key))
314 hash)
315 (setf (aref array 0)
316 (make-array (ash 1 *page-size-exponent*) :initial-element nil))
317 (with-open-file (stream (make-pathname :name "ucd"
318 :type "dat"
319 :defaults *output-directory*)
320 :direction :output
321 :element-type '(unsigned-byte 8)
322 :if-exists :supersede
323 :if-does-not-exist :create)
324 (loop for (gc-index bidi-index ccc-index decimal-digit digit
325 bidi-mirrored)
326 across *misc-table*
327 do (write-byte gc-index stream)
328 do (write-byte bidi-index stream)
329 do (write-byte ccc-index stream)
330 do (write-byte (digit-to-byte decimal-digit) stream)
331 do (write-byte (digit-to-byte digit) stream)
332 do (write-byte (if (string= "N" bidi-mirrored) 0 1) stream)
333 do (write-byte 0 stream)
334 do (write-byte 0 stream))
335 (loop for page across *ucd-base*
336 do (write-byte (if page (gethash page hash) 0) stream))
337 (loop for page across array
338 do (loop for entry across page
339 do (write-byte (if entry
340 (aref *misc-mapping* (ucd-misc entry))
341 255)
342 stream)
343 do (write-3-byte (if entry (ucd-transform entry) 0)
344 stream))))))
345 (with-open-file (f (make-pathname :name "ucd-names" :type "lisp-expr"
346 :defaults *output-directory*)
347 :direction :output
348 :if-exists :supersede
349 :if-does-not-exist :create)
350 (with-standard-io-syntax
351 (write-string ";;; Do not edit by hand: generated by ucd.lisp" f)
352 (maphash (lambda (code name)
353 (when name
354 (print code f)
355 (prin1 name f)))
356 *unicode-names*))
357 (setf *unicode-names* nil))
358 (with-open-file (*standard-output*
359 (make-pathname :name "numerics"
360 :type "lisp-expr"
361 :defaults *output-directory*)
362 :direction :output
363 :if-exists :supersede
364 :if-does-not-exist :create)
365 (let ((*print-pretty* t))
366 (prin1 (mapcar #'(lambda (x) (cons (car x) (read-from-string (cdr x))))
367 *different-numerics*))))
368 (with-open-file (*standard-output*
369 (make-pathname :name "titlecases"
370 :type "lisp-expr"
371 :defaults *output-directory*)
372 :direction :output
373 :if-exists :supersede
374 :if-does-not-exist :create)
375 (let ((*print-pretty* t))
376 (prin1 *different-titlecases*)))
377 (with-open-file (*standard-output*
378 (make-pathname :name "misc"
379 :type "lisp-expr"
380 :defaults *output-directory*)
381 :direction :output
382 :if-exists :supersede
383 :if-does-not-exist :create)
384 (let ((*print-pretty* t))
385 (prin1 `(:length ,(length *misc-table*)
386 :uppercase ,(loop for (gc-index) across *misc-table*
387 for i from 0
388 when (= gc-index 0)
389 collect i)
390 :lowercase ,(loop for (gc-index) across *misc-table*
391 for i from 0
392 when (= gc-index 1)
393 collect i)
394 :titlecase ,(loop for (gc-index) across *misc-table*
395 for i from 0
396 when (= gc-index 2)
397 collect i)))))
398 (values))
400 ;;; Use of the generated files
402 (defparameter *compiled-ucd* nil)
404 (defun read-compiled-ucd ()
405 (with-open-file (stream (make-pathname :name "ucd"
406 :type "dat"
407 :defaults *output-directory*)
408 :direction :input
409 :element-type '(unsigned-byte 8))
410 (let ((length (file-length stream)))
411 (setq *compiled-ucd*
412 (make-array length :element-type '(unsigned-byte 8)))
413 (read-sequence *compiled-ucd* stream)))
414 (values))
416 ;;; The stuff below is dependent on misc.lisp-expr being
417 ;;; (:LENGTH 186 :UPPERCASE (0) :LOWERCASE (1) :TITLECASE (2))
419 (defparameter *length* 186)
421 (defun cp-index (cp)
422 (let* ((cp-high (cp-high cp))
423 (page (aref *compiled-ucd* (+ (* 8 *length*) cp-high))))
424 (+ (* 8 *length*)
425 (ash #x110000 (- *page-size-exponent*))
426 (* (ash 4 *page-size-exponent*) page)
427 (* 4 (cp-low cp)))))
429 (defun cp-value-0 (cp)
430 (aref *compiled-ucd* (cp-index cp)))
432 (defun cp-value-1 (cp)
433 (let ((index (cp-index cp)))
434 (dpb (aref *compiled-ucd* (+ index 3)) (byte 8 16)
435 (dpb (aref *compiled-ucd* (+ index 2)) (byte 8 8)
436 (aref *compiled-ucd* (1+ index))))))
438 (defun cp-general-category (cp)
439 (aref *compiled-ucd* (* 8 (cp-value-0 cp))))
441 (defun cp-decimal-digit (cp)
442 (let ((decimal-digit (aref *compiled-ucd* (+ 3 (* 8 (cp-value-0 cp))))))
443 (and (< decimal-digit 10)
444 decimal-digit)))
446 (defun cp-alpha-char-p (cp)
447 (< (cp-general-category cp) 5))
449 (defun cp-alphanumericp (cp)
450 (let ((gc (cp-general-category cp)))
451 (or (< gc 5)
452 (= gc 12))))
454 (defun cp-digit-char-p (cp &optional (radix 10))
455 (let ((number (or (cp-decimal-digit cp)
456 (and (<= 65 cp 90)
457 (- cp 55))
458 (and (<= 97 cp 122)
459 (- cp 87)))))
460 (when (and number (< number radix))
461 number)))
463 (defun cp-graphic-char-p (cp)
464 (or (<= 32 cp 127)
465 (<= 160 cp)))
467 (defun cp-char-upcase (cp)
468 (if (= (cp-value-0 cp) 1)
469 (cp-value-1 cp)
470 cp))
472 (defun cp-char-downcase (cp)
473 (if (= (cp-value-0 cp) 0)
474 (cp-value-1 cp)
475 cp))
477 (defun cp-upper-case-p (cp)
478 (= (cp-value-0 cp) 0))
480 (defun cp-lower-case-p (cp)
481 (= (cp-value-0 cp) 1))
483 (defun cp-both-case-p (cp)
484 (< (cp-value-0 cp) 2))