1.0.33.5: improve DESCRIBE (#lp488976)
[sbcl/llvm.git] / tools-for-build / ucd.lisp
blobb1cae492216d061b327bb402bbbae9dfbeaf1ab0
1 (in-package "SB-COLD")
3 ;;; Common
5 (defparameter *output-directory*
6 (merge-pathnames
7 (make-pathname :directory '(:relative :up "output"))
8 (make-pathname :directory (pathname-directory *load-truename*))))
10 (defparameter *page-size-exponent* 8)
12 (defun cp-high (cp)
13 (ash cp (- *page-size-exponent*)))
15 (defun cp-low (cp)
16 (ldb (byte *page-size-exponent* 0) cp))
18 ;;; Generator
20 (defstruct ucd misc transform)
22 (defparameter *unicode-character-database*
23 (make-pathname :directory (pathname-directory *load-truename*)))
25 (defparameter *ucd-base* nil)
26 (defparameter *unicode-names* (make-hash-table))
28 (defparameter *last-uppercase* nil)
29 (defparameter *uppercase-transition-count* 0)
30 (defparameter *different-titlecases* nil)
31 (defparameter *different-numerics* nil)
32 (defparameter *name-size* 0)
33 (defparameter *misc-hash* (make-hash-table :test #'equal))
34 (defparameter *misc-index* -1)
35 (defparameter *misc-table* nil)
36 (defparameter *misc-mapping* nil)
37 (defparameter *both-cases* nil)
38 (defparameter *decompositions* nil)
39 (defparameter *decomposition-length-max* nil)
40 (defparameter *decomposition-types* nil)
41 (defparameter *decomposition-base* nil)
43 (defun hash-misc (gc-index bidi-index ccc-index decimal-digit digit
44 bidi-mirrored cl-both-case-p)
45 (let* ((list (list gc-index bidi-index ccc-index decimal-digit digit
46 bidi-mirrored cl-both-case-p))
47 (index (gethash list *misc-hash*)))
48 (or index
49 (progn
50 (vector-push list *misc-table*)
51 (setf (gethash list *misc-hash*)
52 (incf *misc-index*))))))
54 (defun compare-misc-entry (left right)
55 (destructuring-bind (left-gc-index left-bidi-index left-ccc-index
56 left-decimal-digit left-digit left-bidi-mirrored
57 left-cl-both-case-p)
58 left
59 (destructuring-bind (right-gc-index right-bidi-index right-ccc-index
60 right-decimal-digit right-digit right-bidi-mirrored
61 right-cl-both-case-p)
62 right
63 (or (and left-cl-both-case-p (not right-cl-both-case-p))
64 (and (or left-cl-both-case-p (not right-cl-both-case-p))
65 (or (< left-gc-index right-gc-index)
66 (and (= left-gc-index right-gc-index)
67 (or (< left-bidi-index right-bidi-index)
68 (and (= left-bidi-index right-bidi-index)
69 (or (< left-ccc-index right-ccc-index)
70 (and (= left-ccc-index right-ccc-index)
71 (or (string< left-decimal-digit
72 right-decimal-digit)
73 (and (string= left-decimal-digit
74 right-decimal-digit)
75 (or (string< left-digit right-digit)
76 (and (string= left-digit
77 right-digit)
78 (string< left-bidi-mirrored
79 right-bidi-mirrored))))))))))))))))
81 (defun build-misc-table ()
82 (sort *misc-table* #'compare-misc-entry)
83 (setq *misc-mapping* (make-array (1+ *misc-index*)))
84 (loop for i from 0 to *misc-index*
85 do (setf (aref *misc-mapping*
86 (gethash (aref *misc-table* i) *misc-hash*))
87 i)))
89 (defun slurp-ucd ()
90 (setq *last-uppercase* nil)
91 (setq *uppercase-transition-count* 0)
92 (setq *different-titlecases* nil)
93 (setq *different-numerics* nil)
94 (setq *name-size* 0)
95 (setq *misc-hash* (make-hash-table :test #'equal))
96 (setq *misc-index* -1)
97 (setq *misc-table* (make-array 256 :fill-pointer 0))
98 (setq *both-cases* nil)
99 (setq *decompositions* 0)
100 (setq *decomposition-types* (make-hash-table :test #'equal))
101 (setq *decomposition-length-max* 0)
102 (setq *decomposition-base* (make-array (ash #x110000
103 (- *page-size-exponent*))
104 :initial-element nil))
105 (setq *ucd-base* (make-array (ash #x110000 (- *page-size-exponent*))
106 :initial-element nil))
107 (with-open-file (*standard-input*
108 (make-pathname :name "UnicodeData"
109 :type "txt"
110 :defaults *unicode-character-database*)
111 :direction :input)
112 (loop for line = (read-line nil nil)
113 while line
114 do (slurp-ucd-line line)))
115 (second-pass)
116 (build-misc-table)
117 (fixup-hangul-syllables)
118 *decompositions*)
120 (defun fixup-hangul-syllables ()
121 ;; "Hangul Syllable Composition, Unicode 5.1 section 3-12"
122 (let* ((sbase #xac00)
123 (lbase #x1100)
124 (vbase #x1161)
125 (tbase #x11a7)
126 (scount 11172)
127 (lcount 19)
128 (vcount 21)
129 (tcount 28)
130 (ncount (* vcount tcount))
131 (table (make-hash-table)))
132 (with-open-file (*standard-input*
133 (make-pathname :name "Jamo" :type "txt"
134 :defaults *unicode-character-database*))
135 (loop for line = (read-line nil nil)
136 while line
137 if (position #\; line)
138 do (add-jamo-information line table)))
139 (dotimes (sindex scount)
140 (let* ((l (+ lbase (floor sindex ncount)))
141 (v (+ vbase (floor (mod sindex ncount) tcount)))
142 (tee (+ tbase (mod sindex tcount)))
143 (name (format nil "HANGUL_SYLLABLE_~A~A~:[~A~;~]"
144 (gethash l table) (gethash v table)
145 (= tee tbase) (gethash tee table))))
146 (setf (gethash (+ sbase sindex) *unicode-names*) name)))))
148 (defun add-jamo-information (line table)
149 (let* ((split (split-string line #\;))
150 (code (parse-integer (first split) :radix 16))
151 (syllable (string-trim '(#\Space)
152 (subseq (second split) 0 (position #\# (second split))))))
153 (setf (gethash code table) syllable)))
155 (defun split-string (line character)
156 (loop for prev-position = 0 then (1+ position)
157 for position = (position character line :start prev-position)
158 collect (subseq line prev-position position)
159 do (unless position
160 (loop-finish))))
162 (defun init-indices (strings)
163 (let ((hash (make-hash-table :test #'equal)))
164 (loop for string in strings
165 for index from 0
166 do (setf (gethash string hash) index))
167 hash))
169 (defparameter *general-categories*
170 (init-indices '("Lu" "Ll" "Lt" "Lm" "Lo" "Cc" "Cf" "Co" "Cs" "Mc"
171 "Me" "Mn" "Nd" "Nl" "No" "Pc" "Pd" "Pe" "Pf" "Pi"
172 "Po" "Ps" "Sc" "Sk" "Sm" "So" "Zl" "Zp" "Zs")))
173 (defparameter *bidi-classes*
174 (init-indices '("AL" "AN" "B" "BN" "CS" "EN" "ES" "ET" "L" "LRE" "LRO"
175 "NSM" "ON" "PDF" "R" "RLE" "RLO" "S" "WS")))
178 (defparameter *block-first* nil)
180 (defun normalize-character-name (name)
181 (when (find #\_ name)
182 (error "Bad name for a character: ~A" name))
183 (unless (or (zerop (length name)) (find #\< name) (find #\> name))
184 (substitute #\_ #\Space name)))
186 ;;; 3400 -- 4DB5 : cjk ideograph extension a ;Lo;0;L;;;;;N;;;;;
187 ;;; AC00 -- D7A3 : hangul syllables ;Lo;0;L;;;;;N;;;;;
188 ;;; D800 -- F8FF : surrogates and private use
189 ;;; 20000 -- 2A6D6 : cjk ideograph extension b ;Lo;0;L;;;;;N;;;;;
190 ;;; F0000 -- FFFFD : private use
191 ;;; 100000 -- 10FFFD: private use
192 (defun encode-ucd-line (line code-point)
193 (destructuring-bind (name general-category canonical-combining-class
194 bidi-class decomposition-type-and-mapping
195 decimal-digit digit numeric bidi-mirrored
196 unicode-1-name iso-10646-comment simple-uppercase
197 simple-lowercase simple-titlecase)
198 line
199 (declare (ignore unicode-1-name iso-10646-comment))
200 (if (and (> (length name) 8)
201 (string= ", First>" name :start2 (- (length name) 8)))
202 (progn
203 (setq *block-first* code-point)
204 nil)
205 (let* ((gc-index (or (gethash general-category *general-categories*)
206 (error "unknown general category ~A"
207 general-category)))
208 (bidi-index (or (gethash bidi-class *bidi-classes*)
209 (error "unknown bidirectional class ~A"
210 bidi-class)))
211 (ccc-index (parse-integer canonical-combining-class))
212 (digit-index (unless (string= "" decimal-digit)
213 (parse-integer decimal-digit)))
214 (upper-index (unless (string= "" simple-uppercase)
215 (parse-integer simple-uppercase :radix 16)))
216 (lower-index (unless (string= "" simple-lowercase)
217 (parse-integer simple-lowercase :radix 16)))
218 (title-index (unless (string= "" simple-titlecase)
219 (parse-integer simple-titlecase :radix 16)))
220 (cl-both-case-p
221 (not (null (or (and (= gc-index 0) lower-index)
222 (and (= gc-index 1) upper-index)))))
223 (misc-index (hash-misc gc-index bidi-index ccc-index
224 decimal-digit digit bidi-mirrored
225 cl-both-case-p)))
226 (declare (ignore digit-index))
227 (when (and (not cl-both-case-p)
228 (< gc-index 2))
229 (format t "~A~%" name))
230 (incf *name-size* (length name))
231 (when (string/= "" decomposition-type-and-mapping)
232 (let ((split (split-string decomposition-type-and-mapping
233 #\Space)))
234 (when (char= #\< (aref (first split) 0))
235 (setf (gethash (pop split) *decomposition-types*) t))
236 (unless (aref *decomposition-base* (cp-high code-point))
237 (setf (aref *decomposition-base* (cp-high code-point))
238 (make-array (ash 1 *page-size-exponent*)
239 :initial-element nil)))
240 (setf (aref (aref *decomposition-base* (cp-high code-point))
241 (cp-low code-point))
242 (mapcar #'(lambda (string)
243 (parse-integer string :radix 16))
244 split))
245 (setq *decomposition-length-max*
246 (max *decomposition-length-max* (length split)))
247 (incf *decompositions* (length split))))
248 (when (and (string/= "" simple-uppercase)
249 (string/= "" simple-lowercase))
250 (push (list code-point upper-index lower-index) *both-cases*))
251 (when (string/= simple-uppercase simple-titlecase)
252 (push (cons code-point title-index) *different-titlecases*))
253 (when (string/= digit numeric)
254 (push (cons code-point numeric) *different-numerics*))
255 (cond
256 ((= gc-index 8)
257 (unless *last-uppercase*
258 (incf *uppercase-transition-count*))
259 (setq *last-uppercase* t))
261 (when *last-uppercase*
262 (incf *uppercase-transition-count*))
263 (setq *last-uppercase* nil)))
264 (when (> ccc-index 255)
265 (error "canonical combining class too large ~A" ccc-index))
266 (let ((result (make-ucd :misc misc-index
267 :transform (or upper-index lower-index 0))))
268 (when (and (> (length name) 7)
269 (string= ", Last>" name :start2 (- (length name) 7)))
270 (let ((page-start (ash (+ *block-first*
271 (ash 1 *page-size-exponent*)
273 (- *page-size-exponent*)))
274 (page-end (ash code-point (- *page-size-exponent*))))
275 (loop for point from *block-first*
276 below (ash page-start *page-size-exponent*)
277 do (setf (aref (aref *ucd-base* (cp-high point))
278 (cp-low point))
279 result))
280 (loop for page from page-start below page-end
281 do (setf (aref *ucd-base* page)
282 (make-array (ash 1 *page-size-exponent*)
283 :initial-element result)))
284 (loop for point from (ash page-end *page-size-exponent*)
285 below code-point
286 do (setf (aref (aref *ucd-base* (cp-high point))
287 (cp-low point))
288 result))))
289 (values result (normalize-character-name name)))))))
291 (defun slurp-ucd-line (line)
292 (let* ((split-line (split-string line #\;))
293 (code-point (parse-integer (first split-line) :radix 16))
294 (code-high (ash code-point (- *page-size-exponent*)))
295 (code-low (ldb (byte *page-size-exponent* 0) code-point)))
296 (unless (aref *ucd-base* code-high)
297 (setf (aref *ucd-base* code-high)
298 (make-array (ash 1 *page-size-exponent*)
299 :initial-element nil)))
300 (multiple-value-bind (encoding name)
301 (encode-ucd-line (cdr split-line) code-point)
302 (setf (aref (aref *ucd-base* code-high) code-low) encoding
303 (gethash code-point *unicode-names*) name))))
305 (defun second-pass ()
306 (loop for i from 0 below (length *ucd-base*)
307 when (aref *ucd-base* i)
308 do (loop for j from 0 below (length (aref *ucd-base* i))
309 for result = (aref (aref *ucd-base* i) j)
310 when result
311 when (let* ((transform-point (ucd-transform result))
312 (transform-high (ash transform-point
313 (- *page-size-exponent*)))
314 (transform-low (ldb (byte *page-size-exponent* 0)
315 transform-point)))
316 (and (plusp transform-point)
317 (/= (ucd-transform
318 (aref (aref *ucd-base* transform-high)
319 transform-low))
320 (+ (ash i *page-size-exponent*) j))))
321 do (destructuring-bind (gc-index bidi-index ccc-index
322 decimal-digit digit bidi-mirrored
323 cl-both-case-p)
324 (aref *misc-table* (ucd-misc result))
325 (declare (ignore cl-both-case-p))
326 (format t "~A~%" (+ (ash i *page-size-exponent*) j))
327 (setf (ucd-misc result)
328 (hash-misc gc-index bidi-index ccc-index
329 decimal-digit digit bidi-mirrored
330 nil))))))
332 (defun write-3-byte (triplet stream)
333 (write-byte (ldb (byte 8 0) triplet) stream)
334 (write-byte (ldb (byte 8 8) triplet) stream)
335 (write-byte (ldb (byte 8 16) triplet) stream))
337 (defun digit-to-byte (digit)
338 (if (string= "" digit)
340 (parse-integer digit)))
342 (defun output ()
343 (let ((hash (make-hash-table :test #'equalp))
344 (index 0))
345 (loop for page across *ucd-base*
346 do (when page
347 (unless (gethash page hash)
348 (setf (gethash page hash)
349 (incf index)))))
350 (let ((array (make-array (1+ index))))
351 (maphash #'(lambda (key value)
352 (setf (aref array value) key))
353 hash)
354 (setf (aref array 0)
355 (make-array (ash 1 *page-size-exponent*) :initial-element nil))
356 (with-open-file (stream (make-pathname :name "ucd"
357 :type "dat"
358 :defaults *output-directory*)
359 :direction :output
360 :element-type '(unsigned-byte 8)
361 :if-exists :supersede
362 :if-does-not-exist :create)
363 (loop for (gc-index bidi-index ccc-index decimal-digit digit
364 bidi-mirrored)
365 across *misc-table*
366 do (write-byte gc-index stream)
367 do (write-byte bidi-index stream)
368 do (write-byte ccc-index stream)
369 do (write-byte (digit-to-byte decimal-digit) stream)
370 do (write-byte (digit-to-byte digit) stream)
371 do (write-byte (if (string= "N" bidi-mirrored) 0 1) stream)
372 do (write-byte 0 stream)
373 do (write-byte 0 stream))
374 (loop for page across *ucd-base*
375 do (write-byte (if page (gethash page hash) 0) stream))
376 (loop for page across array
377 do (loop for entry across page
378 do (write-byte (if entry
379 (aref *misc-mapping* (ucd-misc entry))
380 255)
381 stream)
382 do (write-3-byte (if entry (ucd-transform entry) 0)
383 stream))))))
384 (with-open-file (f (make-pathname :name "ucd-names" :type "lisp-expr"
385 :defaults *output-directory*)
386 :direction :output
387 :if-exists :supersede
388 :if-does-not-exist :create)
389 (with-standard-io-syntax
390 (write-string ";;; Do not edit by hand: generated by ucd.lisp" f)
391 (maphash (lambda (code name)
392 (when name
393 (print code f)
394 (prin1 name f)))
395 *unicode-names*))
396 (setf *unicode-names* nil))
397 (with-open-file (*standard-output*
398 (make-pathname :name "numerics"
399 :type "lisp-expr"
400 :defaults *output-directory*)
401 :direction :output
402 :if-exists :supersede
403 :if-does-not-exist :create)
404 (with-standard-io-syntax
405 (let ((*print-pretty* t))
406 (prin1 (mapcar #'(lambda (x) (cons (car x) (read-from-string (cdr x))))
407 *different-numerics*)))))
408 (with-open-file (*standard-output*
409 (make-pathname :name "titlecases"
410 :type "lisp-expr"
411 :defaults *output-directory*)
412 :direction :output
413 :if-exists :supersede
414 :if-does-not-exist :create)
415 (with-standard-io-syntax
416 (let ((*print-pretty* t))
417 (prin1 *different-titlecases*))))
418 (with-open-file (*standard-output*
419 (make-pathname :name "misc"
420 :type "lisp-expr"
421 :defaults *output-directory*)
422 :direction :output
423 :if-exists :supersede
424 :if-does-not-exist :create)
425 (with-standard-io-syntax
426 (let ((*print-pretty* t))
427 (prin1 `(:length ,(length *misc-table*)
428 :uppercase ,(loop for (gc-index) across *misc-table*
429 for i from 0
430 when (= gc-index 0)
431 collect i)
432 :lowercase ,(loop for (gc-index) across *misc-table*
433 for i from 0
434 when (= gc-index 1)
435 collect i)
436 :titlecase ,(loop for (gc-index) across *misc-table*
437 for i from 0
438 when (= gc-index 2)
439 collect i))))))
440 (values))
442 ;;; Use of the generated files
444 (defparameter *compiled-ucd* nil)
446 (defun read-compiled-ucd ()
447 (with-open-file (stream (make-pathname :name "ucd"
448 :type "dat"
449 :defaults *output-directory*)
450 :direction :input
451 :element-type '(unsigned-byte 8))
452 (let ((length (file-length stream)))
453 (setq *compiled-ucd*
454 (make-array length :element-type '(unsigned-byte 8)))
455 (read-sequence *compiled-ucd* stream)))
456 (values))
458 ;;; The stuff below is dependent on misc.lisp-expr being
459 ;;; (:LENGTH 215 :UPPERCASE (0 2) :LOWERCASE (1 3) :TITLECASE (4)).
461 ;;; There are two entries for UPPERCASE and LOWERCASE because some
462 ;;; characters have case (by Unicode standards) but are not
463 ;;; transformable character-by-character in a locale-independent way
464 ;;; (as CL requires for its standard operators).
466 ;;; for more details on these debugging functions, see the description
467 ;;; of the character database format in src/code/target-char.lisp
469 (defparameter *length* 215)
471 (defun cp-index (cp)
472 (let* ((cp-high (cp-high cp))
473 (page (aref *compiled-ucd* (+ (* 8 *length*) cp-high))))
474 (+ (* 8 *length*)
475 (ash #x110000 (- *page-size-exponent*))
476 (* (ash 4 *page-size-exponent*) page)
477 (* 4 (cp-low cp)))))
479 (defun cp-value-0 (cp)
480 (aref *compiled-ucd* (cp-index cp)))
482 (defun cp-value-1 (cp)
483 (let ((index (cp-index cp)))
484 (dpb (aref *compiled-ucd* (+ index 3)) (byte 8 16)
485 (dpb (aref *compiled-ucd* (+ index 2)) (byte 8 8)
486 (aref *compiled-ucd* (1+ index))))))
488 (defun cp-general-category (cp)
489 (aref *compiled-ucd* (* 8 (cp-value-0 cp))))
491 (defun cp-decimal-digit (cp)
492 (let ((decimal-digit (aref *compiled-ucd* (+ 3 (* 8 (cp-value-0 cp))))))
493 (and (< decimal-digit 10)
494 decimal-digit)))
496 (defun cp-alpha-char-p (cp)
497 (< (cp-general-category cp) 5))
499 (defun cp-alphanumericp (cp)
500 (let ((gc (cp-general-category cp)))
501 (or (< gc 5)
502 (= gc 12))))
504 (defun cp-digit-char-p (cp &optional (radix 10))
505 (let ((number (or (cp-decimal-digit cp)
506 (and (<= 65 cp 90)
507 (- cp 55))
508 (and (<= 97 cp 122)
509 (- cp 87)))))
510 (when (and number (< number radix))
511 number)))
513 (defun cp-graphic-char-p (cp)
514 (or (<= 32 cp 127)
515 (<= 160 cp)))
517 (defun cp-char-upcase (cp)
518 (if (= (cp-value-0 cp) 1)
519 (cp-value-1 cp)
520 cp))
522 (defun cp-char-downcase (cp)
523 (if (= (cp-value-0 cp) 0)
524 (cp-value-1 cp)
525 cp))
527 (defun cp-upper-case-p (cp)
528 (= (cp-value-0 cp) 0))
530 (defun cp-lower-case-p (cp)
531 (= (cp-value-0 cp) 1))
533 (defun cp-both-case-p (cp)
534 (< (cp-value-0 cp) 2))