timevar: import from Bison.
[gnulib.git] / lib / uniname / gen-uninames.lisp
blob937b50ef5163cebc8e9498026829980c3f919222
1 #!/usr/local/bin/clisp -C
3 ;;; Creation of gnulib's uninames.h from the UnicodeData.txt table.
4 ;;; Bruno Haible 2000-12-28
6 (defparameter add-comments nil)
8 (defstruct unicode-char
9 (index nil :type integer)
10 (name nil :type string)
11 word-indices
12 word-indices-index
15 (defstruct range
16 (index nil :type integer)
17 (start-code nil :type integer)
18 (end-code nil :type integer)
21 (defstruct word-list
22 (hashed nil :type hash-table)
23 (sorted nil :type list)
24 size ; number of characters total
25 length ; number of words
28 (defun main (inputfile outputfile aliasfile)
29 (declare (type string inputfile outputfile aliasfile))
30 #+UNICODE (setq *default-file-encoding* charset:utf-8)
31 (let ((all-chars '())
32 (all-chars-hashed (make-hash-table :test #'equal))
33 (all-aliases '())
34 all-chars-and-aliases
35 (all-ranges '())
36 (name-index 0)
37 range)
38 ;; Read all characters and names from the input file.
39 (with-open-file (istream inputfile :direction :input)
40 (loop
41 (let ((line (read-line istream nil nil)))
42 (unless line (return))
43 (let* ((i1 (position #\; line))
44 (i2 (position #\; line :start (1+ i1)))
45 (code-string (subseq line 0 i1))
46 (code (parse-integer code-string :radix 16))
47 (name-string (subseq line (1+ i1) i2)))
48 ; Ignore characters whose name starts with "<".
49 (unless (eql (char name-string 0) #\<)
50 ; Also ignore Hangul syllables; they are treated specially.
51 (unless (<= #xAC00 code #xD7A3)
52 ; Also ignore CJK compatibility ideographs; they are treated
53 ; specially as well.
54 (unless (or (<= #xF900 code #xFA2D) (<= #xFA30 code #xFA6A)
55 (<= #xFA70 code #xFAD9) (<= #x2F800 code #x2FA1D))
56 ;; Also ignore variationselectors; they are treated
57 ;; specially as well.
58 (unless (or (<= #xFE00 code #xFE0F) (<= #xE0100 code #xE01EF))
59 (push (make-unicode-char :index name-index
60 :name name-string)
61 all-chars)
62 (setf (gethash code all-chars-hashed) (car all-chars))
63 ;; Update the contiguous range, or start a new range.
64 (if (and range (= (1+ (range-end-code range)) code))
65 (setf (range-end-code range) code)
66 (progn
67 (when range
68 (push range all-ranges))
69 (setq range (make-range :index name-index
70 :start-code code
71 :end-code code))))
72 (incf name-index)
73 (setq last-code code)
74 ) ) ) )
75 ) ) ) )
76 (setq all-chars (nreverse all-chars))
77 (if range
78 (push range all-ranges))
79 (setq all-ranges (nreverse all-ranges))
80 (when aliasfile
81 ;; Read all characters and names from the alias file.
82 (with-open-file (istream aliasfile :direction :input)
83 (loop
84 (let ((line (read-line istream nil nil)))
85 (unless line (return))
86 (let* ((i1 (position #\; line))
87 (i2 (position #\; line :start (1+ i1)))
88 (code-string (subseq line 0 i1))
89 (code (parse-integer code-string :radix 16))
90 (name-string (subseq line (1+ i1) i2))
91 (uc (gethash code all-chars-hashed)))
92 (when uc
93 (push (make-unicode-char :index (unicode-char-index uc)
94 :name name-string)
95 all-aliases)
96 ) ) ) ) ) )
97 (setq all-aliases (nreverse all-aliases)
98 all-chars-and-aliases (append all-chars all-aliases))
99 ;; Split into words.
100 (let ((words-by-length (make-array 0 :adjustable t)))
101 (dolist (name (list* "HANGUL SYLLABLE" "CJK COMPATIBILITY" "VARIATION"
102 (mapcar #'unicode-char-name all-chars-and-aliases)))
103 (let ((i1 0))
104 (loop
105 (when (>= i1 (length name)) (return))
106 (let ((i2 (or (position #\Space name :start i1) (length name))))
107 (let* ((word (subseq name i1 i2))
108 (len (length word)))
109 (when (>= len (length words-by-length))
110 (adjust-array words-by-length (1+ len))
112 (unless (aref words-by-length len)
113 (setf (aref words-by-length len)
114 (make-word-list
115 :hashed (make-hash-table :test #'equal)
116 :sorted '()
117 ) ) )
118 (let ((word-list (aref words-by-length len)))
119 (unless (gethash word (word-list-hashed word-list))
120 (setf (gethash word (word-list-hashed word-list)) t)
121 (push word (word-list-sorted word-list))
124 (setq i1 (1+ i2))
125 ) ) ) )
126 ;; Sort the word lists.
127 (dotimes (len (length words-by-length))
128 (unless (aref words-by-length len)
129 (setf (aref words-by-length len)
130 (make-word-list
131 :hashed (make-hash-table :test #'equal)
132 :sorted '()
133 ) ) )
134 (let ((word-list (aref words-by-length len)))
135 (setf (word-list-sorted word-list)
136 (sort (word-list-sorted word-list) #'string<)
138 (setf (word-list-size word-list)
139 (reduce #'+ (mapcar #'length (word-list-sorted word-list)))
141 (setf (word-list-length word-list)
142 (length (word-list-sorted word-list))
143 ) ) )
144 ;; Output the tables.
145 (with-open-file (ostream outputfile :direction :output
146 #+UNICODE :external-format #+UNICODE charset:ascii)
147 (format ostream "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */~%")
148 (format ostream "/*~%")
149 (format ostream " * ~A~%" (file-namestring outputfile))
150 (format ostream " *~%")
151 (format ostream " * Unicode character name table.~%")
152 (format ostream " * Generated automatically by the gen-uninames utility.~%")
153 (format ostream " */~%")
154 (format ostream "~%")
155 (format ostream "static const char unicode_name_words[~D] = {~%"
156 (let ((sum 0))
157 (dotimes (len (length words-by-length))
158 (let ((word-list (aref words-by-length len)))
159 (incf sum (word-list-size word-list))
163 (dotimes (len (length words-by-length))
164 (let ((word-list (aref words-by-length len)))
165 (dolist (word (word-list-sorted word-list))
166 (format ostream " ~{ '~C',~}~%" (coerce word 'list))
167 ) ) )
168 (format ostream "};~%")
169 (format ostream "#define UNICODE_CHARNAME_NUM_WORDS ~D~%"
170 (let ((sum 0))
171 (dotimes (len (length words-by-length))
172 (let ((word-list (aref words-by-length len)))
173 (incf sum (word-list-length word-list))
177 #| ; Redundant data
178 (format ostream "static const uint16_t unicode_name_word_offsets[~D] = {~%"
179 (let ((sum 0))
180 (dotimes (len (length words-by-length))
181 (let ((word-list (aref words-by-length len)))
182 (incf sum (word-list-length word-list))
186 (dotimes (len (length words-by-length))
187 (let ((word-list (aref words-by-length len)))
188 (when (word-list-sorted word-list)
189 (format ostream " ")
190 (do ((l (word-list-sorted word-list) (cdr l))
191 (offset 0 (+ offset (length (car l)))))
192 ((endp l))
193 (format ostream "~<~% ~0,79:; ~D,~>" offset)
195 (format ostream "~%")
196 ) ) )
197 (format ostream "};~%")
199 (format ostream "static const struct { uint32_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[~D] = {~%"
200 (1+ (length words-by-length))
202 (let ((extra-offset 0)
203 (ind-offset 0))
204 (dotimes (len (length words-by-length))
205 (let ((word-list (aref words-by-length len)))
206 (format ostream " { ~D, ~D },~%" extra-offset ind-offset)
207 (incf extra-offset (word-list-size word-list))
208 (incf ind-offset (word-list-length word-list))
210 (format ostream " { ~D, ~D }~%" extra-offset ind-offset)
212 (format ostream "};~%")
213 (let ((ind-offset 0))
214 (dotimes (len (length words-by-length))
215 (let ((word-list (aref words-by-length len)))
216 (dolist (word (word-list-sorted word-list))
217 (setf (gethash word (word-list-hashed word-list)) ind-offset)
218 (incf ind-offset)
219 ) ) ) )
220 (dolist (word '("HANGUL" "SYLLABLE" "CJK" "COMPATIBILITY" "VARIATION"))
221 (format ostream "#define UNICODE_CHARNAME_WORD_~A ~D~%" word
222 (gethash word (word-list-hashed (aref words-by-length (length word))))
224 ;; Compute the word-indices for every unicode-char.
225 (dolist (uc all-chars-and-aliases)
226 (let ((name (unicode-char-name uc))
227 (indices '()))
228 (let ((i1 0))
229 (loop
230 (when (>= i1 (length name)) (return))
231 (let ((i2 (or (position #\Space name :start i1) (length name))))
232 (let* ((word (subseq name i1 i2))
233 (len (length word)))
234 (push (gethash word (word-list-hashed (aref words-by-length len)))
235 indices
238 (setq i1 (1+ i2))
239 ) ) )
240 (setf (unicode-char-word-indices uc)
241 (coerce (nreverse indices) 'vector)
244 ;; Sort the list of unicode-chars by word-indices.
245 (setq all-chars-and-aliases
246 (sort all-chars-and-aliases
247 (lambda (vec1 vec2)
248 (let ((len1 (length vec1))
249 (len2 (length vec2)))
250 (do ((i 0 (1+ i)))
251 (nil)
252 (if (< i len2)
253 (if (< i len1)
254 (cond ((< (aref vec1 i) (aref vec2 i)) (return t))
255 ((> (aref vec1 i) (aref vec2 i)) (return nil))
257 (return t)
259 (return nil)
260 ) ) ) )
261 :key #'unicode-char-word-indices
263 ;; Output the word-indices.
264 (format ostream "static const uint16_t unicode_names[~D] = {~%"
265 (reduce #'+ (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars-and-aliases))
267 (let ((i 0))
268 (dolist (uc all-chars-and-aliases)
269 (format ostream " ~{ ~D,~}"
270 (maplist (lambda (r) (+ (* 2 (car r)) (if (cdr r) 1 0)))
271 (coerce (unicode-char-word-indices uc) 'list)
274 (when add-comments
275 (format ostream "~40T/* ~A */" (unicode-char-name uc))
277 (format ostream "~%")
278 (setf (unicode-char-word-indices-index uc) i)
279 (incf i (length (unicode-char-word-indices uc)))
281 (format ostream "};~%")
282 (format ostream "static const struct { uint16_t index; uint32_t name:24; }~%")
283 (format ostream "#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 7)~%__attribute__((__packed__))~%#endif~%")
284 (format ostream "unicode_name_to_index[~D] = {~%"
285 (length all-chars-and-aliases)
287 (dolist (uc all-chars-and-aliases)
288 (format ostream " { 0x~4,'0X, ~D },"
289 (unicode-char-index uc)
290 (unicode-char-word-indices-index uc)
292 (when add-comments
293 (format ostream "~21T/* ~A */" (unicode-char-name uc))
295 (format ostream "~%")
297 (format ostream "};~%")
298 (format ostream "static const struct { uint16_t index; uint32_t name:24; }~%")
299 (format ostream "#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 7)~%__attribute__((__packed__))~%#endif~%")
300 (format ostream "unicode_index_to_name[~D] = {~%"
301 (length all-chars)
303 (dolist (uc (sort (copy-list all-chars) #'< :key #'unicode-char-index))
304 (format ostream " { 0x~4,'0X, ~D },"
305 (unicode-char-index uc)
306 (unicode-char-word-indices-index uc)
308 (when add-comments
309 (format ostream "~21T/* ~A */" (unicode-char-name uc))
311 (format ostream "~%")
313 (format ostream "};~%")
314 (format ostream "#define UNICODE_CHARNAME_MAX_LENGTH ~D~%"
315 (reduce #'max (mapcar (lambda (uc) (length (unicode-char-name uc))) all-chars-and-aliases))
317 (format ostream "#define UNICODE_CHARNAME_MAX_WORDS ~D~%"
318 (reduce #'max (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars-and-aliases))
320 (format ostream "static const struct { uint16_t index; uint32_t gap; uint16_t length; } unicode_ranges[~D] = {~%"
321 (length all-ranges))
322 (dolist (range all-ranges)
323 (format ostream " { ~D, ~D, ~D },~%"
324 (range-index range)
325 (- (range-start-code range) (range-index range))
326 (1+ (- (range-end-code range) (range-start-code range))))
328 (format ostream "};~%")
330 ) ) )
332 (main (first *args*) (second *args*) (third *args*))