1 #!/usr
/local
/bin
/clisp -C
3 ;;; Creation of gnulib's uninames.h from the UnicodeData.txt table.
4 ;;; Bruno Haible 2000-12-28
6 (defparameter add-comments nil
)
8 (defstruct unicode-char
9 (index nil
:type integer
)
10 (name nil
:type string
)
16 (index nil
:type integer
)
17 (start-code nil
:type integer
)
18 (end-code nil
:type integer
)
22 (hashed nil
:type hash-table
)
23 (sorted nil
:type list
)
24 size
; number of characters total
25 length
; number of words
28 (defun main (inputfile outputfile aliasfile
)
29 (declare (type string inputfile outputfile aliasfile
))
30 #+UNICODE
(setq *default-file-encoding
* charset
:utf-8
)
32 (all-chars-hashed (make-hash-table :test
#'equal
))
38 ;; Read all characters and names from the input file.
39 (with-open-file (istream inputfile
:direction
:input
)
41 (let ((line (read-line istream nil nil
)))
42 (unless line
(return))
43 (let* ((i1 (position #\
; line))
44 (i2 (position #\
; line :start (1+ i1)))
45 (code-string (subseq line
0 i1
))
46 (code (parse-integer code-string
:radix
16))
47 (name-string (subseq line
(1+ i1
) i2
)))
48 ; Ignore characters whose name starts with "<".
49 (unless (eql (char name-string
0) #\
<)
50 ; Also ignore Hangul syllables; they are treated specially.
51 (unless (<= #xAC00 code
#xD7A3
)
52 ; Also ignore CJK compatibility ideographs; they are treated
54 (unless (or (<= #xF900 code
#xFA2D
) (<= #xFA30 code
#xFA6A
)
55 (<= #xFA70 code
#xFAD9
) (<= #x2F800 code
#x2FA1D
))
56 ;; Also ignore variationselectors; they are treated
58 (unless (or (<= #xFE00 code
#xFE0F
) (<= #xE0100 code
#xE01EF
))
59 (push (make-unicode-char :index name-index
62 (setf (gethash code all-chars-hashed
) (car all-chars
))
63 ;; Update the contiguous range, or start a new range.
64 (if (and range
(= (1+ (range-end-code range
)) code
))
65 (setf (range-end-code range
) code
)
68 (push range all-ranges
))
69 (setq range
(make-range :index name-index
76 (setq all-chars
(nreverse all-chars
))
78 (push range all-ranges
))
79 (setq all-ranges
(nreverse all-ranges
))
81 ;; Read all characters and names from the alias file.
82 (with-open-file (istream aliasfile
:direction
:input
)
84 (let ((line (read-line istream nil nil
)))
85 (unless line
(return))
86 (let* ((i1 (position #\
; line))
87 (i2 (position #\
; line :start (1+ i1)))
88 (code-string (subseq line
0 i1
))
89 (code (parse-integer code-string
:radix
16))
90 (name-string (subseq line
(1+ i1
) i2
))
91 (uc (gethash code all-chars-hashed
)))
93 (push (make-unicode-char :index
(unicode-char-index uc
)
97 (setq all-aliases
(nreverse all-aliases
)
98 all-chars-and-aliases
(append all-chars all-aliases
))
100 (let ((words-by-length (make-array 0 :adjustable t
)))
101 (dolist (name (list* "HANGUL SYLLABLE" "CJK COMPATIBILITY" "VARIATION"
102 (mapcar #'unicode-char-name all-chars-and-aliases
)))
105 (when (>= i1
(length name
)) (return))
106 (let ((i2 (or (position #\Space name
:start i1
) (length name
))))
107 (let* ((word (subseq name i1 i2
))
109 (when (>= len
(length words-by-length
))
110 (adjust-array words-by-length
(1+ len
))
112 (unless (aref words-by-length len
)
113 (setf (aref words-by-length len
)
115 :hashed
(make-hash-table :test
#'equal
)
118 (let ((word-list (aref words-by-length len
)))
119 (unless (gethash word
(word-list-hashed word-list
))
120 (setf (gethash word
(word-list-hashed word-list
)) t
)
121 (push word
(word-list-sorted word-list
))
126 ;; Sort the word lists.
127 (dotimes (len (length words-by-length
))
128 (unless (aref words-by-length len
)
129 (setf (aref words-by-length len
)
131 :hashed
(make-hash-table :test
#'equal
)
134 (let ((word-list (aref words-by-length len
)))
135 (setf (word-list-sorted word-list
)
136 (sort (word-list-sorted word-list
) #'string
<)
138 (setf (word-list-size word-list
)
139 (reduce #'+ (mapcar #'length
(word-list-sorted word-list
)))
141 (setf (word-list-length word-list
)
142 (length (word-list-sorted word-list
))
144 ;; Output the tables.
145 (with-open-file (ostream outputfile
:direction
:output
146 #+UNICODE
:external-format
#+UNICODE charset
:ascii
)
147 (format ostream
"/* DO NOT EDIT! GENERATED AUTOMATICALLY! */~%")
148 (format ostream
"/*~%")
149 (format ostream
" * ~A~%" (file-namestring outputfile
))
150 (format ostream
" *~%")
151 (format ostream
" * Unicode character name table.~%")
152 (format ostream
" * Generated automatically by the gen-uninames utility.~%")
153 (format ostream
" */~%")
154 (format ostream
"~%")
155 (format ostream
"static const char unicode_name_words[~D] = {~%"
157 (dotimes (len (length words-by-length
))
158 (let ((word-list (aref words-by-length len
)))
159 (incf sum
(word-list-size word-list
))
163 (dotimes (len (length words-by-length
))
164 (let ((word-list (aref words-by-length len
)))
165 (dolist (word (word-list-sorted word-list
))
166 (format ostream
" ~{ '~C',~}~%" (coerce word
'list
))
168 (format ostream
"};~%")
169 (format ostream
"#define UNICODE_CHARNAME_NUM_WORDS ~D~%"
171 (dotimes (len (length words-by-length
))
172 (let ((word-list (aref words-by-length len
)))
173 (incf sum
(word-list-length word-list
))
178 (format ostream
"static const uint16_t unicode_name_word_offsets[~D] = {~%"
180 (dotimes (len (length words-by-length
))
181 (let ((word-list (aref words-by-length len
)))
182 (incf sum
(word-list-length word-list
))
186 (dotimes (len (length words-by-length
))
187 (let ((word-list (aref words-by-length len
)))
188 (when (word-list-sorted word-list
)
190 (do ((l (word-list-sorted word-list
) (cdr l
))
191 (offset 0 (+ offset
(length (car l
)))))
193 (format ostream
"~<~% ~0,79:; ~D,~>" offset
)
195 (format ostream
"~%")
197 (format ostream
"};~%")
199 (format ostream
"static const struct { uint32_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[~D] = {~%"
200 (1+ (length words-by-length
))
202 (let ((extra-offset 0)
204 (dotimes (len (length words-by-length
))
205 (let ((word-list (aref words-by-length len
)))
206 (format ostream
" { ~D, ~D },~%" extra-offset ind-offset
)
207 (incf extra-offset
(word-list-size word-list
))
208 (incf ind-offset
(word-list-length word-list
))
210 (format ostream
" { ~D, ~D }~%" extra-offset ind-offset
)
212 (format ostream
"};~%")
213 (let ((ind-offset 0))
214 (dotimes (len (length words-by-length
))
215 (let ((word-list (aref words-by-length len
)))
216 (dolist (word (word-list-sorted word-list
))
217 (setf (gethash word
(word-list-hashed word-list
)) ind-offset
)
220 (dolist (word '("HANGUL" "SYLLABLE" "CJK" "COMPATIBILITY" "VARIATION"))
221 (format ostream
"#define UNICODE_CHARNAME_WORD_~A ~D~%" word
222 (gethash word
(word-list-hashed (aref words-by-length
(length word
))))
224 ;; Compute the word-indices for every unicode-char.
225 (dolist (uc all-chars-and-aliases
)
226 (let ((name (unicode-char-name uc
))
230 (when (>= i1
(length name
)) (return))
231 (let ((i2 (or (position #\Space name
:start i1
) (length name
))))
232 (let* ((word (subseq name i1 i2
))
234 (push (gethash word
(word-list-hashed (aref words-by-length len
)))
240 (setf (unicode-char-word-indices uc
)
241 (coerce (nreverse indices
) 'vector
)
244 ;; Sort the list of unicode-chars by word-indices.
245 (setq all-chars-and-aliases
246 (sort all-chars-and-aliases
248 (let ((len1 (length vec1
))
249 (len2 (length vec2
)))
254 (cond ((< (aref vec1 i
) (aref vec2 i
)) (return t
))
255 ((> (aref vec1 i
) (aref vec2 i
)) (return nil
))
261 :key
#'unicode-char-word-indices
263 ;; Output the word-indices.
264 (format ostream
"static const uint16_t unicode_names[~D] = {~%"
265 (reduce #'+ (mapcar (lambda (uc) (length (unicode-char-word-indices uc
))) all-chars-and-aliases
))
268 (dolist (uc all-chars-and-aliases
)
269 (format ostream
" ~{ ~D,~}"
270 (maplist (lambda (r) (+ (* 2 (car r
)) (if (cdr r
) 1 0)))
271 (coerce (unicode-char-word-indices uc
) 'list
)
275 (format ostream
"~40T/* ~A */" (unicode-char-name uc
))
277 (format ostream
"~%")
278 (setf (unicode-char-word-indices-index uc
) i
)
279 (incf i
(length (unicode-char-word-indices uc
)))
281 (format ostream
"};~%")
282 (format ostream
"static const struct { uint16_t index; uint32_t name:24; }~%")
283 (format ostream
"#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 7)~%__attribute__((__packed__))~%#endif~%")
284 (format ostream
"unicode_name_to_index[~D] = {~%"
285 (length all-chars-and-aliases
)
287 (dolist (uc all-chars-and-aliases
)
288 (format ostream
" { 0x~4,'0X, ~D },"
289 (unicode-char-index uc
)
290 (unicode-char-word-indices-index uc
)
293 (format ostream
"~21T/* ~A */" (unicode-char-name uc
))
295 (format ostream
"~%")
297 (format ostream
"};~%")
298 (format ostream
"static const struct { uint16_t index; uint32_t name:24; }~%")
299 (format ostream
"#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 7)~%__attribute__((__packed__))~%#endif~%")
300 (format ostream
"unicode_index_to_name[~D] = {~%"
303 (dolist (uc (sort (copy-list all-chars
) #'< :key
#'unicode-char-index
))
304 (format ostream
" { 0x~4,'0X, ~D },"
305 (unicode-char-index uc
)
306 (unicode-char-word-indices-index uc
)
309 (format ostream
"~21T/* ~A */" (unicode-char-name uc
))
311 (format ostream
"~%")
313 (format ostream
"};~%")
314 (format ostream
"#define UNICODE_CHARNAME_MAX_LENGTH ~D~%"
315 (reduce #'max
(mapcar (lambda (uc) (length (unicode-char-name uc
))) all-chars-and-aliases
))
317 (format ostream
"#define UNICODE_CHARNAME_MAX_WORDS ~D~%"
318 (reduce #'max
(mapcar (lambda (uc) (length (unicode-char-word-indices uc
))) all-chars-and-aliases
))
320 (format ostream
"static const struct { uint16_t index; uint32_t gap; uint16_t length; } unicode_ranges[~D] = {~%"
322 (dolist (range all-ranges
)
323 (format ostream
" { ~D, ~D, ~D },~%"
325 (- (range-start-code range
) (range-index range
))
326 (1+ (- (range-end-code range
) (range-start-code range
))))
328 (format ostream
"};~%")
332 (main (first *args
*) (second *args
*) (third *args
*))