utf-8: faster decoding
[cl-utf8.git] / decode-sequence.lisp
bloba0a76a045909351e58feb3b5f8571fbf44ecebe9
1 (in-package "UTF-8")
3 (defmacro while (test &body body)
4 `(do ()
5 ((not ,test))
6 ,@body))
8 (defmacro code-point-1-p (a)
9 `(= (ldb (byte 1 7) ,a) #b0))
11 (defmacro code-point-2-p (a b)
12 `(and (= (ldb (byte 3 5) ,a) #b110)
13 (= (ldb (byte 2 6) ,b) #b10)))
15 (defmacro code-point-3-p (a b c)
16 `(and (= (ldb (byte 4 4) ,a) #b1110)
17 (= (ldb (byte 2 6) ,b) #b10)
18 (= (ldb (byte 2 6) ,c) #b10)))
20 (defmacro code-point-4-p (a b c d)
21 `(and (= (ldb (byte 5 3) ,a) #b11110)
22 (= (ldb (byte 2 6) ,b) #b10)
23 (= (ldb (byte 2 6) ,c) #b10)
24 (= (ldb (byte 2 6) ,d) #b10)))
26 (defmacro make-code-point-1 (a)
29 (defmacro make-code-point-2 (a b)
30 `(logior (ash (ldb (byte 5 0) ,a) 6)
31 (ldb (byte 6 0) ,b)))
33 (defmacro make-code-point-3 (a b c)
34 `(logior (ash (ldb (byte 4 0) ,a) (+ 6 6))
35 (ash (ldb (byte 6 0) ,b) 6)
36 (ldb (byte 6 0) ,c)))
38 (defmacro make-code-point-4 (a b c d)
39 `(logior (ash (ldb (byte 3 0) ,a) (+ 6 6 6))
40 (ash (ldb (byte 6 0) ,b) (+ 6 6))
41 (ash (ldb (byte 6 0) ,c) 6)
42 (ldb (byte 6 0) ,d)))
44 (define-condition decode-error (error)
45 ((seq :reader decode-error-seq :initarg :seq :initform (error ":seq required") :type sequence)
46 (pos :reader decode-error-pos :initarg :pos :initform (error ":pos required") :type 'unsigned-byte))
47 (:report (lambda (condition stream)
48 (format stream "UTF-8 decode error at position ~D" (decode-error-pos condition)))))
50 (defgeneric decode-sequence (seq &key start end))
52 (defmethod decode-sequence ((seq vector) &key (start 0) end)
53 (unless end
54 (setf end (length seq)))
55 (assert (<= 0 start end (length seq)))
56 (loop for pos from start to (1- end)
57 do (assert (<= #x00 (aref seq pos) #xFF)))
59 (let ((code-point-list ())
60 (pos start))
61 (while (< (+ pos 3) end)
62 (multiple-value-bind (code-point inc)
63 (decode-code-point-4 (aref seq pos) (aref seq (1+ pos)) (aref seq (+ pos 2)) (aref seq (+ pos 3)))
64 (unless code-point
65 (restart-case
66 (error 'decode-error :seq seq :pos pos)
67 (use-U+FFFD ()
68 :report "Substitute with U+FFFD and continue"
69 (setf code-point #xFFFD)
70 (setf inc (decode-error-width seq pos end)))))
71 (push code-point code-point-list)
72 (incf pos inc)))
74 (while (< (+ pos 2) end)
75 (multiple-value-bind (code-point inc)
76 (decode-code-point-3 (aref seq pos) (aref seq (1+ pos)) (aref seq (+ pos 2)))
77 (unless code-point
78 (restart-case
79 (error 'decode-error :seq seq :pos pos)
80 (use-U+FFFD ()
81 :report "Substitute with U+FFFD and continue"
82 (setf code-point #xFFFD)
83 (setf inc (decode-error-width seq pos end)))))
84 (push code-point code-point-list)
85 (incf pos inc)))
87 (while (< (1+ pos) end)
88 (multiple-value-bind (code-point inc)
89 (decode-code-point-2 (aref seq pos) (aref seq (1+ pos)))
90 (unless code-point
91 (restart-case
92 (error 'decode-error :seq seq :pos pos)
93 (use-U+FFFD ()
94 :report "Substitute with U+FFFD and continue"
95 (setf code-point #xFFFD)
96 (setf inc (decode-error-width seq pos end)))))
97 (push code-point code-point-list)
98 (incf pos inc)))
100 (while (< pos end)
101 (multiple-value-bind (code-point inc)
102 (decode-code-point-1 (aref seq pos))
103 (unless code-point
104 (restart-case
105 (error 'decode-error :seq seq :pos pos)
106 (use-U+FFFD ()
107 :report "Substitute with U+FFFD and continue"
108 (setf code-point #xFFFD)
109 (setf inc (decode-error-width seq pos end)))))
110 (push code-point code-point-list)
111 (incf pos inc)))
113 (coerce (nreverse code-point-list) 'vector)))
115 (defun decode-code-point-1 (a)
116 (declare (type (unsigned-byte 8) a))
117 (cond ((code-point-1-p a)
118 (values (make-code-point-1 a) 1))))
120 (defun decode-code-point-2 (a b)
121 (declare (type (unsigned-byte 8) a))
122 (declare (type (unsigned-byte 8) b))
123 (cond ((code-point-1-p a)
124 (values (make-code-point-1 a) 1))
126 ((code-point-2-p a b)
127 (let ((code-point (make-code-point-2 a b)))
128 (if (<= #x0080 code-point)
129 (values code-point 2)
130 nil)))))
132 (defun decode-code-point-3 (a b c)
133 (declare (type (unsigned-byte 8) a))
134 (declare (type (unsigned-byte 8) b))
135 (declare (type (unsigned-byte 8) c))
136 (cond ((code-point-1-p a)
137 (values (make-code-point-1 a) 1))
139 ((code-point-2-p a b)
140 (let ((code-point (make-code-point-2 a b)))
141 (if (<= #x0080 code-point)
142 (values code-point 2)
143 nil)))
145 ((code-point-3-p a b c)
146 (let ((code-point (make-code-point-3 a b c)))
147 (if (or (<= #x0800 code-point #xD7FF)
148 (<= #xE000 code-point))
149 (values code-point 3)
150 nil)))))
152 (defun decode-code-point-4 (a b c d)
153 (declare (type (unsigned-byte 8) a))
154 (declare (type (unsigned-byte 8) b))
155 (declare (type (unsigned-byte 8) c))
156 (declare (type (unsigned-byte 8) d))
157 (cond ((code-point-1-p a)
158 (values (make-code-point-1 a) 1))
160 ((code-point-2-p a b)
161 (let ((code-point (make-code-point-2 a b)))
162 (if (<= #x0080 code-point)
163 (values code-point 2)
164 nil)))
166 ((code-point-3-p a b c)
167 (let ((code-point (make-code-point-3 a b c)))
168 (if (or (<= #x0800 code-point #xD7FF)
169 (<= #xE000 code-point))
170 (values code-point 3)
171 nil)))
173 ((code-point-4-p a b c d)
174 (let ((code-point (make-code-point-4 a b c d)))
175 (if (<= #x10000 code-point #x10FFFF)
176 (values code-point 4)
177 nil)))))
179 (defun decode-error-width (seq pos end)
180 (let ((a (aref seq pos))
181 (b (if (< (1+ pos) end)
182 (aref seq (1+ pos))
183 nil))
184 (c (if (< (+ pos 2) end)
185 (aref seq (+ pos 2))
186 nil)))
187 (cond ((and (= a #xE0) b (<= #xA0 b #xBF)) 2)
188 ((and (<= #xE1 a #xEC) b (<= #x80 b #xBF)) 2)
189 ((and (= a #xED) b (<= #x80 b #x9F)) 2)
190 ((and (<= #xEE a #xEF) b (<= #x80 b #xBF)) 2)
191 ((and (= a #xF0) b (<= #x90 b #xBF) c (<= #x80 c #xBF)) 3)
192 ((and (= a #xF0) b (<= #x90 b #xBF)) 2)
193 ((and (<= #xF1 a #xF3) b (<= #x80 b #xBF) c (<= #x80 c #xBF)) 3)
194 ((and (<= #xF1 a #xF3) b (<= #x80 b #xBF)) 2)
195 ((and (= a #xF4) b (<= #x80 b #x8F) c (<= #x80 c #xBF)) 3)
196 ((and (= a #xF4) b (<= #x80 b #x8F)) 2)
197 (t 1))))