utf-8: faster decoding
[cl-utf8.git] / encode-sequence.lisp
blobb7099d91aee42b7d54a473ef582fdc0460e2037d
1 (in-package "UTF-8")
3 (define-condition encode-error (error)
4 ((seq :reader encode-error-seq :initarg :seq :initform (error ":seq required") :type sequence)
5 (pos :reader encode-error-pos :initarg :pos :initform (error ":pos required") :type 'unsigned-byte))
6 (:report (lambda (condition stream)
7 (format stream "UTF-8 encode error at position ~D" (encode-error-pos condition)))))
9 (defgeneric encode-sequence (seq &key start end byte-order-mark))
11 (defmethod encode-sequence ((seq vector) &key (start 0) end byte-order-mark)
12 (unless end
13 (setf end (length seq)))
14 (assert (<= 0 start end (length seq)))
16 (let ((octet-vector (make-array (+ (* 4 (- end start)) (if byte-order-mark
18 0))
19 :element-type '(unsigned-byte 8)))
20 (octet-pos 0))
21 (when byte-order-mark
22 (setf (aref octet-vector 0) #xEF)
23 (setf (aref octet-vector 1) #xBB)
24 (setf (aref octet-vector 2) #xBF)
25 (incf octet-pos 3))
26 (loop for pos from start to (1- end)
27 for code-point = (aref seq pos)
28 do (cond ((<= #x0000 code-point #x007F)
29 (setf (aref octet-vector octet-pos) code-point)
30 (incf octet-pos))
32 ((<= #x0080 code-point #x07FF)
33 (setf (aref octet-vector octet-pos) (logior #b11000000 (ldb (byte 5 6) code-point)))
34 (setf (aref octet-vector (1+ octet-pos)) (logior #b10000000 (ldb (byte 6 0) code-point)))
35 (incf octet-pos 2))
37 ((or (<= #x0800 code-point #xD7FF)
38 (<= #xE000 code-point #xFFFF))
39 (setf (aref octet-vector octet-pos) (logior #b11100000 (ldb (byte 4 12) code-point)))
40 (setf (aref octet-vector (1+ octet-pos)) (logior #b10000000 (ldb (byte 6 6) code-point)))
41 (setf (aref octet-vector (+ octet-pos 2)) (logior #b10000000 (ldb (byte 6 0) code-point)))
42 (incf octet-pos 3))
44 ((<= #x10000 code-point #x10FFFF)
45 (setf (aref octet-vector octet-pos) (logior #b11110000 (ldb (byte 3 18) code-point)))
46 (setf (aref octet-vector (1+ octet-pos)) (logior #b10000000 (ldb (byte 6 12) code-point)))
47 (setf (aref octet-vector (+ octet-pos 2)) (logior #b10000000 (ldb (byte 6 6) code-point)))
48 (setf (aref octet-vector (+ octet-pos 3)) (logior #b10000000 (ldb (byte 6 0) code-point)))
49 (incf octet-pos 4))
51 (t (error 'encode-error :seq seq :pos pos))))
52 (adjust-array octet-vector octet-pos)))
54 (defmethod encode-sequence ((seq list) &key (start 0) end byte-order-mark)
55 (unless end
56 (setf end (length seq)))
57 (assert (<= 0 start end (length seq)))
59 (let ((octet-list (if byte-order-mark
60 (nreverse (list #xEF #xBB #xBF))
61 ())))
62 (loop for pos from start to (1- end)
63 for code-point in (nthcdr start seq)
64 do (cond ((<= #x0000 code-point #x007F)
65 (push code-point octet-list))
67 ((<= #x0080 code-point #x07FF)
68 (push (logior #b11000000 (ldb (byte 5 6) code-point)) octet-list)
69 (push (logior #b10000000 (ldb (byte 6 0) code-point)) octet-list))
71 ((or (<= #x0800 code-point #xD7FF)
72 (<= #xE000 code-point #xFFFF))
73 (push (logior #b11100000 (ldb (byte 4 12) code-point)) octet-list)
74 (push (logior #b10000000 (ldb (byte 6 6) code-point)) octet-list)
75 (push (logior #b10000000 (ldb (byte 6 0) code-point)) octet-list))
77 ((<= #x10000 code-point #x10FFFF)
78 (push (logior #b11110000 (ldb (byte 3 18) code-point)) octet-list)
79 (push (logior #b10000000 (ldb (byte 6 12) code-point)) octet-list)
80 (push (logior #b10000000 (ldb (byte 6 6) code-point)) octet-list)
81 (push (logior #b10000000 (ldb (byte 6 0) code-point)) octet-list))
83 (t (error 'encode-error :seq seq :pos pos))))
84 (nreverse octet-list)))