1 ;; -*- Mode: Lisp; Syntax: ANSI-Common-Lisp -*-
3 ;; This code is free software; you can redistribute it and/or
4 ;; modify it under the terms of the version 2.1 of
5 ;; the GNU Lesser General Public License as published by
6 ;; the Free Software Foundation, as clarified by the
7 ;; preamble found here:
8 ;; http://opensource.franz.com/preamble.html
10 ;; This program is distributed in the hope that it will be useful,
11 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
12 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 ;; GNU General Public License for more details.
15 ;; You should have received a copy of the GNU Lesser General
16 ;; Public License along with this library; if not, write to the
17 ;; Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
18 ;; Boston, MA 02110-1301, USA
20 (in-package :io.encodings
)
22 (declaim (optimize (speed 0) (space 0) (safety 3) (debug 3)))
24 ;; Mostly taken from SBCL's sb-simple-streams contrib
25 ;;; **********************************************************************
26 ;;; This code was written by Paul Foley
29 ;;; Sbcl port by Rudi Schlatte.
31 (define-condition octet-encoding-error
(error)
32 ((string :initarg
:string
:reader octets-encoding-error-string
)
33 (position :initarg
:position
:reader octets-encoding-error-position
)
34 (external-format :initarg
:external-format
35 :reader octets-encoding-error-external-format
))
36 (:report
(lambda (c s
)
37 (format s
"Unable to encode character ~A as ~S."
38 (char-code (char (octets-encoding-error-string c
)
39 (octets-encoding-error-position c
)))
40 (octets-encoding-error-external-format c
)))))
42 (define-condition illegal-character
(octet-encoding-error) ())
45 (define-condition octet-decoding-error
(error)
46 ((array :initarg
:array
:accessor octet-decoding-error-array
)
47 (start :initarg
:start
:accessor octet-decoding-error-start
)
48 (end :initarg
:end
:accessor octet-decoding-error-end
)
49 (position :initarg
:position
:accessor octet-decoding-bad-byte-position
)
50 (external-format :initarg
:external-format
51 :accessor octet-decoding-error-external-format
))
54 (format s
"Illegal ~A character starting at byte position ~D: ~A."
55 (octet-decoding-error-external-format c
)
56 (octet-decoding-bad-byte-position c
)
57 (cffi:mem-aref
(octet-decoding-error-array c
) :uint8
58 (octet-decoding-bad-byte-position c
))))))
60 (define-condition end-of-input-in-character
(octet-decoding-error) ())
61 (define-condition malformed-multibyte-sequence
(octet-decoding-error) ())
62 (define-condition invalid-starter-octet
(malformed-multibyte-sequence) ())
63 (define-condition invalid-continuation-octet
(malformed-multibyte-sequence) ())
64 (define-condition overlong-octet-sequence
(malformed-multibyte-sequence) ())
65 (define-condition illegal-code-point
(octet-decoding-error) ())
73 (deftype line-terminator
()
74 '(member :unix
:mac
:dos
))
76 (defvar *default-external-format
* :utf-8
)
77 (eval-when (:compile-toplevel
:load-toplevel
:execute
)
78 (defvar *default-line-terminator
* :unix
))
80 (defvar *external-formats
* (make-hash-table))
81 (defvar *external-format-aliases
* (make-hash-table))
82 (defvar *external-format-list
* nil
)
84 (defstruct (external-format
86 (:print-function %print-external-format
)
87 (:constructor %make-external-format
(name
92 (name (missing-arg) :type keyword
:read-only t
)
93 (line-terminator (missing-arg) :type keyword
)
94 (octets-to-char (missing-arg) :type function
:read-only t
)
95 (char-to-octets (missing-arg) :type function
:read-only t
)
96 (octet-size (missing-arg) :type real
))
98 (defun %print-external-format
(ef stream depth
)
99 (declare (ignore depth
))
100 (print-unreadable-object (ef stream
:type t
:identity nil
)
101 (format stream
"~A ~S"
102 (ef-name ef
) (ef-line-terminator ef
))))
104 (defun make-external-format (name &key new-name
105 (line-terminator *default-line-terminator
*)
107 (check-type line-terminator line-terminator
)
108 (let ((ef (find-external-format name
)))
109 (%make-external-format
110 (or new-name
(ef-name ef
))
111 (or line-terminator
(ef-line-terminator ef
))
112 (if (and octet-size
(<= 1 octet-size
4))
115 (ef-octets-to-char ef
)
116 (ef-char-to-octets ef
))))
124 (deftype buffer-index
()
127 (defmacro add-external-format
(name aliases ef
)
128 (let (($alias$
(gensym "ALIAS")))
130 (setf (gethash ,name
*external-formats
*) ,ef
)
131 (setf *external-format-list
* (append *external-format-list
* (list ,name
)))
132 (dolist (,$alias$
',aliases
)
133 (assert (keywordp ,$alias$
))
134 (setf (gethash ,$alias$
*external-format-aliases
*) ,name
)))))
136 (defmacro define-external-format
(name aliases octet-size octets-to-char char-to-octets
137 &key
(line-terminator *default-line-terminator
*))
138 (let (($ef$
(gensym "EF")))
139 `(macrolet ((to-char (&body body
)
140 `(lambda (input output error-fn bytes-left
)
141 (declare (type (function () octet
) input
)
142 (type (function (character) t
) output
)
143 (type (function (symbol) character
) error-fn
)
144 (type buffer-index bytes-left
)
145 (ignorable input output error-fn bytes-left
))
147 (to-octets (&body body
)
148 `(lambda (input output error-fn chars-left
)
149 (declare (type (function () character
) input
)
150 (type (function (octet) t
) output
)
151 (type (function (symbol) character
) error-fn
)
152 (type buffer-index chars-left
)
153 (ignorable input output error-fn chars-left
))
155 (let ((,$ef$
(%make-external-format
,name
,line-terminator
,octet-size
156 ,octets-to-char
,char-to-octets
)))
157 (add-external-format ,name
,aliases
,$ef$
)))))
159 (defun find-external-format (name &optional
(error-p t
))
160 (when (external-format-p name
)
161 (return-from find-external-format name
))
163 (when (eq name
:default
)
164 (setq name
*default-external-format
*))
166 (setf name
(iolib-utils:ensure-keyword name
)))
168 (or (gethash name
*external-formats
*)
169 (gethash (gethash name
*external-format-aliases
*)
171 (if error-p
(error "External format ~S not found." name
) nil
)))
179 (define-condition void-external-format
(error) ()
181 (lambda (condition stream
)
182 (declare (ignore condition
))
183 (format stream
"Attempting I/O through void external-format."))))
185 (define-external-format :void
() 0
187 (error 'void-external-format
))
189 (error 'void-external-format
)))
191 (define-external-format :ascii
(:us-ascii
) 1
193 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
194 (let ((code (funcall input
)))
196 (funcall output
(aref +iso-8859-1-table
+ code
))
197 (funcall output
(funcall error-fn
'illegal-code-point
)))))
199 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
200 (let ((code (char-code (funcall input
))))
202 (funcall output code
)
203 (funcall output
(char-code (funcall error-fn
'illegal-character
)))))))
205 (define-external-format :iso-8859-1
(:iso8859-1
:ISO_8859-1
:latin1
:l1
206 :csISOLatin1
:iso-ir-100
:CP819
) 1
208 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
209 (let ((code (funcall input
)))
210 (funcall output
(aref +iso-8859-1-table
+ code
))))
212 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
213 (let ((code (char-code (funcall input
))))
215 (funcall output code
)
216 (funcall output
(char-code (funcall error-fn
'illegal-character
)))))))
218 (defmacro define-iso-8859-external-formats
(indexes)
219 (flet ((get-name-and-aliases (index)
225 ,@(loop :for i
:in indexes
227 (multiple-value-bind (index aliases
) (get-name-and-aliases i
)
228 (let ((table (iolib-utils:concat-symbol
"+iso-8859-" index
"-table+"))
229 (name (iolib-utils:ensure-keyword
230 (concatenate 'string
"ISO-8859-" index
))))
231 (push (iolib-utils:ensure-keyword
232 (concatenate 'string
"ISO8859-" index
))
234 `(define-external-format ,name
,aliases
1
236 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
237 (let ((code (funcall input
)))
238 (funcall output
(aref ,table code
))))
240 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
241 (let ((code (position (funcall input
) ,table
)))
243 (funcall output code
)
245 (position (funcall error-fn
'illegal-character
)
248 (define-iso-8859-external-formats
249 (("2" :ISO_8859-2
:latin2
:l2
:csISOLatin2
:iso-ir-101
)
250 ("3" :ISO_8859-3
:latin3
:l3
:csISOLatin3
:iso-ir-109
)
251 ("4" :ISO_8859-4
:latin4
:l4
:csISOLatin4
:iso-ir-110
)
252 ("5" :ISO_8859-5
:cyrillic
:csISOLatinCyrillic
:iso-ir-144
)
253 ("6" :ISO_8859-6
:arabic
:csISOLatinArabic
:iso-ir-127
)
254 ("7" :ISO_8859-7
:greek
:greek8
:csISOLatinGreek
:iso-ir-126
)
255 ("8" :ISO_8859-8
:hebrew
:csISOLatinHebrew
:iso-ir-138
)
256 ("9" :ISO_8859-9
:latin5
:l5
:csISOLatin5
:iso-ir-148
)
257 ("10" :ISO_8859-10
:latin6
:l6
:csISOLatin6
:iso-ir-157
)
258 ("11" :ISO_8859-11
:thai
:csISOLatinThai
:iso-ir-166
)
259 ("13" :ISO_8859-13
:baltic
:csISOLatinBaltic
:iso-ir-179
)
260 ("14" :ISO_8859-14
:iso-celtic
:latin8
:l8
:csISOLatinCeltic
:iso-ir-199
)
261 ("15" :ISO_8859-15
:latin9
:l9
:csISOLatin9
:iso-ir-203
)
262 ("16" :ISO_8859-16
:latin10
:l10
:csISOLatin10
:iso-ir-226
)))
264 (iolib-utils:define-constant
+max-unicode-code-point
+ #x10FFFF
)
266 (declaim (inline illegal-unicode-code-point
))
267 (defun illegal-unicode-code-point (code)
268 (declare (type (unsigned-byte 32) code
))
269 (or (<= #xD800 code
#xDFFF
)
272 (> code
+max-unicode-code-point
+)))
274 (define-external-format :utf-8
(:utf8
) 2
276 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
278 (let ((code 0) (bytes-needed nil
)
281 (declare (type octet byte0 byte1 byte2 byte3
))
282 (labels ((decode-err (sym)
283 (return-from utf-8-decode
284 (funcall output
(funcall error-fn sym
))))
285 (utf-8-byte-len (code)
286 (declare (type octet code
))
288 ((not (logbitp 7 code
)) 1)
289 ((= (logand code
#b11100000
) #b11000000
) 2)
290 ((= (logand code
#b11110000
) #b11100000
) 3)
291 ((= (logand code
#b11111000
) #b11110000
) 4)
292 (t (decode-err 'invalid-starter-octet
))))
293 (valid-secondary-check (byte)
294 (or (= (logand byte
#b11000000
) #b10000000
)
295 (decode-err 'invalid-continuation-octet
)))
296 (overlong-check (starter mask
)
297 (or (/= starter byte0
)
298 (/= (logior byte1 mask
) mask
)
299 (decode-err 'overlong-octet-sequence
))))
300 (macrolet ((put-and-check-valid-secondary-bytes (&rest places
)
301 `(progn ,@(reduce #'append places
302 :key
#'(lambda (x) `((setf ,x
(funcall input
))
303 (valid-secondary-check ,x
)))))))
304 (setf byte0
(funcall input
)
305 bytes-needed
(utf-8-byte-len byte0
))
306 (when (< bytes-left bytes-needed
)
307 (decode-err 'end-of-input-in-character
))
309 (1 (setf code byte0
))
310 (2 (put-and-check-valid-secondary-bytes byte1
)
311 (overlong-check #b11000000
#b10111111
)
312 (overlong-check #b11000001
#b10111111
)
313 (setf code
(logior (ash (ldb (byte 5 0) byte0
) 6)
314 (ldb (byte 6 0) byte1
))))
315 (3 (put-and-check-valid-secondary-bytes byte1 byte2
)
316 (overlong-check #b11100000
#b10011111
)
317 (setf code
(logior (ash (ldb (byte 4 0) byte0
) 12)
318 (ash (ldb (byte 6 0) byte1
) 6)
319 (ldb (byte 6 0) byte2
)))
320 (when (illegal-unicode-code-point code
)
321 (decode-err 'illegal-code-point
)))
322 (4 (put-and-check-valid-secondary-bytes byte1 byte2 byte3
)
323 (overlong-check #b11110000
#b10001111
)
324 (setf code
(logior (ash (ldb (byte 3 0) byte0
) 18)
325 (ash (ldb (byte 6 0) byte1
) 12)
326 (ash (ldb (byte 6 0) byte2
) 6)
327 (ldb (byte 6 0) byte3
)))))
328 (funcall output
(code-char code
)))))))
330 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
331 (let ((code (char-code (funcall input
))))
332 (when (illegal-unicode-code-point code
)
333 (setf code
(char-code (funcall error-fn
'illegal-character
))))
336 (funcall output code
))
338 (funcall output
(logior #xC0
(ldb (byte 5 6) code
)))
339 (funcall output
(logior #x80
(ldb (byte 6 0) code
))))
341 (funcall output
(logior #xE0
(ldb (byte 4 12) code
)))
342 (funcall output
(logior #x80
(ldb (byte 6 6) code
)))
343 (funcall output
(logior #x80
(ldb (byte 6 0) code
))))
345 (funcall output
(logior #xF0
(ldb (byte 3 18) code
)))
346 (funcall output
(logior #x80
(ldb (byte 6 12) code
)))
347 (funcall output
(logior #x80
(ldb (byte 6 6) code
)))
348 (funcall output
(logior #x80
(ldb (byte 6 0) code
))))))))
350 (define-external-format :utf-16
(:utf16
:utf-16be
:utf16be
) 2
352 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
355 (+ (ash (funcall input
) 8) (funcall input
)))
357 (return-from utf-16-decode
358 (funcall output
(funcall error-fn sym
)))))
359 (macrolet ((put-word (word bytes-needed
)
360 `(progn (when (> ,bytes-needed bytes-left
)
361 (decode-err 'end-of-input-in-character
))
362 (setf ,word
(read-word)))))
363 (let ((code 0) (w0 0) (w1 0))
364 (declare (type (unsigned-byte 32) code
)
365 (type (unsigned-byte 16) w0 w1
))
367 (cond ((not (<= #xD800 w0
#xDFFF
))
370 (decode-err 'invalid-starter-octet
))
372 (if (<= #xDC00 w1
#xDFFF
)
373 (setf code
(+ (ash (ldb (byte 10 0) w0
) 10)
376 (decode-err 'invalid-continuation-octet
))))
377 (funcall output
(code-char code
)))))))
379 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
380 (flet ((write-word (word)
381 (funcall output
(ldb (byte 8 8) word
))
382 (funcall output
(ldb (byte 8 0) word
))))
383 (let ((code (char-code (funcall input
))))
384 (when (illegal-unicode-code-point code
)
385 (setf code
(char-code (funcall error-fn
'illegal-character
))))
386 (cond ((< code
#x10000
)
388 (t (decf code
#x10000
)
389 (write-word (logior #xD800
(ldb (byte 10 10) code
)))
390 (write-word (logior #xDC00
(ldb (byte 10 0) code
)))))))))
392 (define-external-format :utf-16le
(:utf16le
) 2
394 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
397 (+ (funcall input
) (ash (funcall input
) 8)))
399 (return-from utf-16-decode
400 (funcall output
(funcall error-fn sym
)))))
401 (macrolet ((put-word (word bytes-needed
)
402 `(progn (when (> ,bytes-needed bytes-left
)
403 (decode-err 'end-of-input-in-character
))
404 (setf ,word
(read-word)))))
405 (let ((code 0) (w0 0) (w1 0))
406 (declare (type (unsigned-byte 32) code
)
407 (type (unsigned-byte 16) w0 w1
))
409 (cond ((not (<= #xD800 w0
#xDFFF
))
412 (decode-err 'invalid-starter-octet
))
414 (if (<= #xDC00 w1
#xDFFF
)
415 (setf code
(+ (ash (ldb (byte 10 0) w0
) 10)
418 (decode-err 'invalid-continuation-octet
))))
419 (funcall output
(code-char code
)))))))
421 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
422 (flet ((write-word (word)
423 (funcall output
(ldb (byte 8 0) word
))
424 (funcall output
(ldb (byte 8 8) word
))))
425 (let ((code (char-code (funcall input
))))
426 (when (illegal-unicode-code-point code
)
427 (setf code
(char-code (funcall error-fn
'illegal-character
))))
428 (cond ((< code
#x10000
)
430 (t (decf code
#x10000
)
431 (write-word (logior #xD800
(ldb (byte 10 10) code
)))
432 (write-word (logior #xDC00
(ldb (byte 10 0) code
)))))))))
436 ;;; CONVERSION FUNCTIONS
444 (defmacro octets-to-char
(external-format input output error-fn bytes-left
)
445 `(funcall (ef-octets-to-char ,external-format
) ,input
,output
,error-fn
448 (defun read-replacement-char ()
449 (format *query-io
* "Enter a replacement character(evaluated): ")
450 (finish-output *query-io
*)
451 (list (eval (read *query-io
*))))
453 (defun %octets-to-string
(buffer string start end ef
&optional max-char-num
)
454 (declare (type et
:foreign-pointer buffer
)
455 (type buffer-index start end
)
456 (type external-format ef
)
457 (type (or null signed-byte
) max-char-num
)
458 (optimize (speed 3) (space 0) (safety 0) (debug 0)))
459 (unless max-char-num
(setf max-char-num -
1))
466 (prog1 (cffi:mem-aref buffer
:uint8 ptr
) (incf ptr
)))
468 (setf (char string
(incf pos
)) char
))
471 (error symbol
:array buffer
472 :start start
:end end
474 :external-format
(ef-name ef
))
476 :report
"Supply a replacement character."
477 :interactive read-replacement-char
479 (use-standard-unicode-replacement ()
480 :report
"Use standard UCS replacement character"
483 :report
"Stop decoding and return to last good offset."
486 (loop :while
(and (< ptr end
)
487 (/= (incf char-count
) max-char-num
))
490 (octets-to-char ef
#'input
#'output
#'error-fn
(- end ptr
))))
492 (return-from %octets-to-string
(values (1+ pos
) (- ptr start
))))))
494 (defun octets-to-string (octets
496 (external-format :default
)
498 (setf octets
(coerce octets
'(simple-array octet
(*))))
499 (check-type start buffer-index
)
500 (check-type end
(or null buffer-index
))
501 (let ((ef (find-external-format external-format
))
502 (end (or end
(length octets
)))
504 (assert (<= start end
))
505 (setf string
(make-string (- end start
)))
506 (cffi:with-pointer-to-vector-data
(octets-ptr octets
)
507 (let ((pos (if auto-correct
508 (handler-bind ((octet-decoding-error
510 (declare (ignore error
))
511 (invoke-restart 'use-value
#\?))))
512 (%octets-to-string octets-ptr string start end ef
))
513 (%octets-to-string octets-ptr string start end ef
))))
514 (shrink-vector string pos
)))))
520 (defmacro char-to-octets
(ef input output error-fn chars-left
)
521 `(funcall (ef-char-to-octets ,ef
) ,input
,output
,error-fn
524 (defun string-to-octets (string &key
(start 0) end
525 (external-format :default
)
527 (declare (type string string
)
528 (type buffer-index start
)
529 (type (or null buffer-index
) end
)
530 (type (or null real
) adjust-factor
)
531 (optimize (speed 3) (space 0) (safety 0) (debug 0)))
532 (let* ((ef (find-external-format external-format
))
533 (buffer (make-array (1+ (length string
))
536 (adjust-threshold (length string
))
540 (setf adjust-factor
(if (and adjust-factor
(<= 1 adjust-factor
4))
543 end
(or end
(length string
)))
546 (prog1 (char string ptr
) (incf ptr
)))
548 (setf (aref buffer
(incf pos
)) octet
)
549 (when (= pos adjust-threshold
)
550 (setf adjust-threshold
(truncate (* adjust-factor
(1+ pos
))))
551 (setf buffer
(adjust-array buffer adjust-threshold
))))
554 (error symbol
:array buffer
555 :start start
:end end
557 :external-format
(ef-name ef
))
559 :report
"Supply a replacement character."
560 :interactive read-replacement-char
562 (use-standard-unicode-replacement ()
563 :report
"Use standard UCS replacement character"
566 :report
"Stop decoding and return to last good offset."
569 (loop :while
(< ptr end
)
572 (char-to-octets ef
#'input
#'output
#'error-fn
(- end ptr
))))
573 :exit
(return-from string-to-octets
(shrink-vector buffer
(1+ pos
))))))