1 ;; -*- Mode: Lisp; Syntax: ANSI-Common-Lisp -*-
3 ;; This code is free software; you can redistribute it and/or
4 ;; modify it under the terms of the version 2.1 of
5 ;; the GNU Lesser General Public License as published by
6 ;; the Free Software Foundation, as clarified by the
7 ;; preamble found here:
8 ;; http://opensource.franz.com/preamble.html
10 ;; This program is distributed in the hope that it will be useful,
11 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
12 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 ;; GNU General Public License for more details.
15 ;; You should have received a copy of the GNU Lesser General
16 ;; Public License along with this library; if not, write to the
17 ;; Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
18 ;; Boston, MA 02110-1301, USA
20 (in-package :io.encodings
)
22 (declaim (optimize (speed 0) (space 0) (safety 3) (debug 3)))
24 ;; Mostly taken from SBCL's sb-simple-streams contrib
25 ;;; **********************************************************************
26 ;;; This code was written by Paul Foley
29 ;;; Sbcl port by Rudi Schlatte.
;; Condition signalled when a character cannot be encoded in the requested
;; external format.
;;
;; Slots (each initialised from the same-named keyword initarg):
;;   STRING          - the string being encoded (reader OCTETS-ENCODING-ERROR-STRING)
;;   START, END      - bounds of the region being encoded
;;   POSITION        - index in STRING of the offending character
;;   EXTERNAL-FORMAT - designator of the external format in use
;;
;; The report prints the character code of the offending character followed
;; by the external format.  A report function must never signal an error of
;; its own, so the lookup is guarded: if a slot is unbound, STRING is not
;; actually a string, or POSITION is out of range, a generic message is
;; printed instead of failing inside the printer.
(define-condition octet-encoding-error (error)
  ((string :initarg :string
           :reader octets-encoding-error-string)
   (start :initarg :start
          :accessor octet-encoding-error-start)
   (end :initarg :end
        :accessor octet-encoding-error-end)
   (position :initarg :position
             :reader octets-encoding-error-position)
   (external-format :initarg :external-format
                    :reader octets-encoding-error-external-format))
  (:report
   (lambda (c s)
     (let ((code (ignore-errors
                  (char-code (char (octets-encoding-error-string c)
                                   (octets-encoding-error-position c)))))
           (format-name (ignore-errors
                         (octets-encoding-error-external-format c))))
       (if code
           ;; Normal case: identical output to the historical report.
           (format s "Unable to encode character ~A as ~S." code format-name)
           ;; Offending character cannot be recovered from the slots.
           (format s "Unable to encode a character as ~S." format-name))))))
;; Encoding error whose symbol the encoders in this file hand to their
;; ERROR-FN when a character cannot be represented.
(define-condition illegal-character (octet-encoding-error)
  ()
  (:documentation
   "Encoding error: the character cannot be represented in the external format."))
;; Condition describing a failure to decode a sequence of octets.
;; Slots ARRAY, START, END, POSITION and EXTERNAL-FORMAT are each initialised
;; from the same-named keyword initarg and exposed through the accessors
;; named below (POSITION is read with OCTET-DECODING-BAD-BYTE-POSITION).
;; NOTE(review): the embedded numbering jumps from 53 to 56, so the opening of
;; the report clause is not visible in this view.
47 (define-condition octet-decoding-error
(error)
48 ((array :initarg
:array
:accessor octet-decoding-error-array
)
49 (start :initarg
:start
:accessor octet-decoding-error-start
)
50 (end :initarg
:end
:accessor octet-decoding-error-end
)
51 (position :initarg
:position
:accessor octet-decoding-bad-byte-position
)
52 (external-format :initarg
:external-format
53 :accessor octet-decoding-error-external-format
))
;; The message shows the external format, the byte position, and the octet
;; fetched from ARRAY at that position with cffi:mem-aref (so ARRAY is
;; treated as foreign memory here).
56 (format s
"Illegal ~A character starting at byte position ~D: ~A."
57 (octet-decoding-error-external-format c
)
58 (octet-decoding-bad-byte-position c
)
59 (cffi:mem-aref
(octet-decoding-error-array c
) :uint8
60 (octet-decoding-bad-byte-position c
))))))
;; Used by the decoders in this file when fewer octets remain than the
;; current character needs.
(define-condition end-of-input-in-character (octet-decoding-error)
  ()
  (:documentation
   "Decoding error: the input ended in the middle of a character."))
;; Common parent of INVALID-STARTER-OCTET, INVALID-CONTINUATION-OCTET and
;; OVERLONG-OCTET-SEQUENCE.
(define-condition malformed-multibyte-sequence (octet-decoding-error)
  ()
  (:documentation
   "Decoding error: a multi-octet sequence is structurally invalid."))
;; Used by the decoders when the first unit of a sequence is not a legal
;; starter.
(define-condition invalid-starter-octet (malformed-multibyte-sequence)
  ()
  (:documentation
   "Decoding error: the first octet of a sequence is not a valid starter."))
;; Used by the decoders when a unit following the starter is not a legal
;; continuation.
(define-condition invalid-continuation-octet (malformed-multibyte-sequence)
  ()
  (:documentation
   "Decoding error: a continuation octet of a sequence is not valid."))
;; Used by the UTF-8 decoder's overlong check.
(define-condition overlong-octet-sequence (malformed-multibyte-sequence)
  ()
  (:documentation
   "Decoding error: the sequence is an overlong encoding."))
;; Used by the decoders when the decoded value is rejected as a code point.
(define-condition illegal-code-point (octet-decoding-error)
  ()
  (:documentation
   "Decoding error: the decoded value is not an acceptable code point."))
(deftype line-terminator ()
  "One of the keywords :UNIX, :MAC or :DOS."
  '(member :unix :mac :dos))
;; Substituted by FIND-EXTERNAL-FORMAT when it is asked for :DEFAULT.
(defvar *default-external-format*
  #+ucs-chars :utf-8
  #-ucs-chars :iso-8859-1
  "Name of the external format used for :DEFAULT; :UTF-8 when the UCS-CHARS
feature is present, otherwise :ISO-8859-1.")
;; Defined at compile time as well as load time; the variable is used as a
;; default value in lambda lists elsewhere in this file.
(eval-when (:compile-toplevel :load-toplevel :execute)
  (defvar *default-line-terminator* :unix
    "Line terminator used when none is given explicitly."))
(defvar *external-formats* (make-hash-table :test 'eql)
  "Table mapping an external-format name to its external-format object.")
(defvar *external-format-aliases* (make-hash-table :test 'eql)
  "Table mapping an alias keyword to the name of the external format it denotes.")
(defvar *external-format-list* '()
  "List of registered external-format names; ADD-EXTERNAL-FORMAT appends to it.")
;; Structure describing one external format.
;; Instances are printed by %PRINT-EXTERNAL-FORMAT and created through the
;; constructor %MAKE-EXTERNAL-FORMAT.  Every visible slot defaults to
;; (MISSING-ARG); NAME, OCTETS-TO-CHAR and CHAR-TO-OCTETS are read-only.
;; NOTE(review): the embedded numbering skips 88 and 91-94, so further
;; defstruct options and the rest of the constructor's argument list are not
;; visible in this view.
87 (defstruct (external-format
89 (:print-function %print-external-format
)
90 (:constructor %make-external-format
(name
;; Keyword naming the format.
95 (name (missing-arg) :type keyword
:read-only t
)
;; Line-ending convention; the only visible slot, besides OCTET-SIZE, that is
;; not read-only.
96 (line-terminator (missing-arg) :type keyword
)
;; Decoder function.
97 (octets-to-char (missing-arg) :type function
:read-only t
)
;; Encoder function.
98 (char-to-octets (missing-arg) :type function
:read-only t
)
;; Declared REAL, so presumably an average octets-per-character figure
;; -- TODO confirm.
99 (octet-size (missing-arg) :type real
))
(defun %print-external-format (ef stream depth)
  "Print EF to STREAM unreadably, with its type but no identity, showing the
format's name followed by its line terminator.  DEPTH is ignored."
  (declare (ignore depth))
  (print-unreadable-object (ef stream :type t :identity nil)
    ;; Name is printed as by PRINC, line terminator as by PRIN1.
    (princ (ef-name ef) stream)
    (write-char #\Space stream)
    (prin1 (ef-line-terminator ef) stream)))
;; Build a new external-format object from the one designated by NAME
;; (resolved with FIND-EXTERNAL-FORMAT).  NEW-NAME, when given, replaces the
;; found format's name; the decoder and encoder functions are copied from the
;; found format.
;; NOTE(review): the embedded numbering skips 109 and 116-117, so the rest of
;; the lambda list and both branches of the OCTET-SIZE test are not visible.
107 (defun make-external-format (name &key new-name
108 (line-terminator *default-line-terminator
*)
;; Rejects anything that is not of type LINE-TERMINATOR.
110 (check-type line-terminator line-terminator
)
111 (let ((ef (find-external-format name
)))
112 (%make-external-format
113 (or new-name
(ef-name ef
))
;; NOTE(review): LINE-TERMINATOR has a default and has passed CHECK-TYPE, so
;; this fallback to the found format's terminator looks unreachable -- confirm.
114 (or line-terminator
(ef-line-terminator ef
))
;; A caller-supplied OCTET-SIZE is only considered when it lies in [1, 4].
115 (if (and octet-size
(<= 1 octet-size
4))
118 (ef-octets-to-char ef
)
119 (ef-char-to-octets ef
))))
;; Type used in this file for offsets and counts into octet buffers and
;; strings.
;; NOTE(review): the line holding the type expansion (embedded number 128) is
;; missing from this view.
127 (deftype buffer-index
()
;; Macro that registers the external format EF under NAME.  The expansion:
;;   - stores EF in *EXTERNAL-FORMATS* under NAME,
;;   - appends NAME to the end of *EXTERNAL-FORMAT-LIST*,
;;   - for each element of ALIASES, asserts it is a keyword and maps it to
;;     NAME in *EXTERNAL-FORMAT-ALIASES*.
;; ALIASES is quoted in the expansion, so it must be a literal list; NAME is
;; spliced in several times and is therefore evaluated more than once.
;; NOTE(review): the line with embedded number 132 (presumably the opening of
;; the backquoted expansion) is missing from this view.
130 (defmacro add-external-format
(name aliases ef
)
;; Fresh symbol for the loop variable, to avoid capturing user names.
131 (let (($alias$
(gensym "ALIAS")))
133 (setf (gethash ,name
*external-formats
*) ,ef
)
134 (setf *external-format-list
* (append *external-format-list
* (list ,name
)))
135 (dolist (,$alias$
',aliases
)
136 (assert (keywordp ,$alias$
))
137 (setf (gethash ,$alias$
*external-format-aliases
*) ,name
)))))
;; Macro that defines and registers an external format.
;;   NAME, ALIASES      - passed on to ADD-EXTERNAL-FORMAT
;;   OCTET-SIZE         - passed to %MAKE-EXTERNAL-FORMAT
;;   OCTETS-TO-CHAR     - decoder form, may use the local macro TO-CHAR
;;   CHAR-TO-OCTETS     - encoder form, may use the local macro TO-OCTETS
;;   :LINE-TERMINATOR   - defaults to *DEFAULT-LINE-TERMINATOR*
;; TO-CHAR and TO-OCTETS wrap their body in a four-argument lambda
;; (INPUT OUTPUT ERROR-FN and BYTES-LEFT / CHARS-LEFT) with the type
;; declarations shown below.
;; NOTE(review): the embedded numbering skips 149 and 157, so the tail of each
;; generated lambda (where the body is spliced in) is not visible.
139 (defmacro define-external-format
(name aliases octet-size octets-to-char char-to-octets
140 &key
(line-terminator *default-line-terminator
*))
141 (let (($ef$
(gensym "EF")))
;; Decoder wrapper: INPUT yields octets, OUTPUT accepts characters.
142 `(macrolet ((to-char (&body body
)
143 `(lambda (input output error-fn bytes-left
)
144 (declare (type (function () octet
) input
)
145 (type (function (character) t
) output
)
146 (type (function (symbol) character
) error-fn
)
147 (type buffer-index bytes-left
)
148 (ignorable input output error-fn bytes-left
))
;; Encoder wrapper: INPUT yields characters, OUTPUT accepts octets.
150 (to-octets (&body body
)
151 `(lambda (input output error-fn chars-left
)
152 (declare (type (function () character
) input
)
153 (type (function (octet) t
) output
)
154 (type (function (symbol) character
) error-fn
)
155 (type buffer-index chars-left
)
156 (ignorable input output error-fn chars-left
))
;; Build the format object and register it under NAME and its aliases.
158 (let ((,$ef$
(%make-external-format
,name
,line-terminator
,octet-size
159 ,octets-to-char
,char-to-octets
)))
160 (add-external-format ,name
,aliases
,$ef$
)))))
;; Resolve NAME to an external-format object.
;;   - An object satisfying EXTERNAL-FORMAT-P is returned unchanged.
;;   - :DEFAULT is replaced by the value of *DEFAULT-EXTERNAL-FORMAT*.
;;   - NAME is looked up in *EXTERNAL-FORMATS*, then via
;;     *EXTERNAL-FORMAT-ALIASES*.
;;   - When nothing is found, an error is signalled if ERROR-P is true (the
;;     default); otherwise NIL is returned.
;; NOTE(review): the embedded numbering skips 165, 168, 170 and 173, so the
;; condition guarding the keyword conversion and the end of the alias lookup
;; are not visible in this view.
162 (defun find-external-format (name &optional
(error-p t
))
163 (when (external-format-p name
)
164 (return-from find-external-format name
))
166 (when (eq name
:default
)
167 (setq name
*default-external-format
*))
;; Normalise NAME to a keyword.
169 (setf name
(iolib-utils:ensure-keyword name
)))
171 (or (gethash name
*external-formats
*)
;; Second chance: treat NAME as an alias.
172 (gethash (gethash name
*external-format-aliases
*)
174 (if error-p
(error "External format ~S not found." name
) nil
)))
;; Error signalled by the :VOID external format on any attempt to convert.
;; It has no slots; the report ignores the condition object and prints a
;; fixed message.
;; NOTE(review): the line with embedded number 183 (presumably the opening of
;; the report clause) is missing from this view.
182 (define-condition void-external-format
(error) ()
184 (lambda (condition stream
)
185 (declare (ignore condition
))
186 (format stream
"Attempting I/O through void external-format."))))
;; The :VOID external format: no aliases, octet size 0.  Both visible
;; conversion bodies just signal VOID-EXTERNAL-FORMAT.
;; NOTE(review): the lines with embedded numbers 189 and 191 (presumably the
;; wrappers around each body) are missing from this view.
188 (define-external-format :void
() 0
190 (error 'void-external-format
))
192 (error 'void-external-format
)))
;; The :ASCII external format (alias :US-ASCII), octet size 1.
;; NOTE(review): the embedded numbering skips 195, 198, 201 and 204, so the
;; wrappers around each body and the tests choosing between the two OUTPUT
;; calls in each direction are not visible.
194 (define-external-format :ascii
(:us-ascii
) 1
;; Decoder: read one octet, then emit either the table entry for it or the
;; replacement obtained from ERROR-FN for ILLEGAL-CODE-POINT.
196 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
197 (let ((code (funcall input
)))
199 (funcall output
(aref +iso-8859-1-table
+ code
))
200 (funcall output
(funcall error-fn
'illegal-code-point
)))))
;; Encoder: take the character's code, then emit either that code or the code
;; of the replacement obtained from ERROR-FN for ILLEGAL-CHARACTER.
202 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
203 (let ((code (char-code (funcall input
))))
205 (funcall output code
)
206 (funcall output
(char-code (funcall error-fn
'illegal-character
)))))))
;; The :ISO-8859-1 external format with its aliases, octet size 1.
;; NOTE(review): the embedded numbering skips 210, 214 and 217, so the
;; wrappers around each body and the encoder's range test are not visible.
208 (define-external-format :iso-8859-1
(:iso8859-1
:ISO_8859-1
:latin1
:l1
209 :csISOLatin1
:iso-ir-100
:CP819
) 1
;; Decoder: every octet is mapped through +ISO-8859-1-TABLE+; no error path
;; is visible.
211 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
212 (let ((code (funcall input
)))
213 (funcall output
(aref +iso-8859-1-table
+ code
))))
;; Encoder: emit the character's code, or the code of the replacement
;; obtained from ERROR-FN for ILLEGAL-CHARACTER.
215 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
216 (let ((code (char-code (funcall input
))))
218 (funcall output code
)
219 (funcall output
(char-code (funcall error-fn
'illegal-character
)))))))
;; Macro that expands into one DEFINE-EXTERNAL-FORMAT form per entry of
;; INDEXES.  For each entry it derives:
;;   - the table symbol  +iso-8859-<index>-table+
;;   - the format name   :ISO-8859-<index>
;;   - an extra alias    :ISO8859-<index>
;; Each generated format has octet size 1; it decodes by indexing the table
;; with the octet and encodes by searching the table with POSITION.
;; NOTE(review): many lines are missing from this view (embedded numbers
;; 224-228, 230, 237, 239, 243, 246, 248, 250-252), including the body of
;; GET-NAME-AND-ALIASES and the end of the macro, so the definition is
;; incomplete as shown.
222 (defmacro define-iso-8859-external-formats
(indexes)
223 (flet ((get-name-and-aliases (index)
229 ,@(loop :for i
:in indexes
231 (multiple-value-bind (index aliases
) (get-name-and-aliases i
)
232 (let ((table (iolib-utils:concat-symbol
"+iso-8859-" index
"-table+"))
233 (name (iolib-utils:ensure-keyword
234 (concatenate 'string
"ISO-8859-" index
))))
235 (push (iolib-utils:ensure-keyword
236 (concatenate 'string
"ISO8859-" index
))
238 `(define-external-format ,name
,aliases
1
;; Generated decoder: table lookup by octet value.
240 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
241 (let ((code (funcall input
)))
242 (funcall output
(aref ,table code
))))
;; Generated encoder: the octet is the character's position in the table.
244 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
245 (let ((code (position (funcall input
) ,table
)))
247 (funcall output code
)
249 (position (funcall error-fn
'illegal-character
)
;; Instantiate the ISO-8859-N external formats.  Each entry is the part
;; number as a string followed by the alias keywords for that part; the
;; macro derives the format name and character table from the part number.
(define-iso-8859-external-formats
    (("2"  :ISO_8859-2  :latin2 :l2 :csISOLatin2 :iso-ir-101)
     ("3"  :ISO_8859-3  :latin3 :l3 :csISOLatin3 :iso-ir-109)
     ("4"  :ISO_8859-4  :latin4 :l4 :csISOLatin4 :iso-ir-110)
     ("5"  :ISO_8859-5  :cyrillic :csISOLatinCyrillic :iso-ir-144)
     ("6"  :ISO_8859-6  :arabic :csISOLatinArabic :iso-ir-127)
     ("7"  :ISO_8859-7  :greek :greek8 :csISOLatinGreek :iso-ir-126)
     ("8"  :ISO_8859-8  :hebrew :csISOLatinHebrew :iso-ir-138)
     ("9"  :ISO_8859-9  :latin5 :l5 :csISOLatin5 :iso-ir-148)
     ("10" :ISO_8859-10 :latin6 :l6 :csISOLatin6 :iso-ir-157)
     ("11" :ISO_8859-11 :thai :csISOLatinThai :iso-ir-166)
     ("13" :ISO_8859-13 :baltic :csISOLatinBaltic :iso-ir-179)
     ("14" :ISO_8859-14 :iso-celtic :latin8 :l8 :csISOLatinCeltic :iso-ir-199)
     ("15" :ISO_8859-15 :latin9 :l9 :csISOLatin9 :iso-ir-203)
     ("16" :ISO_8859-16 :latin10 :l10 :csISOLatin10 :iso-ir-226)))
;; Upper bound used by ILLEGAL-UNICODE-CODE-POINT: any code greater than this
;; is rejected.
(iolib-utils:define-constant +max-unicode-code-point+ #x10FFFF)
;; Predicate: true when CODE must not be used as a character code.
;; Visible tests: CODE lies in #xD800..#xDFFF, or CODE exceeds
;; +MAX-UNICODE-CODE-POINT+.  The function is declaimed inline only when the
;; UCS-CHARS feature is present.
;; NOTE(review): the embedded numbering skips 276-277, so further clauses of
;; the OR may exist that are not visible in this view.
271 #+ucs-chars
(declaim (inline illegal-unicode-code-point
))
273 (defun illegal-unicode-code-point (code)
274 (declare (type (unsigned-byte 32) code
))
275 (or (<= #xD800 code
#xDFFF
)
278 (> code
+max-unicode-code-point
+)))
;; The :UTF-8 external format (alias :UTF8); the octet-size argument is 2.
;; NOTE(review): many lines are missing from this view (embedded numbers 282,
;; 284, 286-287, 294, 315, 336, 341-342, 344, 347, 351 among them): the
;; wrappers around both bodies, the block named UTF-8-DECODE, the bindings of
;; BYTE0..BYTE3, and the tests that select the branch in both the decoder and
;; the encoder.  Comments below describe only what is visible.
281 (define-external-format :utf-8
(:utf8
) 2
;; ---- Decoder ----
283 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
285 (let ((code 0) (bytes-needed nil
)
288 (declare (type octet byte0 byte1 byte2 byte3
))
;; DECODE-ERR: ask ERROR-FN for a replacement character for the condition
;; named SYM, emit it, and leave the decoder via the UTF-8-DECODE block.
289 (labels ((decode-err (sym)
290 (return-from utf-8-decode
291 (funcall output
(funcall error-fn sym
))))
;; UTF-8-BYTE-LEN: sequence length implied by the first octet:
;; 0xxxxxxx -> 1, 110xxxxx -> 2, 1110xxxx -> 3, 11110xxx -> 4; anything else
;; is reported as INVALID-STARTER-OCTET.
292 (utf-8-byte-len (code)
293 (declare (type octet code
))
295 ((not (logbitp 7 code
)) 1)
296 ((= (logand code
#b11100000
) #b11000000
) 2)
297 ((= (logand code
#b11110000
) #b11100000
) 3)
298 ((= (logand code
#b11111000
) #b11110000
) 4)
299 (t (decode-err 'invalid-starter-octet
))))
;; VALID-SECONDARY-CHECK: a continuation octet must look like 10xxxxxx.
300 (valid-secondary-check (byte)
301 (or (= (logand byte
#b11000000
) #b10000000
)
302 (decode-err 'invalid-continuation-octet
)))
;; OVERLONG-CHECK: reports OVERLONG-OCTET-SEQUENCE when BYTE0 equals STARTER
;; and BYTE1 has no bits set outside MASK.
303 (overlong-check (starter mask
)
304 (or (/= starter byte0
)
305 (/= (logior byte1 mask
) mask
)
306 (decode-err 'overlong-octet-sequence
))))
;; For each place: read the next octet into it, then validate it as a
;; continuation octet.
307 (macrolet ((put-and-check-valid-secondary-bytes (&rest places
)
308 `(progn ,@(reduce #'append places
309 :key
#'(lambda (x) `((setf ,x
(funcall input
))
310 (valid-secondary-check ,x
)))))))
311 (setf byte0
(funcall input
)
312 bytes-needed
(utf-8-byte-len byte0
))
;; Not enough input left for the whole sequence.
313 (when (< bytes-left bytes-needed
)
314 (decode-err 'end-of-input-in-character
))
;; One-octet sequence: the octet is the code.
316 (1 (setf code byte0
))
;; Two-octet sequence: 5 bits from BYTE0, 6 from BYTE1.
317 (2 (put-and-check-valid-secondary-bytes byte1
)
318 (overlong-check #b11000000
#b10111111
)
319 (overlong-check #b11000001
#b10111111
)
320 (setf code
(logior (ash (ldb (byte 5 0) byte0
) 6)
321 (ldb (byte 6 0) byte1
))))
;; Three-octet sequence: 4 + 6 + 6 bits; the result is additionally checked
;; with ILLEGAL-UNICODE-CODE-POINT.
322 (3 (put-and-check-valid-secondary-bytes byte1 byte2
)
323 (overlong-check #b11100000
#b10011111
)
324 (setf code
(logior (ash (ldb (byte 4 0) byte0
) 12)
325 (ash (ldb (byte 6 0) byte1
) 6)
326 (ldb (byte 6 0) byte2
)))
327 (when (illegal-unicode-code-point code
)
328 (decode-err 'illegal-code-point
)))
;; Four-octet sequence: 3 + 6 + 6 + 6 bits.
;; NOTE(review): unlike the three-octet branch, no ILLEGAL-UNICODE-CODE-POINT
;; check is visible here, although 21 bits can exceed +MAX-UNICODE-CODE-POINT+
;; -- confirm whether values above #x10FFFF can reach CODE-CHAR.
329 (4 (put-and-check-valid-secondary-bytes byte1 byte2 byte3
)
330 (overlong-check #b11110000
#b10001111
)
331 (setf code
(logior (ash (ldb (byte 3 0) byte0
) 18)
332 (ash (ldb (byte 6 0) byte1
) 12)
333 (ash (ldb (byte 6 0) byte2
) 6)
334 (ldb (byte 6 0) byte3
)))))
335 (funcall output
(code-char code
)))))))
;; ---- Encoder ----
337 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
338 (let ((code (char-code (funcall input
))))
;; An illegal code point is replaced by the code of the character returned by
;; ERROR-FN for ILLEGAL-CHARACTER.
339 (when (illegal-unicode-code-point code
)
340 (setf code
(char-code (funcall error-fn
'illegal-character
))))
;; One-octet form.
343 (funcall output code
))
;; Two-octet form: #xC0 | bits 6-10, #x80 | bits 0-5.
345 (funcall output
(logior #xC0
(ldb (byte 5 6) code
)))
346 (funcall output
(logior #x80
(ldb (byte 6 0) code
))))
;; Three-octet form: #xE0 | bits 12-15, then two continuation octets.
348 (funcall output
(logior #xE0
(ldb (byte 4 12) code
)))
349 (funcall output
(logior #x80
(ldb (byte 6 6) code
)))
350 (funcall output
(logior #x80
(ldb (byte 6 0) code
))))
;; Four-octet form: #xF0 | bits 18-20, then three continuation octets.
352 (funcall output
(logior #xF0
(ldb (byte 3 18) code
)))
353 (funcall output
(logior #x80
(ldb (byte 6 12) code
)))
354 (funcall output
(logior #x80
(ldb (byte 6 6) code
)))
355 (funcall output
(logior #x80
(ldb (byte 6 0) code
))))))))
;; The :UTF-16 external format (aliases :UTF16, :UTF-16BE, :UTF16BE); the
;; octet-size argument is 2.  Words are read and written most significant
;; octet first.
;; NOTE(review): many lines are missing from this view (embedded numbers 359,
;; 361-362, 364, 374, 376-377, 379, 382-383, 386, 395 among them), including
;; the heads of the local function definitions, the reads of W0/W1 and parts
;; of the COND.  Comments describe only what is visible.
358 (define-external-format :utf-16
(:utf16
:utf-16be
:utf16be
) 2
;; ---- Decoder ----
360 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
;; One 16-bit word: first octet is the high half, second the low half.
363 (+ (ash (funcall input
) 8) (funcall input
)))
;; Error exit: emit the replacement from ERROR-FN and leave the
;; UTF-16-DECODE block.
365 (return-from utf-16-decode
366 (funcall output
(funcall error-fn sym
)))))
;; PUT-WORD: report END-OF-INPUT-IN-CHARACTER when BYTES-NEEDED exceeds
;; BYTES-LEFT, otherwise read a word into WORD.
367 (macrolet ((put-word (word bytes-needed
)
368 `(progn (when (> ,bytes-needed bytes-left
)
369 (decode-err 'end-of-input-in-character
))
370 (setf ,word
(read-word)))))
371 (let ((code 0) (w0 0) (w1 0))
372 (declare (type (unsigned-byte 32) code
)
373 (type (unsigned-byte 16) w0 w1
))
;; First clause handles a W0 outside the surrogate range #xD800..#xDFFF.
375 (cond ((not (<= #xD800 w0
#xDFFF
))
378 (decode-err 'invalid-starter-octet
))
;; W1 must be a low surrogate (#xDC00..#xDFFF); the low 10 bits of W0 form
;; the high part of the code.
380 (if (<= #xDC00 w1
#xDFFF
)
381 (setf code
(+ (ash (ldb (byte 10 0) w0
) 10)
384 (decode-err 'invalid-continuation-octet
))))
385 (funcall output
(code-char code
)))))))
;; ---- Encoder ----
387 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
;; WRITE-WORD: high octet first, then low octet.
388 (flet ((write-word (word)
389 (funcall output
(ldb (byte 8 8) word
))
390 (funcall output
(ldb (byte 8 0) word
))))
391 (let ((code (char-code (funcall input
))))
;; An illegal code point is replaced via ERROR-FN.
392 (when (illegal-unicode-code-point code
)
393 (setf code
(char-code (funcall error-fn
'illegal-character
))))
394 (cond ((< code
#x10000
)
;; Codes of #x10000 and above: subtract #x10000 and emit a surrogate pair
;; (#xD800 | high 10 bits, #xDC00 | low 10 bits).
396 (t (decf code
#x10000
)
397 (write-word (logior #xD800
(ldb (byte 10 10) code
)))
398 (write-word (logior #xDC00
(ldb (byte 10 0) code
)))))))))
;; The :UTF-16LE external format (alias :UTF16LE); the octet-size argument
;; is 2.  Words are read and written least significant octet first.
;; NOTE(review): many lines are missing from this view (embedded numbers 402,
;; 404-405, 407, 417, 419-420, 422, 425-426, 429, 438 among them), including
;; the heads of the local function definitions, the reads of W0/W1 and parts
;; of the COND.  Comments describe only what is visible.
401 (define-external-format :utf-16le
(:utf16le
) 2
;; ---- Decoder ----
403 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
;; One 16-bit word: first octet is the low half, second the high half.
406 (+ (funcall input
) (ash (funcall input
) 8)))
;; Error exit: emit the replacement from ERROR-FN and leave the
;; UTF-16-DECODE block (same block name as in the big-endian format).
408 (return-from utf-16-decode
409 (funcall output
(funcall error-fn sym
)))))
;; PUT-WORD: report END-OF-INPUT-IN-CHARACTER when BYTES-NEEDED exceeds
;; BYTES-LEFT, otherwise read a word into WORD.
410 (macrolet ((put-word (word bytes-needed
)
411 `(progn (when (> ,bytes-needed bytes-left
)
412 (decode-err 'end-of-input-in-character
))
413 (setf ,word
(read-word)))))
414 (let ((code 0) (w0 0) (w1 0))
415 (declare (type (unsigned-byte 32) code
)
416 (type (unsigned-byte 16) w0 w1
))
;; First clause handles a W0 outside the surrogate range #xD800..#xDFFF.
418 (cond ((not (<= #xD800 w0
#xDFFF
))
421 (decode-err 'invalid-starter-octet
))
;; W1 must be a low surrogate (#xDC00..#xDFFF); the low 10 bits of W0 form
;; the high part of the code.
423 (if (<= #xDC00 w1
#xDFFF
)
424 (setf code
(+ (ash (ldb (byte 10 0) w0
) 10)
427 (decode-err 'invalid-continuation-octet
))))
428 (funcall output
(code-char code
)))))))
;; ---- Encoder ----
430 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
;; WRITE-WORD: low octet first, then high octet.
431 (flet ((write-word (word)
432 (funcall output
(ldb (byte 8 0) word
))
433 (funcall output
(ldb (byte 8 8) word
))))
434 (let ((code (char-code (funcall input
))))
;; An illegal code point is replaced via ERROR-FN.
435 (when (illegal-unicode-code-point code
)
436 (setf code
(char-code (funcall error-fn
'illegal-character
))))
437 (cond ((< code
#x10000
)
;; Codes of #x10000 and above: subtract #x10000 and emit a surrogate pair
;; (#xD800 | high 10 bits, #xDC00 | low 10 bits).
439 (t (decf code
#x10000
)
440 (write-word (logior #xD800
(ldb (byte 10 10) code
)))
441 (write-word (logior #xDC00
(ldb (byte 10 0) code
)))))))))
445 ;;; CONVERSION FUNCTIONS
;; Character code of the replacement character offered by the
;; USE-STANDARD-UNICODE-REPLACEMENT restarts in this file.  Without the
;; UCS-CHARS feature it is the code of #\?.
;; NOTE(review): the line with embedded number 450 (presumably the value used
;; when UCS-CHARS is present) is missing from this view.
449 (iolib-utils:define-constant
+replacement-char
+
451 #-ucs-chars
(char-code #\?))
;; Macro: expands into a FUNCALL of EXTERNAL-FORMAT's decoder function
;; (EF-OCTETS-TO-CHAR) on INPUT, OUTPUT and ERROR-FN.
;; NOTE(review): the line with embedded number 459 is missing from this view;
;; presumably it passes BYTES-LEFT and closes the form -- confirm.
457 (defmacro octets-to-char
(external-format input output error-fn bytes-left
)
458 `(funcall (ef-octets-to-char ,external-format
) ,input
,output
,error-fn
(defun read-replacement-char ()
  "Interactively obtain a replacement character on *QUERY-IO*.

Prompts, reads one form and evaluates it.  The prompt is repeated until the
value is a character, because the result is handed to conversion code that is
compiled with (SAFETY 0) and declares its error function to return a
CHARACTER.  Returns a one-element list holding the character, as required of
a restart's :INTERACTIVE function.

End of input on *QUERY-IO* signals END-OF-FILE from READ."
  (loop
    (format *query-io* "Enter a replacement character(evaluated): ")
    (finish-output *query-io*)
    ;; NOTE: EVAL of interactively typed input is intentional here (debugger
    ;; style restart); *QUERY-IO* must not be connected to untrusted input.
    (let ((value (eval (read *query-io*))))
      (when (characterp value)
        (return (list value)))
      (format *query-io* "~&~S is not a character.~%" value))))
;; Decode the octets of the foreign buffer BUFFER between START and END into
;; STRING using external format EF.
;;   MAX-CHAR-NUM - optional limit on decoded characters; NIL is turned
;;                  into -1
;;   PREVPTR      - defaults to START; updated from OLDPTR after each
;;                  decoded character
;; One visible exit returns three values: (1+ POS), (- PTR START) and PREVPTR.
;; NOTE(review): many lines are missing from this view (embedded numbers
;; 473-478, 480, 482-483, 486, 488, 491, 495, 497-499, 503-504, 507-509 among
;; them), including the local bindings, the heads of the local functions and
;; restart clauses, and the normal return value.
466 (defun %octets-to-string
(buffer string start end ef
&optional max-char-num
(prevptr start
))
467 (declare (type et
:foreign-pointer buffer
)
468 (type buffer-index start end
)
469 (type external-format ef
)
470 (type (or null signed-byte
) max-char-num
)
471 (optimize (speed 3) (space 0) (safety 0) (debug 0)))
;; No limit requested: -1 is used as the "never reached" count.
472 (unless max-char-num
(setf max-char-num -
1))
;; Input: fetch the octet at PTR, then advance PTR.
479 (prog1 (cffi:mem-aref buffer
:uint8 ptr
) (incf ptr
)))
;; Output: advance POS, then store the character there.
481 (setf (char string
(incf pos
)) char
))
;; Error function: signal the condition named by SYMBOL with the decoding
;; context; restarts supply the replacement character or stop decoding.
484 (error symbol
:array buffer
485 :start start
:end end
487 :external-format
(ef-name ef
))
489 :report
"Supply a replacement character."
490 :interactive read-replacement-char
492 (use-standard-unicode-replacement ()
493 :report
"Use standard UCS replacement character"
494 (code-char +replacement-char
+))
496 :report
"Stop decoding and return to last good offset."
500 (return-from %octets-to-string
(values (1+ pos
) (- ptr start
) prevptr
))))
;; Main loop: continue while input remains and the character count has not
;; reached MAX-CHAR-NUM.
501 (loop :while
(and (< ptr end
)
502 (/= (incf char-count
) max-char-num
))
505 (octets-to-char ef
#'input
#'output
#'error-fn
(- end ptr
))
506 (setf prevptr oldptr
))
;; Decode OCTETS into a string using EXTERNAL-FORMAT (default :DEFAULT).
;; Steps visible below: coerce OCTETS to a simple octet vector, validate
;; START/END, allocate a string of (- END START) characters, decode through a
;; pointer to the vector's data with %OCTETS-TO-STRING, then shrink the
;; string to the resulting length with SHRINK-VECTOR.
;; NOTE(review): the embedded numbering skips 511, 513, 519 and 525, so the
;; rest of the lambda list (START, END, AUTO-CORRECT presumably among the
;; keywords), one LET binding and the handler's lambda head are not visible.
510 (defun octets-to-string (octets
512 (external-format :default
)
514 (setf octets
(coerce octets
'(simple-array octet
(*))))
515 (check-type start buffer-index
)
516 (check-type end
(or null buffer-index
))
517 (let ((ef (find-external-format external-format
))
;; END defaults to the length of OCTETS.
518 (end (or end
(length octets
)))
520 (assert (<= start end
))
;; At most one character per octet, so this is an upper bound on the length.
521 (setf string
(make-string (- end start
)))
522 (cffi:with-pointer-to-vector-data
(octets-ptr octets
)
;; With AUTO-CORRECT, every decoding error is answered by invoking the
;; USE-VALUE restart with #\? instead of reaching the caller.
523 (let ((pos (if auto-correct
524 (handler-bind ((octet-decoding-error
526 (declare (ignore error
))
527 (invoke-restart 'use-value
#\?))))
528 (%octets-to-string octets-ptr string start end ef
))
529 (%octets-to-string octets-ptr string start end ef
))))
530 (shrink-vector string pos
)))))
;; Macro: expands into a FUNCALL of EF's encoder function
;; (EF-CHAR-TO-OCTETS) on INPUT, OUTPUT and ERROR-FN.
;; NOTE(review): the line with embedded number 538 is missing from this view;
;; presumably it passes CHARS-LEFT and closes the form -- confirm.
536 (defmacro char-to-octets
(ef input output error-fn chars-left
)
537 `(funcall (ef-char-to-octets ,ef
) ,input
,output
,error-fn
;; Encode STRING, between START and END, into a vector of octets using
;; EXTERNAL-FORMAT (default :DEFAULT).  START defaults to 0 and END to the
;; length of STRING.  The output buffer is grown with ADJUST-ARRAY as it
;; fills up, and the result is BUFFER shrunk to (1+ POS) elements.
;; NOTE(review): many lines are missing from this view (embedded numbers 542,
;; 548, 553-554, 556-557, 559-562, 564, 569-570, 573, 575, 578, 582, 584-585
;; among them), including the rest of the lambda list, several bindings, and
;; the heads of the local functions and restart clauses.
540 (defun string-to-octets (string &key start end
541 (external-format :default
)
543 (declare (type string string
)
544 (type (or null buffer-index
) start
)
545 (type (or null buffer-index
) end
)
546 (type (or null real
) adjust-factor
)
547 (optimize (speed 3) (space 0) (safety 0) (debug 0)))
549 (setf start
(or start
0)
550 end
(or end
(length string
)))
551 (let* ((ef (find-external-format external-format
))
;; Initial buffer: one element more than the length of STRING.
552 (buffer (make-array (1+ (length string
))
555 (adjust-threshold (length string
))
;; A caller-supplied ADJUST-FACTOR is only considered when it lies in [1, 4].
558 (setf adjust-factor
(if (and adjust-factor
(<= 1 adjust-factor
4))
;; Input: fetch the character at PTR, then advance PTR.
563 (prog1 (char string ptr
) (incf ptr
)))
;; Output: advance POS, store the octet, and grow the buffer once POS
;; reaches the current threshold.
565 (setf (aref buffer
(incf pos
)) octet
)
566 (when (= pos adjust-threshold
)
567 (setf adjust-threshold
(truncate (* adjust-factor
(1+ pos
))))
568 (setf buffer
(adjust-array buffer adjust-threshold
))))
;; Error function: signal the condition named by SYMBOL.
;; NOTE(review): the :STRING initarg receives BUFFER (the octet vector), not
;; STRING; the report of OCTET-ENCODING-ERROR applies CHAR to that slot, so
;; this looks like a bug -- confirm and pass STRING instead.
571 (error symbol
:string buffer
572 :start start
:end end
574 :external-format
(ef-name ef
))
576 :report
"Supply a replacement character."
577 :interactive read-replacement-char
579 (use-standard-unicode-replacement ()
580 :report
"Use standard UCS replacement character"
581 (code-char +replacement-char
+))
583 :report
"Stop decoding and return to last good offset."
;; Main loop: remember the positions before each character, then encode one
;; character.
586 (loop :while
(< ptr end
)
587 :do
(setf oldpos pos oldptr ptr
)
588 (char-to-octets ef
#'input
#'output
#'error-fn
(- end ptr
))))
589 :exit
(return-from string-to-octets
(shrink-vector buffer
(1+ pos
))))))