1 ;; -*- Mode: Lisp; Syntax: ANSI-Common-Lisp -*-
3 ;; This code is free software; you can redistribute it and/or
4 ;; modify it under the terms of the version 2.1 of
5 ;; the GNU Lesser General Public License as published by
6 ;; the Free Software Foundation, as clarified by the
7 ;; preamble found here:
8 ;; http://opensource.franz.com/preamble.html
10 ;; This program is distributed in the hope that it will be useful,
11 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
12 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 ;; GNU General Public License for more details.
15 ;; You should have received a copy of the GNU Lesser General
16 ;; Public License along with this library; if not, write to the
17 ;; Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
18 ;; Boston, MA 02110-1301, USA
;; Everything below is read in the IO.ENCODINGS package.
20 (in-package :io.encodings
)
;; File-wide compilation policy: full safety and debug information, no
;; speed or space optimisation.  The converter bodies further down carry
;; their own local OPTIMIZE declarations.
22 (declaim (optimize (speed 0) (space 0) (safety 3) (debug 3)))
24 ;; Mostly taken from SBCL's sb-simple-streams contrib
25 ;;; **********************************************************************
26 ;;; This code was written by Paul Foley
29 ;;; Sbcl port by Rudi Schlatte.
;; Base condition for failures while turning characters into octets.
;;
;; Slots:
;;   STRING          - the string being encoded.
;;   START, END      - bounds of the region being encoded.
;;   POSITION        - index into STRING of the offending character.
;;   EXTERNAL-FORMAT - designator of the target external format.
;;
;; The historical reader names use the prefix OCTETS-ENCODING-ERROR- while
;; the START/END accessors use OCTET-ENCODING-ERROR-.  Both spellings are
;; provided for STRING, POSITION and EXTERNAL-FORMAT so that callers can use
;; one consistent prefix; the old names are kept for compatibility.
(define-condition octet-encoding-error (error)
  ((string :initarg :string
           :reader octets-encoding-error-string
           :reader octet-encoding-error-string)
   (start :initarg :start
          :accessor octet-encoding-error-start)
   (end :initarg :end
        :accessor octet-encoding-error-end)
   (position :initarg :position
             :reader octets-encoding-error-position
             :reader octet-encoding-error-position)
   (external-format :initarg :external-format
                    :reader octets-encoding-error-external-format
                    :reader octet-encoding-error-external-format))
  (:report
   (lambda (c s)
     ;; The message shows the character code (not the glyph) of the
     ;; character found at POSITION in STRING.
     (format s "Unable to encode character ~A as ~S."
             (char-code (char (octets-encoding-error-string c)
                              (octets-encoding-error-position c)))
             (octets-encoding-error-external-format c)))))
;; Requested from ERROR-FN by the character-to-octet converters in this file
;; when a character cannot be written in the target external format.
(define-condition illegal-character (octet-encoding-error)
  ()
  (:documentation
   "A character cannot be represented in the target external format."))
;; Base condition for failures while turning octets into characters.
;;
;; Slots (each with an accessor):
;;   ARRAY           - the octets being decoded; the report reads it with
;;                     CFFI:MEM-AREF as :UINT8, i.e. as foreign memory.
;;   START, END      - bounds of the region being decoded.
;;   POSITION        - index of the offending octet
;;                     (OCTET-DECODING-BAD-BYTE-POSITION).
;;   EXTERNAL-FORMAT - designator of the source external format.
;;
;; The report message names the external format, the byte position and the
;; octet value found at that position.
47 (define-condition octet-decoding-error
(error)
48 ((array :initarg
:array
:accessor octet-decoding-error-array
)
49 (start :initarg
:start
:accessor octet-decoding-error-start
)
50 (end :initarg
:end
:accessor octet-decoding-error-end
)
51 (position :initarg
:position
:accessor octet-decoding-bad-byte-position
)
52 (external-format :initarg
:external-format
53 :accessor octet-decoding-error-external-format
))
56 (format s
"Illegal ~A character starting at byte position ~D: ~A."
57 (octet-decoding-error-external-format c
)
58 (octet-decoding-bad-byte-position c
)
59 (cffi:mem-aref
(octet-decoding-error-array c
) :uint8
60 (octet-decoding-bad-byte-position c
))))))
;; Specialised decoding errors.  The decoders in this file hand these symbols
;; to their ERROR-FN argument; none of them adds slots to its parent.

(define-condition end-of-input-in-character (octet-decoding-error)
  ()
  (:documentation
   "Fewer octets are left in the input than the current character needs."))

(define-condition malformed-multibyte-sequence (octet-decoding-error)
  ()
  (:documentation
   "Parent of the errors describing a badly formed multi-octet sequence."))

(define-condition invalid-starter-octet (malformed-multibyte-sequence)
  ()
  (:documentation
   "The first unit of a sequence is not a valid sequence start."))

(define-condition invalid-continuation-octet (malformed-multibyte-sequence)
  ()
  (:documentation
   "A unit following the first one is not a valid continuation."))

(define-condition overlong-octet-sequence (malformed-multibyte-sequence)
  ()
  (:documentation
   "A sequence uses more octets than its code point requires."))

(define-condition illegal-code-point (octet-decoding-error)
  ()
  (:documentation
   "The decoded value is not an acceptable code point for the format."))
(deftype line-terminator ()
  "One of the keywords :UNIX, :MAC or :DOS."
  '(member :unix :mac :dos))
(defvar *default-external-format* :ascii
  "External format substituted by FIND-EXTERNAL-FORMAT for the name :DEFAULT.")

;; Also defined at compile time: DEFINE-EXTERNAL-FORMAT refers to this
;; variable in the default of its LINE-TERMINATOR keyword, so the variable
;; must exist while that macro's lambda list is processed.
(eval-when (:compile-toplevel :load-toplevel :execute)
  (defvar *default-line-terminator* :unix
    "Line terminator used when none is given explicitly."))

(defvar *external-formats* (make-hash-table :test 'eql)
  "Maps the name of an external format to its EXTERNAL-FORMAT object.")

(defvar *external-format-aliases* (make-hash-table :test 'eql)
  "Maps an alias keyword to the name of the external format it stands for.")

(defvar *external-format-list* nil
  "Names of the registered external formats, in registration order.")
;; Structure describing one external format.
;;
;; Instances are printed by %PRINT-EXTERNAL-FORMAT and built with
;; %MAKE-EXTERNAL-FORMAT.  Slots:
;;   NAME            - keyword, read-only.
;;   LINE-TERMINATOR - keyword.
;;   OCTETS-TO-CHAR  - function, read-only; the decoder.
;;   CHAR-TO-OCTETS  - function, read-only; the encoder.
;;   OCTET-SIZE      - real.
;; Every slot's initform is (MISSING-ARG), a helper defined outside this
;; section -- presumably it signals an error when the slot is not supplied;
;; TODO confirm.
86 (defstruct (external-format
88 (:print-function %print-external-format
)
89 (:constructor %make-external-format
(name
94 (name (missing-arg) :type keyword
:read-only t
)
95 (line-terminator (missing-arg) :type keyword
)
96 (octets-to-char (missing-arg) :type function
:read-only t
)
97 (char-to-octets (missing-arg) :type function
:read-only t
)
98 (octet-size (missing-arg) :type real
))
(defun %print-external-format (ef stream depth)
  "Print function for EXTERNAL-FORMAT structures.
Writes EF to STREAM in unreadable #<...> notation: the type, then the
format's name (as by PRINC) and its line terminator (as by PRIN1),
separated by a space.  DEPTH is ignored."
  (declare (ignore depth))
  (print-unreadable-object (ef stream :type t :identity nil)
    (princ (ef-name ef) stream)
    (write-char #\Space stream)
    (prin1 (ef-line-terminator ef) stream)))
;; Build a new EXTERNAL-FORMAT object derived from the format designated by
;; NAME (resolved with FIND-EXTERNAL-FORMAT).
;;
;;   NEW-NAME        - name for the new object; defaults to the base
;;                     format's name.
;;   LINE-TERMINATOR - must satisfy the LINE-TERMINATOR type; defaults to
;;                     *DEFAULT-LINE-TERMINATOR*.
;;
;; The decoder and encoder functions are taken unchanged from the base
;; format.  An octet size is tested against the range 1..4.
;; NOTE(review): confirm which value is used when it falls outside 1..4.
106 (defun make-external-format (name &key new-name
107 (line-terminator *default-line-terminator
*)
109 (check-type line-terminator line-terminator
)
110 (let ((ef (find-external-format name
)))
111 (%make-external-format
112 (or new-name
(ef-name ef
))
113 (or line-terminator
(ef-line-terminator ef
))
114 (if (and octet-size
(<= 1 octet-size
4))
117 (ef-octets-to-char ef
)
118 (ef-char-to-octets ef
))))
;; Type used in this file for buffer offsets and remaining-element counts
;; (the START/END arguments and the BYTES-LEFT/CHARS-LEFT converter
;; arguments are declared with it).
126 (deftype buffer-index
()
;; Macro: register an external format.
;;
;; The expansion stores EF under NAME in *EXTERNAL-FORMATS*, appends NAME to
;; the end of *EXTERNAL-FORMAT-LIST*, and for every element of ALIASES
;; (spliced in quoted, so the list is not evaluated) asserts that it is a
;; keyword and maps it to NAME in *EXTERNAL-FORMAT-ALIASES*.
;; The loop variable of the expansion is a gensym.
129 (defmacro add-external-format
(name aliases ef
)
130 (let (($alias$
(gensym "ALIAS")))
132 (setf (gethash ,name
*external-formats
*) ,ef
)
133 (setf *external-format-list
* (append *external-format-list
* (list ,name
)))
134 (dolist (,$alias$
',aliases
)
135 (assert (keywordp ,$alias$
))
136 (setf (gethash ,$alias$
*external-format-aliases
*) ,name
)))))
;; Macro: define an external format and register it.
;;
;;   NAME            - name of the format.
;;   ALIASES         - list of alias keywords, passed on to
;;                     ADD-EXTERNAL-FORMAT.
;;   OCTET-SIZE      - value for the OCTET-SIZE slot.
;;   OCTETS-TO-CHAR  - form producing the decoder function.
;;   CHAR-TO-OCTETS  - form producing the encoder function.
;;   LINE-TERMINATOR - defaults to *DEFAULT-LINE-TERMINATOR*.
;;
;; Inside the expansion two local macros are available to the converter
;; forms:
;;   TO-CHAR   wraps a body in a lambda (INPUT OUTPUT ERROR-FN BYTES-LEFT),
;;             where INPUT returns an octet and OUTPUT accepts a character.
;;   TO-OCTETS wraps a body in a lambda (INPUT OUTPUT ERROR-FN CHARS-LEFT),
;;             where INPUT returns a character and OUTPUT accepts an octet.
;; In both, ERROR-FN is declared to take a symbol and return a character,
;; the last argument is a BUFFER-INDEX, and all four are declared IGNORABLE.
138 (defmacro define-external-format
(name aliases octet-size octets-to-char char-to-octets
139 &key
(line-terminator *default-line-terminator
*))
140 (let (($ef$
(gensym "EF")))
141 `(macrolet ((to-char (&body body
)
142 `(lambda (input output error-fn bytes-left
)
143 (declare (type (function () octet
) input
)
144 (type (function (character) t
) output
)
145 (type (function (symbol) character
) error-fn
)
146 (type buffer-index bytes-left
)
147 (ignorable input output error-fn bytes-left
))
149 (to-octets (&body body
)
150 `(lambda (input output error-fn chars-left
)
151 (declare (type (function () character
) input
)
152 (type (function (octet) t
) output
)
153 (type (function (symbol) character
) error-fn
)
154 (type buffer-index chars-left
)
155 (ignorable input output error-fn chars-left
))
;; Build the format object and register it under NAME and ALIASES.
157 (let ((,$ef$
(%make-external-format
,name
,line-terminator
,octet-size
158 ,octets-to-char
,char-to-octets
)))
159 (add-external-format ,name
,aliases
,$ef$
)))))
;; Resolve NAME to an EXTERNAL-FORMAT object.
;;
;; An EXTERNAL-FORMAT instance is returned as is.  The name :DEFAULT is
;; replaced by *DEFAULT-EXTERNAL-FORMAT*; NAME is converted with
;; IOLIB-UTILS:ENSURE-KEYWORD.  The result is looked up first in
;; *EXTERNAL-FORMATS*, then through *EXTERNAL-FORMAT-ALIASES*.  When nothing
;; is found an error is signalled if ERROR-P is true (the default),
;; otherwise NIL is returned.
161 (defun find-external-format (name &optional
(error-p t
))
162 (when (external-format-p name
)
163 (return-from find-external-format name
))
165 (when (eq name
:default
)
166 (setq name
*default-external-format
*))
168 (setf name
(iolib-utils:ensure-keyword name
)))
170 (or (gethash name
*external-formats
*)
171 (gethash (gethash name
*external-format-aliases
*)
173 (if error-p
(error "External format ~S not found." name
) nil
)))
;; Error signalled by the converters of the :VOID external format.  It has
;; no slots; its report is a fixed message.
181 (define-condition void-external-format
(error) ()
183 (lambda (condition stream
)
184 (declare (ignore condition
))
185 (format stream
"Attempting I/O through void external-format."))))
;; The :VOID external format: no aliases, octet size 0.  Both of its
;; converters signal VOID-EXTERNAL-FORMAT.
187 (define-external-format :void
() 0
189 (error 'void-external-format
))
191 (error 'void-external-format
)))
;; The :ASCII external format (alias :US-ASCII), octet size 1.
;;
;; Decoder: reads one octet and either outputs the character found at that
;; index in +ISO-8859-1-TABLE+ or outputs the replacement character returned
;; by ERROR-FN for ILLEGAL-CODE-POINT.
;; Encoder: reads one character and either outputs its character code or
;; outputs the code of the replacement returned by ERROR-FN for
;; ILLEGAL-CHARACTER.
;; Both converters are compiled with SPEED 3 / SAFETY 0.
193 (define-external-format :ascii
(:us-ascii
) 1
195 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
196 (let ((code (funcall input
)))
198 (funcall output
(aref +iso-8859-1-table
+ code
))
199 (funcall output
(funcall error-fn
'illegal-code-point
)))))
201 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
202 (let ((code (char-code (funcall input
))))
204 (funcall output code
)
205 (funcall output
(char-code (funcall error-fn
'illegal-character
)))))))
;; The :ISO-8859-1 external format, octet size 1, with the aliases listed
;; below.
;;
;; Decoder: reads one octet and outputs the character found at that index
;; in +ISO-8859-1-TABLE+.
;; Encoder: reads one character and either outputs its character code or
;; outputs the code of the replacement returned by ERROR-FN for
;; ILLEGAL-CHARACTER.
207 (define-external-format :iso-8859-1
(:iso8859-1
:ISO_8859-1
:latin1
:l1
208 :csISOLatin1
:iso-ir-100
:CP819
) 1
210 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
211 (let ((code (funcall input
)))
212 (funcall output
(aref +iso-8859-1-table
+ code
))))
214 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
215 (let ((code (char-code (funcall input
))))
217 (funcall output code
)
218 (funcall output
(char-code (funcall error-fn
'illegal-character
)))))))
;; Macro: generate one DEFINE-EXTERNAL-FORMAT form per element of INDEXES.
;;
;; For each element the local function GET-NAME-AND-ALIASES yields an index
;; string and a list of aliases.  From the index the macro derives
;;   - the table symbol  +ISO-8859-<index>-TABLE+,
;;   - the format name   :ISO-8859-<index>,
;;   - an extra keyword  :ISO8859-<index>, which is PUSHed (presumably onto
;;     the alias list -- TODO confirm).
;; The generated format has octet size 1.  Its decoder outputs the table
;; element at the octet's index; its encoder outputs the POSITION of the
;; character in the same table, and uses ERROR-FN with ILLEGAL-CHARACTER to
;; obtain a replacement.
220 (defmacro define-iso-8859-external-formats
(indexes)
221 (flet ((get-name-and-aliases (index)
227 ,@(loop :for i
:in indexes
229 (multiple-value-bind (index aliases
) (get-name-and-aliases i
)
230 (let ((table (iolib-utils:concat-symbol
"+iso-8859-" index
"-table+"))
231 (name (iolib-utils:ensure-keyword
232 (concatenate 'string
"ISO-8859-" index
))))
233 (push (iolib-utils:ensure-keyword
234 (concatenate 'string
"ISO8859-" index
))
236 `(define-external-format ,name
,aliases
1
238 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
239 (let ((code (funcall input
)))
240 (funcall output
(aref ,table code
))))
242 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
243 (let ((code (position (funcall input
) ,table
)))
245 (funcall output code
)
247 (position (funcall error-fn
'illegal-character
)
;; Instantiate the table-driven ISO-8859 formats.  Each entry starts with the
;; part number as a string; the remaining elements are alias keywords.
(define-iso-8859-external-formats
    (("2"  :ISO_8859-2  :latin2 :l2 :csISOLatin2 :iso-ir-101)
     ("3"  :ISO_8859-3  :latin3 :l3 :csISOLatin3 :iso-ir-109)
     ("4"  :ISO_8859-4  :latin4 :l4 :csISOLatin4 :iso-ir-110)
     ("5"  :ISO_8859-5  :cyrillic :csISOLatinCyrillic :iso-ir-144)
     ("6"  :ISO_8859-6  :arabic :csISOLatinArabic :iso-ir-127)
     ("7"  :ISO_8859-7  :greek :greek8 :csISOLatinGreek :iso-ir-126)
     ("8"  :ISO_8859-8  :hebrew :csISOLatinHebrew :iso-ir-138)
     ("9"  :ISO_8859-9  :latin5 :l5 :csISOLatin5 :iso-ir-148)
     ("10" :ISO_8859-10 :latin6 :l6 :csISOLatin6 :iso-ir-157)
     ("11" :ISO_8859-11 :thai :csISOLatinThai :iso-ir-166)
     ("13" :ISO_8859-13 :baltic :csISOLatinBaltic :iso-ir-179)
     ("14" :ISO_8859-14 :iso-celtic :latin8 :l8 :csISOLatinCeltic :iso-ir-199)
     ("15" :ISO_8859-15 :latin9 :l9 :csISOLatin9 :iso-ir-203)
     ("16" :ISO_8859-16 :latin10 :l10 :csISOLatin10 :iso-ir-226)))
;; Upper bound used by ILLEGAL-UNICODE-CODE-POINT: any code above this value
;; is rejected.
(iolib-utils:define-constant +max-unicode-code-point+ #x10FFFF)
;; ILLEGAL-UNICODE-CODE-POINT is declaimed INLINE; it is called from the
;; converter bodies, which are compiled for speed.
268 (declaim (inline illegal-unicode-code-point
))
;; Predicate on a code value CODE (declared (UNSIGNED-BYTE 32)).  Among the
;; alternatives tested: CODE lies in the range #xD800-#xDFFF, or CODE is
;; greater than +MAX-UNICODE-CODE-POINT+.
269 (defun illegal-unicode-code-point (code)
270 (declare (type (unsigned-byte 32) code
))
271 (or (<= #xD800 code
#xDFFF
)
274 (> code
+max-unicode-code-point
+)))
;; The :UTF-8 external format (alias :UTF8), registered with octet size 2.
;;
;; Decoder outline:
;;   - Read the first octet and derive the sequence length (1-4) from its
;;     high bits (UTF-8-BYTE-LEN); an unrecognised pattern is reported as
;;     INVALID-STARTER-OCTET.
;;   - If fewer than that many octets remain (BYTES-LEFT), report
;;     END-OF-INPUT-IN-CHARACTER.
;;   - Read the remaining octets, checking that each has the form 10xxxxxx
;;     (INVALID-CONTINUATION-OCTET otherwise), reject overlong encodings
;;     (OVERLONG-OCTET-SEQUENCE), assemble the code and output CODE-CHAR of
;;     it.
;;   DECODE-ERR outputs the replacement character returned by ERROR-FN and
;;   leaves the decoder through the UTF-8-DECODE block.
;;
;; Encoder outline: take the character code (replaced via ERROR-FN with
;; ILLEGAL-CHARACTER when ILLEGAL-UNICODE-CODE-POINT is true) and output one
;; to four octets with the lead bytes #xC0 / #xE0 / #xF0 and continuation
;; bytes #x80 | 6 bits.
276 (define-external-format :utf-8
(:utf8
) 2
278 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
280 (let ((code 0) (bytes-needed nil
)
283 (declare (type octet byte0 byte1 byte2 byte3
))
284 (labels ((decode-err (sym)
285 (return-from utf-8-decode
286 (funcall output
(funcall error-fn sym
))))
287 (utf-8-byte-len (code)
288 (declare (type octet code
))
290 ((not (logbitp 7 code
)) 1)
291 ((= (logand code
#b11100000
) #b11000000
) 2)
292 ((= (logand code
#b11110000
) #b11100000
) 3)
293 ((= (logand code
#b11111000
) #b11110000
) 4)
294 (t (decode-err 'invalid-starter-octet
))))
295 (valid-secondary-check (byte)
296 (or (= (logand byte
#b11000000
) #b10000000
)
297 (decode-err 'invalid-continuation-octet
)))
;; Overlong test: the sequence is overlong when the first octet equals
;; STARTER and the second octet has no bits set outside MASK.
298 (overlong-check (starter mask
)
299 (or (/= starter byte0
)
300 (/= (logior byte1 mask
) mask
)
301 (decode-err 'overlong-octet-sequence
))))
;; Expands, for each place, into "read the next octet into it and check
;; that it is a continuation octet".
302 (macrolet ((put-and-check-valid-secondary-bytes (&rest places
)
303 `(progn ,@(reduce #'append places
304 :key
#'(lambda (x) `((setf ,x
(funcall input
))
305 (valid-secondary-check ,x
)))))))
306 (setf byte0
(funcall input
)
307 bytes-needed
(utf-8-byte-len byte0
))
308 (when (< bytes-left bytes-needed
)
309 (decode-err 'end-of-input-in-character
))
311 (1 (setf code byte0
))
312 (2 (put-and-check-valid-secondary-bytes byte1
)
313 (overlong-check #b11000000
#b10111111
)
314 (overlong-check #b11000001
#b10111111
)
315 (setf code
(logior (ash (ldb (byte 5 0) byte0
) 6)
316 (ldb (byte 6 0) byte1
))))
317 (3 (put-and-check-valid-secondary-bytes byte1 byte2
)
318 (overlong-check #b11100000
#b10011111
)
319 (setf code
(logior (ash (ldb (byte 4 0) byte0
) 12)
320 (ash (ldb (byte 6 0) byte1
) 6)
321 (ldb (byte 6 0) byte2
)))
322 (when (illegal-unicode-code-point code
)
323 (decode-err 'illegal-code-point
)))
;; NOTE(review): unlike the 3-octet case above, the 4-octet case does not
;; call ILLEGAL-UNICODE-CODE-POINT on the assembled code.  UTF-8-BYTE-LEN
;; accepts every first octet of the form 11110xxx, so values above
;; +MAX-UNICODE-CODE-POINT+ can reach CODE-CHAR here -- confirm whether a
;; check is intended.
324 (4 (put-and-check-valid-secondary-bytes byte1 byte2 byte3
)
325 (overlong-check #b11110000
#b10001111
)
326 (setf code
(logior (ash (ldb (byte 3 0) byte0
) 18)
327 (ash (ldb (byte 6 0) byte1
) 12)
328 (ash (ldb (byte 6 0) byte2
) 6)
329 (ldb (byte 6 0) byte3
)))))
330 (funcall output
(code-char code
)))))))
332 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
333 (let ((code (char-code (funcall input
))))
334 (when (illegal-unicode-code-point code
)
335 (setf code
(char-code (funcall error-fn
'illegal-character
))))
338 (funcall output code
))
340 (funcall output
(logior #xC0
(ldb (byte 5 6) code
)))
341 (funcall output
(logior #x80
(ldb (byte 6 0) code
))))
343 (funcall output
(logior #xE0
(ldb (byte 4 12) code
)))
344 (funcall output
(logior #x80
(ldb (byte 6 6) code
)))
345 (funcall output
(logior #x80
(ldb (byte 6 0) code
))))
347 (funcall output
(logior #xF0
(ldb (byte 3 18) code
)))
348 (funcall output
(logior #x80
(ldb (byte 6 12) code
)))
349 (funcall output
(logior #x80
(ldb (byte 6 6) code
)))
350 (funcall output
(logior #x80
(ldb (byte 6 0) code
))))))))
;; The :UTF-16 external format (aliases :UTF16, :UTF-16BE, :UTF16BE), octet
;; size 2.  Words are read and written most significant octet first.
;;
;; Decoder: PUT-WORD reports END-OF-INPUT-IN-CHARACTER when more octets are
;; needed than BYTES-LEFT provides, otherwise reads one 16-bit word.  A first
;; word W0 is tested against the range #xD800-#xDFFF; a second word W1 must
;; lie in #xDC00-#xDFFF to be combined with the low ten bits of W0.
;; INVALID-STARTER-OCTET and INVALID-CONTINUATION-OCTET are reported for bad
;; first / second words.  DECODE-ERR outputs the replacement returned by
;; ERROR-FN and leaves through the UTF-16-DECODE block.
;;
;; Encoder: the character code is replaced via ERROR-FN (ILLEGAL-CHARACTER)
;; when ILLEGAL-UNICODE-CODE-POINT is true.  Codes of #x10000 and above are
;; reduced by #x10000 and written as two words, #xD800 | high ten bits and
;; #xDC00 | low ten bits.
352 (define-external-format :utf-16
(:utf16
:utf-16be
:utf16be
) 2
354 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
357 (+ (ash (funcall input
) 8) (funcall input
)))
359 (return-from utf-16-decode
360 (funcall output
(funcall error-fn sym
)))))
361 (macrolet ((put-word (word bytes-needed
)
362 `(progn (when (> ,bytes-needed bytes-left
)
363 (decode-err 'end-of-input-in-character
))
364 (setf ,word
(read-word)))))
365 (let ((code 0) (w0 0) (w1 0))
366 (declare (type (unsigned-byte 32) code
)
367 (type (unsigned-byte 16) w0 w1
))
369 (cond ((not (<= #xD800 w0
#xDFFF
))
372 (decode-err 'invalid-starter-octet
))
374 (if (<= #xDC00 w1
#xDFFF
)
375 (setf code
(+ (ash (ldb (byte 10 0) w0
) 10)
378 (decode-err 'invalid-continuation-octet
))))
379 (funcall output
(code-char code
)))))))
381 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
;; WRITE-WORD emits the high octet, then the low octet.
382 (flet ((write-word (word)
383 (funcall output
(ldb (byte 8 8) word
))
384 (funcall output
(ldb (byte 8 0) word
))))
385 (let ((code (char-code (funcall input
))))
386 (when (illegal-unicode-code-point code
)
387 (setf code
(char-code (funcall error-fn
'illegal-character
))))
388 (cond ((< code
#x10000
)
390 (t (decf code
#x10000
)
391 (write-word (logior #xD800
(ldb (byte 10 10) code
)))
392 (write-word (logior #xDC00
(ldb (byte 10 0) code
)))))))))
;; The :UTF-16LE external format (alias :UTF16LE), octet size 2.  Words are
;; read and written least significant octet first; apart from that the code
;; mirrors the :UTF-16 format.
;;
;; Decoder: PUT-WORD reports END-OF-INPUT-IN-CHARACTER when more octets are
;; needed than BYTES-LEFT provides, otherwise reads one 16-bit word.  A first
;; word W0 is tested against the range #xD800-#xDFFF; a second word W1 must
;; lie in #xDC00-#xDFFF to be combined with the low ten bits of W0.
;; DECODE-ERR outputs the replacement returned by ERROR-FN and leaves through
;; a block that is also named UTF-16-DECODE here.
;;
;; Encoder: the character code is replaced via ERROR-FN (ILLEGAL-CHARACTER)
;; when ILLEGAL-UNICODE-CODE-POINT is true.  Codes of #x10000 and above are
;; reduced by #x10000 and written as two words, #xD800 | high ten bits and
;; #xDC00 | low ten bits.
394 (define-external-format :utf-16le
(:utf16le
) 2
396 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
399 (+ (funcall input
) (ash (funcall input
) 8)))
401 (return-from utf-16-decode
402 (funcall output
(funcall error-fn sym
)))))
403 (macrolet ((put-word (word bytes-needed
)
404 `(progn (when (> ,bytes-needed bytes-left
)
405 (decode-err 'end-of-input-in-character
))
406 (setf ,word
(read-word)))))
407 (let ((code 0) (w0 0) (w1 0))
408 (declare (type (unsigned-byte 32) code
)
409 (type (unsigned-byte 16) w0 w1
))
411 (cond ((not (<= #xD800 w0
#xDFFF
))
414 (decode-err 'invalid-starter-octet
))
416 (if (<= #xDC00 w1
#xDFFF
)
417 (setf code
(+ (ash (ldb (byte 10 0) w0
) 10)
420 (decode-err 'invalid-continuation-octet
))))
421 (funcall output
(code-char code
)))))))
423 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
;; WRITE-WORD emits the low octet, then the high octet.
424 (flet ((write-word (word)
425 (funcall output
(ldb (byte 8 0) word
))
426 (funcall output
(ldb (byte 8 8) word
))))
427 (let ((code (char-code (funcall input
))))
428 (when (illegal-unicode-code-point code
)
429 (setf code
(char-code (funcall error-fn
'illegal-character
))))
430 (cond ((< code
#x10000
)
432 (t (decf code
#x10000
)
433 (write-word (logior #xD800
(ldb (byte 10 10) code
)))
434 (write-word (logior #xDC00
(ldb (byte 10 0) code
)))))))))
438 ;;; CONVERSION FUNCTIONS
;; Macro: expands into a FUNCALL of the decoder stored in EXTERNAL-FORMAT
;; (its OCTETS-TO-CHAR slot), passing INPUT, OUTPUT and ERROR-FN.
446 (defmacro octets-to-char
(external-format input output error-fn bytes-left
)
447 `(funcall (ef-octets-to-char ,external-format
) ,input
,output
,error-fn
(defun read-replacement-char ()
  "Prompt on *QUERY-IO* for a replacement character.
One form is read from *QUERY-IO* and evaluated; its value is returned
wrapped in a one-element list, the shape expected from the :INTERACTIVE
function of a restart.  The value is not checked to be a character."
  ;; The form comes from the interactive user and is passed to EVAL as is.
  (let ((io *query-io*))
    (write-string "Enter a replacement character(evaluated): " io)
    (finish-output io)
    (let ((form (read io)))
      (list (eval form)))))
;; Decode the octets in BUFFER between START and END into STRING, using the
;; decoder of the external format EF.
;;
;;   BUFFER       - foreign pointer; octets are read with CFFI:MEM-AREF as
;;                  :UINT8.
;;   STRING       - destination; characters are stored with (SETF CHAR).
;;   MAX-CHAR-NUM - optional limit on the number of characters; when absent
;;                  it is set to -1, a value the character counter never
;;                  equals, i.e. no limit.
;;
;; Returns two values: (1+ POS), one more than the index of the last
;; character stored, and (- PTR START), the offset reached in BUFFER.
;;
;; Decoding problems are signalled as conditions of the class named by the
;; symbol the decoder passes to the error function, initialised with the
;; buffer, the bounds and the name of EF.  The restart messages visible here
;; are "Supply a replacement character." (interactive, via
;; READ-REPLACEMENT-CHAR), USE-STANDARD-UNICODE-REPLACEMENT and "Stop
;; decoding and return to last good offset.".
455 (defun %octets-to-string
(buffer string start end ef
&optional max-char-num
)
456 (declare (type et
:foreign-pointer buffer
)
457 (type buffer-index start end
)
458 (type external-format ef
)
459 (type (or null signed-byte
) max-char-num
)
460 (optimize (speed 3) (space 0) (safety 0) (debug 0)))
461 (unless max-char-num
(setf max-char-num -
1))
468 (prog1 (cffi:mem-aref buffer
:uint8 ptr
) (incf ptr
)))
470 (setf (char string
(incf pos
)) char
))
473 (error symbol
:array buffer
474 :start start
:end end
476 :external-format
(ef-name ef
))
478 :report
"Supply a replacement character."
479 :interactive read-replacement-char
481 (use-standard-unicode-replacement ()
482 :report
"Use standard UCS replacement character"
485 :report
"Stop decoding and return to last good offset."
;; Main loop: runs while input remains and the character limit has not
;; been reached; each decoder call is told how many octets are left.
488 (loop :while
(and (< ptr end
)
489 (/= (incf char-count
) max-char-num
))
492 (octets-to-char ef
#'input
#'output
#'error-fn
(- end ptr
))))
494 (return-from %octets-to-string
(values (1+ pos
) (- ptr start
))))))
;; Decode a sequence of octets into a string.
;;
;; OCTETS is coerced to a (SIMPLE-ARRAY OCTET (*)); START must be a
;; BUFFER-INDEX and END a BUFFER-INDEX or NIL (meaning the length of
;; OCTETS); START must not exceed END.  EXTERNAL-FORMAT defaults to :DEFAULT
;; and is resolved with FIND-EXTERNAL-FORMAT.
;;
;; A result string of (- END START) characters is allocated, filled by
;; %OCTETS-TO-STRING through a pointer to the vector's data, and finally cut
;; down with SHRINK-VECTOR to the length %OCTETS-TO-STRING reports.
;; When AUTO-CORRECT is true, every OCTET-DECODING-ERROR is handled by
;; invoking the USE-VALUE restart with #\?.
496 (defun octets-to-string (octets
498 (external-format :default
)
500 (setf octets
(coerce octets
'(simple-array octet
(*))))
501 (check-type start buffer-index
)
502 (check-type end
(or null buffer-index
))
503 (let ((ef (find-external-format external-format
))
504 (end (or end
(length octets
)))
506 (assert (<= start end
))
507 (setf string
(make-string (- end start
)))
508 (cffi:with-pointer-to-vector-data
(octets-ptr octets
)
509 (let ((pos (if auto-correct
510 (handler-bind ((octet-decoding-error
512 (declare (ignore error
))
513 (invoke-restart 'use-value
#\?))))
514 (%octets-to-string octets-ptr string start end ef
))
515 (%octets-to-string octets-ptr string start end ef
))))
516 (shrink-vector string pos
)))))
;; Macro: expands into a FUNCALL of the encoder stored in the external
;; format EF (its CHAR-TO-OCTETS slot), passing INPUT, OUTPUT and ERROR-FN.
522 (defmacro char-to-octets
(ef input output error-fn chars-left
)
523 `(funcall (ef-char-to-octets ,ef
) ,input
,output
,error-fn
;; Encode the characters of STRING between START (default 0) and END
;; (default: the length of STRING) into a vector of octets, using
;; EXTERNAL-FORMAT (default :DEFAULT, resolved with FIND-EXTERNAL-FORMAT).
;;
;; The output buffer starts with (1+ (LENGTH STRING)) elements.  Whenever
;; the write position reaches ADJUST-THRESHOLD the buffer is enlarged with
;; ADJUST-ARRAY to (TRUNCATE (* ADJUST-FACTOR (1+ POS))) elements.
;; ADJUST-FACTOR may be NIL or a real and is tested against the range 1..4.
;; NOTE(review): confirm what is used when it is outside that range.
;;
;; Returns the buffer cut down with SHRINK-VECTOR to (1+ POS) elements, one
;; more than the index of the last octet written.
;;
;; Encoding problems are signalled as conditions of the class named by the
;; symbol the encoder passes to the error function.  The restart messages
;; visible here are "Supply a replacement character." (interactive, via
;; READ-REPLACEMENT-CHAR), USE-STANDARD-UNICODE-REPLACEMENT and "Stop
;; decoding and return to last good offset.".
526 (defun string-to-octets (string &key
(start 0) end
527 (external-format :default
)
529 (declare (type string string
)
530 (type buffer-index start
)
531 (type (or null buffer-index
) end
)
532 (type (or null real
) adjust-factor
)
533 (optimize (speed 3) (space 0) (safety 0) (debug 0)))
534 (let* ((ef (find-external-format external-format
))
535 (buffer (make-array (1+ (length string
))
538 (adjust-threshold (length string
))
541 (setf adjust-factor
(if (and adjust-factor
(<= 1 adjust-factor
4))
544 end
(or end
(length string
)))
547 (prog1 (char string ptr
) (incf ptr
)))
549 (setf (aref buffer
(incf pos
)) octet
)
550 (when (= pos adjust-threshold
)
551 (setf adjust-threshold
(truncate (* adjust-factor
(1+ pos
))))
552 (setf buffer
(adjust-array buffer adjust-threshold
))))
;; NOTE(review): the :STRING initarg below receives BUFFER, the octet
;; output array, not STRING.  The report of OCTET-ENCODING-ERROR applies
;; CHAR to that slot, so this looks like it should be STRING -- confirm.
555 (error symbol
:string buffer
556 :start start
:end end
558 :external-format
(ef-name ef
))
560 :report
"Supply a replacement character."
561 :interactive read-replacement-char
563 (use-standard-unicode-replacement ()
564 :report
"Use standard UCS replacement character"
567 :report
"Stop decoding and return to last good offset."
;; Main loop: remember the positions before each character, then encode
;; one character, telling the encoder how many characters are left.
570 (loop :while
(< ptr end
)
571 :do
(setf oldpos pos oldptr ptr
)
572 (char-to-octets ef
#'input
#'output
#'error-fn
(- end ptr
))))
573 :exit
(return-from string-to-octets
(shrink-vector buffer
(1+ pos
))))))