Minor fixes.
[iolib.git] / io.encodings / external-format.lisp
blobabb933d9ae736482e6f69fa5ddf5f6c0eb56c617
1 ;; -*- Mode: Lisp; Syntax: ANSI-Common-Lisp -*-
3 ;; This code is free software; you can redistribute it and/or
4 ;; modify it under the terms of the version 2.1 of
5 ;; the GNU Lesser General Public License as published by
6 ;; the Free Software Foundation, as clarified by the
7 ;; preamble found here:
8 ;; http://opensource.franz.com/preamble.html
9 ;;
10 ;; This program is distributed in the hope that it will be useful,
11 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
12 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 ;; GNU General Public License for more details.
15 ;; You should have received a copy of the GNU Lesser General
16 ;; Public License along with this library; if not, write to the
17 ;; Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
18 ;; Boston, MA 02110-1301, USA
20 (in-package :io.encodings)
22 (declaim (optimize (speed 0) (space 0) (safety 3) (debug 3)))
24 ;; Mostly taken from SBCL's sb-simple-streams contrib
25 ;;; **********************************************************************
26 ;;; This code was written by Paul Foley
27 ;;;
29 ;;; Sbcl port by Rudi Schlatte.
(define-condition octet-encoding-error (error)
  ((string :reader octets-encoding-error-string
           :initarg :string)
   (start :accessor octet-encoding-error-start
          :initarg :start)
   (end :accessor octet-encoding-error-end
        :initarg :end)
   (position :reader octets-encoding-error-position
             :initarg :position)
   (external-format :reader octets-encoding-error-external-format
                    :initarg :external-format))
  (:documentation
   "Base condition for failures while turning characters into octets.
STRING and POSITION identify the offending character (the report indexes
STRING with POSITION); START and END are the bounds supplied by the
signaller; EXTERNAL-FORMAT is whatever the signaller passed to describe
the target encoding.")
  (:report
   (lambda (condition stream)
     ;; The report prints the character's code, not the character itself.
     (let* ((source (octets-encoding-error-string condition))
            (index (octets-encoding-error-position condition))
            (code (char-code (char source index))))
       (format stream "Unable to encode character ~A as ~S."
               code
               (octets-encoding-error-external-format condition))))))
(define-condition illegal-character (octet-encoding-error)
  ()
  (:documentation
   "Encoding error passed to the error function by the character-to-octet
converters in this file when a character is rejected by the target format."))
(define-condition octet-decoding-error (error)
  ((array :accessor octet-decoding-error-array
          :initarg :array)
   (start :accessor octet-decoding-error-start
          :initarg :start)
   (end :accessor octet-decoding-error-end
        :initarg :end)
   (position :accessor octet-decoding-bad-byte-position
             :initarg :position)
   (external-format :accessor octet-decoding-error-external-format
                    :initarg :external-format))
  (:documentation
   "Base condition for failures while turning octets into characters.
ARRAY is read by the report through CFFI:MEM-AREF, so it is expected to be
a foreign pointer; POSITION is the index of the octet the report shows.")
  (:report
   (lambda (condition stream)
     (let ((format-name (octet-decoding-error-external-format condition))
           (bad-index (octet-decoding-bad-byte-position condition)))
       (format stream "Illegal ~A character starting at byte position ~D: ~A."
               format-name
               bad-index
               (cffi:mem-aref (octet-decoding-error-array condition) :uint8
                              (octet-decoding-bad-byte-position condition)))))))
;; Specialisations of OCTET-DECODING-ERROR.  The decoders in this file pass
;; these symbols to their error function to say what went wrong.

(define-condition end-of-input-in-character (octet-decoding-error)
  ()
  (:documentation
   "Fewer octets remain in the input than the current character needs."))

(define-condition malformed-multibyte-sequence (octet-decoding-error)
  ()
  (:documentation
   "Parent of the errors describing a structurally invalid octet sequence."))

(define-condition invalid-starter-octet (malformed-multibyte-sequence)
  ()
  (:documentation
   "The first unit of a sequence cannot start a character."))

(define-condition invalid-continuation-octet (malformed-multibyte-sequence)
  ()
  (:documentation
   "A unit following the starter is not a valid continuation."))

(define-condition overlong-octet-sequence (malformed-multibyte-sequence)
  ()
  (:documentation
   "The sequence uses more octets than its value requires."))

(define-condition illegal-code-point (octet-decoding-error)
  ()
  (:documentation
   "The decoded value is not an acceptable code point for the format."))
69 ;;;
70 ;;;
71 ;;; EXTERNAL-FORMAT
72 ;;;
73 ;;;
(deftype line-terminator ()
  "Keyword naming a line-ending convention: one of :UNIX, :MAC or :DOS."
  '(member :unix :mac :dos))
(defvar *default-external-format* :ascii
  "Format name that FIND-EXTERNAL-FORMAT substitutes for :DEFAULT.")

;; Also defined at compile time: DEFINE-EXTERNAL-FORMAT reads this variable
;; as the default of a macro keyword parameter, i.e. during macroexpansion.
(eval-when (:compile-toplevel :load-toplevel :execute)
  (defvar *default-line-terminator* :unix
    "Line terminator used when none is given explicitly."))

(defvar *external-formats* (make-hash-table)
  "Maps a canonical format name to its EXTERNAL-FORMAT object.")

(defvar *external-format-aliases* (make-hash-table)
  "Maps an alias keyword to the canonical format name it stands for.")

(defvar *external-format-list* nil
  "Canonical format names, in the order they were registered.")
(defstruct (external-format
             (:conc-name ef-)
             (:constructor %make-external-format
                           (name line-terminator octet-size
                            octets-to-char char-to-octets))
             (:print-function %print-external-format))
  "Description of a character encoding: its name, line-ending convention,
the two conversion functions and a size factor (OCTET-SIZE) that
STRING-TO-OCTETS uses as the default growth factor of its output buffer."
  ;; Canonical keyword name of the format.
  (name (missing-arg) :type keyword :read-only t)
  ;; Line-ending convention; the only slot besides OCTET-SIZE left writable.
  (line-terminator (missing-arg) :type keyword)
  ;; Function (input output error-fn bytes-left) that decodes one character.
  (octets-to-char (missing-arg) :type function :read-only t)
  ;; Function (input output error-fn chars-left) that encodes one character.
  (char-to-octets (missing-arg) :type function :read-only t)
  ;; Size factor of the encoding (octets per character).
  (octet-size (missing-arg) :type real))
(defun %print-external-format (ef stream depth)
  "Print function of the EXTERNAL-FORMAT structure: writes the format name
and its line terminator inside an unreadable object.  DEPTH is ignored."
  (declare (ignore depth))
  (print-unreadable-object (ef stream :type t :identity nil)
    (let ((format-name (ef-name ef))
          (terminator (ef-line-terminator ef)))
      (format stream "~A ~S" format-name terminator))))
(defun make-external-format (name &key new-name
                                       (line-terminator *default-line-terminator*)
                                       (octet-size 1.5))
  "Return a fresh EXTERNAL-FORMAT derived from the format designated by NAME.
The copy shares the conversion functions of the original.  NEW-NAME, when
given, replaces the name.  LINE-TERMINATOR must satisfy the LINE-TERMINATOR
type.  OCTET-SIZE is used only when it lies in [1, 4]; otherwise the size
of the original format is kept."
  (check-type line-terminator line-terminator)
  (let* ((base (find-external-format name))
         (size (if (and octet-size (<= 1 octet-size 4))
                   octet-size
                   (ef-octet-size base))))
    (%make-external-format (or new-name (ef-name base))
                           (or line-terminator (ef-line-terminator base))
                           size
                           (ef-octets-to-char base)
                           (ef-char-to-octets base))))
121 ;;; UTILS
(deftype octet ()
  "An 8-bit unsigned integer."
  '(unsigned-byte 8))

(deftype buffer-index ()
  "A 24-bit unsigned integer, used for offsets and counts in buffers."
  '(unsigned-byte 24))
(defmacro add-external-format (name aliases ef)
  "Register the external format EF under NAME and under every keyword in
the literal (unevaluated) list ALIASES.
NAME and EF are each evaluated exactly once, NAME first.  NAME is appended
to *EXTERNAL-FORMAT-LIST* only when it is not already present, so that
re-evaluating a definition does not leave duplicate entries behind."
  (let ((name-var (gensym "NAME"))
        (alias-var (gensym "ALIAS")))
    `(let ((,name-var ,name))
       (setf (gethash ,name-var *external-formats*) ,ef)
       ;; Keep registration order, but never list the same name twice.
       (unless (member ,name-var *external-format-list*)
         (setf *external-format-list*
               (append *external-format-list* (list ,name-var))))
       (dolist (,alias-var ',aliases)
         (assert (keywordp ,alias-var))
         (setf (gethash ,alias-var *external-format-aliases*) ,name-var)))))
(defmacro define-external-format (name aliases octet-size octets-to-char char-to-octets
                                  &key (line-terminator *default-line-terminator*))
  "Create an external format called NAME and register it with ALIASES.
OCTETS-TO-CHAR and CHAR-TO-OCTETS are forms evaluated inside a MACROLET
that supplies two local macros:
  (TO-CHAR body...)   => lambda of (INPUT OUTPUT ERROR-FN BYTES-LEFT)
  (TO-OCTETS body...) => lambda of (INPUT OUTPUT ERROR-FN CHARS-LEFT)
Both wrap BODY after type declarations for those parameters.  The default
of LINE-TERMINATOR is read from *DEFAULT-LINE-TERMINATOR* when the macro is
expanded."
  (let ((format-var (gensym "EF")))
    `(macrolet ((to-char (&body body)
                  ;; Built with LIST* instead of a nested backquote.
                  (list* 'lambda
                         '(input output error-fn bytes-left)
                         '(declare (type (function () octet) input)
                                   (type (function (character) t) output)
                                   (type (function (symbol) character) error-fn)
                                   (type buffer-index bytes-left)
                                   (ignorable input output error-fn bytes-left))
                         body))
                (to-octets (&body body)
                  (list* 'lambda
                         '(input output error-fn chars-left)
                         '(declare (type (function () character) input)
                                   (type (function (octet) t) output)
                                   (type (function (symbol) character) error-fn)
                                   (type buffer-index chars-left)
                                   (ignorable input output error-fn chars-left))
                         body)))
       (let ((,format-var (%make-external-format ,name ,line-terminator ,octet-size
                                                 ,octets-to-char ,char-to-octets)))
         (add-external-format ,name ,aliases ,format-var)))))
(defun find-external-format (name &optional (error-p t))
  "Return the EXTERNAL-FORMAT designated by NAME.
NAME may be an EXTERNAL-FORMAT object (returned as is), the keyword
:DEFAULT (replaced by *DEFAULT-EXTERNAL-FORMAT*), a string (converted with
IOLIB-UTILS:ENSURE-KEYWORD), or the name or alias of a registered format.
When nothing matches, an error is signalled if ERROR-P is true; otherwise
NIL is returned."
  (if (external-format-p name)
      name
      (let ((key (if (eq name :default)
                     *default-external-format*
                     name)))
        (when (stringp key)
          (setf key (iolib-utils:ensure-keyword key)))
        (or (gethash key *external-formats*)
            ;; An unknown alias yields NIL, which is not a registered name.
            (gethash (gethash key *external-format-aliases*)
                     *external-formats*)
            (when error-p
              (error "External format ~S not found." key))))))
177 ;;; EXTERNAL FORMATS
(define-condition void-external-format (error)
  ()
  (:documentation
   "Signalled by both conversion functions of the :VOID external format.")
  (:report
   (lambda (condition stream)
     (declare (ignore condition))
     (format stream "Attempting I/O through void external-format."))))
;; The :VOID format performs no conversion: both directions signal
;; VOID-EXTERNAL-FORMAT as soon as they are called.
(define-external-format :void () 0
  (to-char (error 'void-external-format))
  (to-octets (error 'void-external-format)))
;; ASCII: octets and character codes below 128 map one to one; anything
;; else is replaced by whatever character the error function returns.
(define-external-format :ascii (:us-ascii) 1
  (to-char
    (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
    (let ((octet (funcall input)))
      (funcall output
               (if (< octet 128)
                   (aref +iso-8859-1-table+ octet)
                   (funcall error-fn 'illegal-code-point)))))
  (to-octets
    (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
    (let ((char-value (char-code (funcall input))))
      (funcall output
               (if (< char-value 128)
                   char-value
                   ;; The replacement's code is written out unchecked.
                   (char-code (funcall error-fn 'illegal-character)))))))
;; ISO-8859-1: every octet decodes through +ISO-8859-1-TABLE+; characters
;; with a code below 256 encode to that code.
(define-external-format :iso-8859-1 (:iso8859-1 :ISO_8859-1 :latin1 :l1
                                     :csISOLatin1 :iso-ir-100 :CP819) 1
  (to-char
    (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
    (let ((octet (funcall input)))
      (funcall output (aref +iso-8859-1-table+ octet))))
  (to-octets
    (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
    (let ((char-value (char-code (funcall input))))
      (funcall output
               (if (< char-value 256)
                   char-value
                   ;; The replacement's code is written out unchecked.
                   (char-code (funcall error-fn 'illegal-character)))))))
(defmacro define-iso-8859-external-formats (indexes)
  "Expand into one DEFINE-EXTERNAL-FORMAT form per element of INDEXES.
An element is either a string such as \"2\" or a list (\"2\" alias...) whose
tail holds extra alias keywords.  For index N the format is named
:ISO-8859-N, always gets the alias :ISO8859-N, and converts through the
table named +ISO-8859-N-TABLE+: decoding indexes the table by octet,
encoding looks the character up with POSITION."
  (flet ((get-name-and-aliases (index)
           ;; CONSP rather than ENDP: ENDP signals a type error for a bare
           ;; string, which made the alias-less spelling unusable.
           (if (consp index)
               (values (car index) (cdr index))
               (values index nil))))
    `(progn
       ,@(loop :for i :in indexes
               :collect
               (multiple-value-bind (index aliases) (get-name-and-aliases i)
                 (let ((table (iolib-utils:concat-symbol "+iso-8859-" index "-table+"))
                       (name (iolib-utils:ensure-keyword
                              (concatenate 'string "ISO-8859-" index))))
                   (push (iolib-utils:ensure-keyword
                          (concatenate 'string "ISO8859-" index))
                         aliases)
                   `(define-external-format ,name ,aliases 1
                      (to-char
                        (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
                        (let ((code (funcall input)))
                          (funcall output (aref ,table code))))
                      (to-octets
                        (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
                        (let ((code (position (funcall input) ,table)))
                          (if code
                              (funcall output code)
                              ;; NOTE(review): POSITION yields NIL when the
                              ;; replacement character is not in the table
                              ;; either; that NIL is passed to OUTPUT.
                              (funcall output
                                       (position (funcall error-fn 'illegal-character)
                                                 ,table))))))))))))
;; Instantiate the table-driven ISO-8859 formats.  Each entry is the part
;; number followed by the extra aliases registered for that part.
(define-iso-8859-external-formats
    (("2"  :ISO_8859-2  :latin2 :l2 :csISOLatin2 :iso-ir-101)
     ("3"  :ISO_8859-3  :latin3 :l3 :csISOLatin3 :iso-ir-109)
     ("4"  :ISO_8859-4  :latin4 :l4 :csISOLatin4 :iso-ir-110)
     ("5"  :ISO_8859-5  :cyrillic :csISOLatinCyrillic :iso-ir-144)
     ("6"  :ISO_8859-6  :arabic :csISOLatinArabic :iso-ir-127)
     ("7"  :ISO_8859-7  :greek :greek8 :csISOLatinGreek :iso-ir-126)
     ("8"  :ISO_8859-8  :hebrew :csISOLatinHebrew :iso-ir-138)
     ("9"  :ISO_8859-9  :latin5 :l5 :csISOLatin5 :iso-ir-148)
     ("10" :ISO_8859-10 :latin6 :l6 :csISOLatin6 :iso-ir-157)
     ("11" :ISO_8859-11 :thai :csISOLatinThai :iso-ir-166)
     ("13" :ISO_8859-13 :baltic :csISOLatinBaltic :iso-ir-179)
     ("14" :ISO_8859-14 :iso-celtic :latin8 :l8 :csISOLatinCeltic :iso-ir-199)
     ("15" :ISO_8859-15 :latin9 :l9 :csISOLatin9 :iso-ir-203)
     ("16" :ISO_8859-16 :latin10 :l10 :csISOLatin10 :iso-ir-226)))
(iolib-utils:define-constant +max-unicode-code-point+ #x10FFFF)

(declaim (inline illegal-unicode-code-point))
(defun illegal-unicode-code-point (code)
  "Return true when CODE is rejected by the Unicode converters in this file:
it lies in #xD800..#xDFFF, equals #xFFFE or #xFFFF, or exceeds
+MAX-UNICODE-CODE-POINT+."
  (declare (type (unsigned-byte 32) code))
  (cond ((> code +max-unicode-code-point+) t)
        ((<= #xD800 code #xDFFF) t)
        (t (or (= code #xFFFE)
               (= code #xFFFF)))))
;; UTF-8.
;;
;; Decoding reads the starter octet, derives the sequence length from it,
;; checks that enough input is left, reads and validates the continuation
;; octets, rejects overlong forms and finally validates the code point.
;; Every failure goes through DECODE-ERR, which outputs the replacement
;; character returned by the error function and leaves the decoder.
;;
;; The code-point check is applied to sequences of every length.  The
;; 4-octet branch used to skip it, so starters #xF4..#xF7 could produce
;; values above #x10FFFF that were handed straight to CODE-CHAR.
(define-external-format :utf-8 (:utf8) 2
  (to-char
    (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
    (block utf-8-decode
      (let ((code 0) (bytes-needed nil)
            (byte0 0) (byte1 0)
            (byte2 0) (byte3 0))
        (declare (type octet byte0 byte1 byte2 byte3))
        (labels ((decode-err (sym)
                   (return-from utf-8-decode
                     (funcall output (funcall error-fn sym))))
                 (utf-8-byte-len (code)
                   (declare (type octet code))
                   (cond
                     ((not (logbitp 7 code)) 1)
                     ((= (logand code #b11100000) #b11000000) 2)
                     ((= (logand code #b11110000) #b11100000) 3)
                     ((= (logand code #b11111000) #b11110000) 4)
                     (t (decode-err 'invalid-starter-octet))))
                 (valid-secondary-check (byte)
                   (or (= (logand byte #b11000000) #b10000000)
                       (decode-err 'invalid-continuation-octet)))
                 (overlong-check (starter mask)
                   ;; Overlong when BYTE0 is STARTER and BYTE1 has no bit
                   ;; set outside MASK.
                   (or (/= starter byte0)
                       (/= (logior byte1 mask) mask)
                       (decode-err 'overlong-octet-sequence))))
          (macrolet ((put-and-check-valid-secondary-bytes (&rest places)
                       `(progn ,@(reduce #'append places
                                         :key #'(lambda (x) `((setf ,x (funcall input))
                                                              (valid-secondary-check ,x)))))))
            (setf byte0 (funcall input)
                  bytes-needed (utf-8-byte-len byte0))
            ;; BYTES-LEFT counts from the starter octet, so it is directly
            ;; comparable with the full sequence length.
            (when (< bytes-left bytes-needed)
              (decode-err 'end-of-input-in-character))
            (case bytes-needed
              (1 (setf code byte0))
              (2 (put-and-check-valid-secondary-bytes byte1)
                 ;; Starters #xC0 and #xC1 can only encode overlong forms.
                 (overlong-check #b11000000 #b10111111)
                 (overlong-check #b11000001 #b10111111)
                 (setf code (logior (ash (ldb (byte 5 0) byte0) 6)
                                    (ldb (byte 6 0) byte1))))
              (3 (put-and-check-valid-secondary-bytes byte1 byte2)
                 (overlong-check #b11100000 #b10011111)
                 (setf code (logior (ash (ldb (byte 4 0) byte0) 12)
                                    (ash (ldb (byte 6 0) byte1) 6)
                                    (ldb (byte 6 0) byte2))))
              (4 (put-and-check-valid-secondary-bytes byte1 byte2 byte3)
                 (overlong-check #b11110000 #b10001111)
                 (setf code (logior (ash (ldb (byte 3 0) byte0) 18)
                                    (ash (ldb (byte 6 0) byte1) 12)
                                    (ash (ldb (byte 6 0) byte2) 6)
                                    (ldb (byte 6 0) byte3)))))
            ;; One- and two-octet values are below #x800 and always pass;
            ;; the check matters for the three- and four-octet forms.
            (when (illegal-unicode-code-point code)
              (decode-err 'illegal-code-point))
            (funcall output (code-char code)))))))
  (to-octets
    (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
    (let ((code (char-code (funcall input))))
      ;; A rejected character is replaced by the one the error function
      ;; returns, which is then encoded normally.
      (when (illegal-unicode-code-point code)
        (setf code (char-code (funcall error-fn 'illegal-character))))
      (cond
        ((< code #x80)
         (funcall output code))
        ((< code #x800)
         (funcall output (logior #xC0 (ldb (byte 5 6) code)))
         (funcall output (logior #x80 (ldb (byte 6 0) code))))
        ((< code #x10000)
         (funcall output (logior #xE0 (ldb (byte 4 12) code)))
         (funcall output (logior #x80 (ldb (byte 6 6) code)))
         (funcall output (logior #x80 (ldb (byte 6 0) code))))
        ((< code #x200000)
         (funcall output (logior #xF0 (ldb (byte 3 18) code)))
         (funcall output (logior #x80 (ldb (byte 6 12) code)))
         (funcall output (logior #x80 (ldb (byte 6 6) code)))
         (funcall output (logior #x80 (ldb (byte 6 0) code))))))))
;; UTF-16 with the high octet of each 16-bit unit first.
(define-external-format :utf-16 (:utf16 :utf-16be :utf16be) 2
  (to-char
    (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
    (block utf-16-decode
      (flet ((next-unit ()
               ;; High octet is read first, then the low octet.
               (let* ((high (funcall input))
                      (low (funcall input)))
                 (+ (ash high 8) low)))
             (fail (kind)
               ;; Output the replacement character and leave the decoder.
               (return-from utf-16-decode
                 (funcall output (funcall error-fn kind)))))
        (macrolet ((fetch-unit (place octets-required)
                     ;; OCTETS-REQUIRED counts from the start of the
                     ;; character, like BYTES-LEFT.
                     `(progn (when (> ,octets-required bytes-left)
                               (fail 'end-of-input-in-character))
                             (setf ,place (next-unit)))))
          (let ((code 0) (lead 0) (trail 0))
            (declare (type (unsigned-byte 32) code)
                     (type (unsigned-byte 16) lead trail))
            (fetch-unit lead 2)
            (cond ((not (<= #xD800 lead #xDFFF))
                   ;; Not a surrogate: the unit is the code point.
                   (setf code lead))
                  ((> lead #xDBFF)
                   ;; A trailing surrogate cannot start a character.
                   (fail 'invalid-starter-octet))
                  (t
                   (fetch-unit trail 4)
                   (unless (<= #xDC00 trail #xDFFF)
                     (fail 'invalid-continuation-octet))
                   (setf code (+ (ash (ldb (byte 10 0) lead) 10)
                                 (ldb (byte 10 0) trail)
                                 #x10000))))
            (funcall output (code-char code)))))))
  (to-octets
    (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
    (flet ((emit-unit (unit)
             (funcall output (ldb (byte 8 8) unit))
             (funcall output (ldb (byte 8 0) unit))))
      (let ((code (char-code (funcall input))))
        (when (illegal-unicode-code-point code)
          (setf code (char-code (funcall error-fn 'illegal-character))))
        (if (< code #x10000)
            (emit-unit code)
            ;; Split the 20-bit offset into a surrogate pair.
            (let ((offset (- code #x10000)))
              (emit-unit (logior #xD800 (ldb (byte 10 10) offset)))
              (emit-unit (logior #xDC00 (ldb (byte 10 0) offset)))))))))
;; UTF-16 with the low octet of each 16-bit unit first.
(define-external-format :utf-16le (:utf16le) 2
  (to-char
    (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
    (block utf-16-decode
      (flet ((next-unit ()
               ;; Low octet is read first, then the high octet.
               (let* ((low (funcall input))
                      (high (funcall input)))
                 (+ low (ash high 8))))
             (fail (kind)
               ;; Output the replacement character and leave the decoder.
               (return-from utf-16-decode
                 (funcall output (funcall error-fn kind)))))
        (macrolet ((fetch-unit (place octets-required)
                     ;; OCTETS-REQUIRED counts from the start of the
                     ;; character, like BYTES-LEFT.
                     `(progn (when (> ,octets-required bytes-left)
                               (fail 'end-of-input-in-character))
                             (setf ,place (next-unit)))))
          (let ((code 0) (lead 0) (trail 0))
            (declare (type (unsigned-byte 32) code)
                     (type (unsigned-byte 16) lead trail))
            (fetch-unit lead 2)
            (cond ((not (<= #xD800 lead #xDFFF))
                   ;; Not a surrogate: the unit is the code point.
                   (setf code lead))
                  ((> lead #xDBFF)
                   ;; A trailing surrogate cannot start a character.
                   (fail 'invalid-starter-octet))
                  (t
                   (fetch-unit trail 4)
                   (unless (<= #xDC00 trail #xDFFF)
                     (fail 'invalid-continuation-octet))
                   (setf code (+ (ash (ldb (byte 10 0) lead) 10)
                                 (ldb (byte 10 0) trail)
                                 #x10000))))
            (funcall output (code-char code)))))))
  (to-octets
    (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
    (flet ((emit-unit (unit)
             (funcall output (ldb (byte 8 0) unit))
             (funcall output (ldb (byte 8 8) unit))))
      (let ((code (char-code (funcall input))))
        (when (illegal-unicode-code-point code)
          (setf code (char-code (funcall error-fn 'illegal-character))))
        (if (< code #x10000)
            (emit-unit code)
            ;; Split the 20-bit offset into a surrogate pair.
            (let ((offset (- code #x10000)))
              (emit-unit (logior #xD800 (ldb (byte 10 10) offset)))
              (emit-unit (logior #xDC00 (ldb (byte 10 0) offset)))))))))
438 ;;; CONVERSION FUNCTIONS
443 ;; OCTETS-TO-CHAR
(defmacro octets-to-char (external-format input output error-fn bytes-left)
  "Expand into a call of EXTERNAL-FORMAT's octets-to-char function with
INPUT, OUTPUT, ERROR-FN and BYTES-LEFT as arguments."
  (list 'funcall
        (list 'ef-octets-to-char external-format)
        input output error-fn bytes-left))
(defun read-replacement-char ()
  "Prompt on *QUERY-IO*, read one form from it, evaluate the form and return
the result wrapped in a list, the shape an :INTERACTIVE restart option
expects for its argument list."
  (let ((io *query-io*))
    (write-string "Enter a replacement character(evaluated): " io)
    (finish-output io)
    (let ((form (read io)))
      (list (eval form)))))
(defun %octets-to-string (buffer string start end ef &optional max-char-num)
  "Decode the octets of foreign memory BUFFER from index START (inclusive)
to END (exclusive) with external format EF, storing the characters into
STRING from index 0.  When MAX-CHAR-NUM is non-NIL, at most that many
characters are decoded.
Returns two values: the number of characters stored and the number of
octets consumed.
A decoding problem is signalled as an error of the class the decoder names,
with three restarts available: USE-VALUE (caller supplies the replacement
character), USE-STANDARD-UNICODE-REPLACEMENT (U+FFFD) and STOP-DECODING
(discard the current character and return)."
  (declare (type et:foreign-pointer buffer)
           (type buffer-index start end)
           (type external-format ef)
           (type (or null signed-byte) max-char-num)
           (optimize (speed 3) (space 0) (safety 0) (debug 0)))
  ;; -1 can never equal the running count, i.e. "no limit".
  (unless max-char-num (setf max-char-num -1))
  (let ((ptr start)                     ; next octet to read
        (pos -1)                        ; index of the last character stored
        (char-count -1)
        oldpos oldptr)                  ; PTR/POS before the current character
    (tagbody
       (flet ((input ()
                (prog1 (cffi:mem-aref buffer :uint8 ptr) (incf ptr)))
              (output (char)
                (setf (char string (incf pos)) char))
              (error-fn (symbol)
                (restart-case
                    (error symbol :array buffer
                                  :start start :end end
                                  :position oldptr
                                  :external-format (ef-name ef))
                  (use-value (s)
                    :report "Supply a replacement character."
                    :interactive read-replacement-char
                    ;; The clause must return the supplied character; this
                    ;; body was missing, which also unbalanced the form.
                    s)
                  (use-standard-unicode-replacement ()
                    :report "Use standard UCS replacement character"
                    (code-char #xFFFD))
                  (stop-decoding ()
                    :report "Stop decoding and return to last good offset."
                    ;; NOTE(review): only POS is rewound; PTR keeps the
                    ;; octets already read for the bad character, so they
                    ;; are counted in the second return value.
                    (setf pos oldpos)
                    (go :exit)))))
         (loop :while (and (< ptr end)
                           (/= (incf char-count) max-char-num))
               :do (setf oldpos pos
                         oldptr ptr)
                   (octets-to-char ef #'input #'output #'error-fn (- end ptr))))
     :exit
       (return-from %octets-to-string (values (1+ pos) (- ptr start))))))
(defun octets-to-string (octets
                         &key (start 0) end
                              (external-format :default)
                              (auto-correct nil))
  "Decode the elements of OCTETS between START and END (default: its length)
using EXTERNAL-FORMAT and return the result of SHRINK-VECTOR applied to the
decoded string and the number of characters decoded.
OCTETS is first coerced to a (SIMPLE-ARRAY OCTET (*)).  When AUTO-CORRECT
is true, every OCTET-DECODING-ERROR is handled by invoking the USE-VALUE
restart with #\\?; otherwise decoding errors propagate to the caller."
  (setf octets (coerce octets '(simple-array octet (*))))
  (check-type start buffer-index)
  (check-type end (or null buffer-index))
  (let ((ef (find-external-format external-format))
        (end (or end (length octets)))
        (string nil))
    ;; The decoder reads through a raw pointer with safety 0, so an END
    ;; beyond the vector would read memory outside of it.
    (assert (<= start end (length octets)))
    (setf string (make-string (- end start)))
    (cffi:with-pointer-to-vector-data (octets-ptr octets)
      (let ((pos (if auto-correct
                     (handler-bind ((octet-decoding-error
                                      #'(lambda (error)
                                          (declare (ignore error))
                                          (invoke-restart 'use-value #\?))))
                       (%octets-to-string octets-ptr string start end ef))
                     (%octets-to-string octets-ptr string start end ef))))
        (shrink-vector string pos)))))
519 ;; CHAR-TO-OCTETS
(defmacro char-to-octets (ef input output error-fn chars-left)
  "Expand into a call of EF's char-to-octets function with INPUT, OUTPUT,
ERROR-FN and CHARS-LEFT as arguments."
  (list 'funcall
        (list 'ef-char-to-octets ef)
        input output error-fn chars-left))
(defun string-to-octets (string &key (start 0) end
                                     (external-format :default)
                                     adjust-factor)
  "Encode the characters of STRING between START and END (default: its
length) with EXTERNAL-FORMAT and return the result of SHRINK-VECTOR applied
to the octet buffer and the number of octets written.
ADJUST-FACTOR, when it lies in [1, 4], is the factor by which the output
buffer grows when it is full; otherwise the format's octet size is used.
An encoding problem is signalled as an error of the class the encoder
names, with the restarts USE-VALUE, USE-STANDARD-UNICODE-REPLACEMENT and
STOP-DECODING available."
  (declare (type string string)
           (type buffer-index start)
           (type (or null buffer-index) end)
           (type (or null real) adjust-factor)
           (optimize (speed 3) (space 0) (safety 0) (debug 0)))
  (let* ((ef (find-external-format external-format))
         (buffer (make-array (1+ (length string))
                             :element-type 'octet
                             :adjustable t))
         (ptr start) oldptr             ; next character to read / its start
         (pos -1) oldpos)               ; last octet written / before the char
    (setf adjust-factor (if (and adjust-factor (<= 1 adjust-factor 4))
                            adjust-factor
                            (ef-octet-size ef))
          end (or end (length string)))
    (tagbody
       (flet ((input ()
                (prog1 (char string ptr) (incf ptr)))
              (output (octet)
                ;; Grow BEFORE storing.  The buffer used to be enlarged
                ;; after the write, with a threshold equal to the new size,
                ;; so the write that reached the threshold landed one
                ;; element past the end of the array.
                (let ((index (1+ pos)))
                  (when (>= index (length buffer))
                    (setf buffer
                          (adjust-array
                           buffer
                           ;; MAX guarantees progress when the factor is 1.
                           (max (1+ index)
                                (truncate (* adjust-factor (length buffer)))))))
                  (setf (aref buffer index) octet
                        pos index)))
              (error-fn (symbol)
                (restart-case
                    ;; The condition report indexes :STRING with :POSITION
                    ;; as a character, so it must be the source string and
                    ;; not the octet buffer.
                    (error symbol :string string
                                  :start start :end end
                                  :position oldptr
                                  :external-format (ef-name ef))
                  (use-value (s)
                    :report "Supply a replacement character."
                    :interactive read-replacement-char
                    ;; The clause must return the supplied character; this
                    ;; body was missing, which also unbalanced the form.
                    s)
                  (use-standard-unicode-replacement ()
                    :report "Use standard UCS replacement character"
                    (code-char #xFFFD))
                  (stop-decoding ()
                    :report "Stop decoding and return to last good offset."
                    ;; Drop the octets written for the current character.
                    (setf pos oldpos)
                    (go :exit)))))
         (loop :while (< ptr end)
               :do (setf oldpos pos oldptr ptr)
                   (char-to-octets ef #'input #'output #'error-fn (- end ptr))))
     :exit (return-from string-to-octets (shrink-vector buffer (1+ pos))))))