Added IO.ENCODINGS
[iolib.git] / io.encodings / external-format.lisp
blob5c9f378fde1d404a51bd5a985cba0a50c578d28d
1 ;; -*- Mode: Lisp; Syntax: ANSI-Common-Lisp -*-
3 ;; This code is free software; you can redistribute it and/or
4 ;; modify it under the terms of the version 2.1 of
5 ;; the GNU Lesser General Public License as published by
6 ;; the Free Software Foundation, as clarified by the
7 ;; preamble found here:
8 ;; http://opensource.franz.com/preamble.html
9 ;;
10 ;; This program is distributed in the hope that it will be useful,
11 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
12 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 ;; GNU General Public License for more details.
15 ;; You should have received a copy of the GNU Lesser General
16 ;; Public License along with this library; if not, write to the
17 ;; Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
18 ;; Boston, MA 02110-1301, USA
20 (in-package :io.encodings)
22 (declaim (optimize (speed 0) (space 0) (safety 3) (debug 3)))
24 ;; Mostly taken from SBCL's sb-simple-streams contrib
25 ;;; **********************************************************************
26 ;;; This code was written by Paul Foley
27 ;;;
29 ;;; Sbcl port by Rudi Schlatte.
31 (define-condition octet-encoding-error (error)
32 ((string :initarg :string :reader octets-encoding-error-string)
33 (position :initarg :position :reader octets-encoding-error-position)
34 (external-format :initarg :external-format
35 :reader octets-encoding-error-external-format))
36 (:report (lambda (c s)
37 (format s "Unable to encode character ~A as ~S."
38 (char-code (char (octets-encoding-error-string c)
39 (octets-encoding-error-position c)))
40 (octets-encoding-error-external-format c)))))
42 (define-condition illegal-character (octet-encoding-error) ())
45 (define-condition octet-decoding-error (error)
46 ((array :initarg :array :accessor octet-decoding-error-array)
47 (start :initarg :start :accessor octet-decoding-error-start)
48 (end :initarg :end :accessor octet-decoding-error-end)
49 (position :initarg :position :accessor octet-decoding-bad-byte-position)
50 (external-format :initarg :external-format
51 :accessor octet-decoding-error-external-format))
52 (:report
53 (lambda (c s)
54 (format s "Illegal ~A character starting at byte position ~D: ~A."
55 (octet-decoding-error-external-format c)
56 (octet-decoding-bad-byte-position c)
57 (cffi:mem-aref (octet-decoding-error-array c) :uint8
58 (octet-decoding-bad-byte-position c))))))
60 (define-condition end-of-input-in-character (octet-decoding-error) ())
61 (define-condition malformed-multibyte-sequence (octet-decoding-error) ())
62 (define-condition invalid-starter-octet (malformed-multibyte-sequence) ())
63 (define-condition invalid-continuation-octet (malformed-multibyte-sequence) ())
64 (define-condition overlong-octet-sequence (malformed-multibyte-sequence) ())
65 (define-condition illegal-code-point (octet-decoding-error) ())
67 ;;;
68 ;;;
69 ;;; EXTERNAL-FORMAT
70 ;;;
71 ;;;
73 (deftype line-terminator ()
74 '(member :unix :mac :dos))
76 (defvar *default-external-format* :utf-8)
77 (eval-when (:compile-toplevel :load-toplevel :execute)
78 (defvar *default-line-terminator* :unix))
80 (defvar *external-formats* (make-hash-table))
81 (defvar *external-format-aliases* (make-hash-table))
82 (defvar *external-format-list* nil)
84 (defstruct (external-format
85 (:conc-name ef-)
86 (:print-function %print-external-format)
87 (:constructor %make-external-format (name
88 line-terminator
89 octet-size
90 octets-to-char
91 char-to-octets)))
92 (name (missing-arg) :type keyword :read-only t)
93 (line-terminator (missing-arg) :type keyword)
94 (octets-to-char (missing-arg) :type function :read-only t)
95 (char-to-octets (missing-arg) :type function :read-only t)
96 (octet-size (missing-arg) :type real))
98 (defun %print-external-format (ef stream depth)
99 (declare (ignore depth))
100 (print-unreadable-object (ef stream :type t :identity nil)
101 (format stream "~A ~S"
102 (ef-name ef) (ef-line-terminator ef))))
104 (defun make-external-format (name &key new-name
105 (line-terminator *default-line-terminator*)
106 (octet-size 1.5))
107 (check-type line-terminator line-terminator)
108 (let ((ef (find-external-format name)))
109 (%make-external-format
110 (or new-name (ef-name ef))
111 (or line-terminator (ef-line-terminator ef))
112 (if (and octet-size (<= 1 octet-size 4))
113 octet-size
114 (ef-octet-size ef))
115 (ef-octets-to-char ef)
116 (ef-char-to-octets ef))))
119 ;;; UTILS
121 (deftype octet ()
122 '(unsigned-byte 8))
124 (deftype buffer-index ()
125 'fixnum)
127 (defmacro add-external-format (name aliases ef)
128 (let (($alias$ (gensym "ALIAS")))
129 `(progn
130 (setf (gethash ,name *external-formats*) ,ef)
131 (setf *external-format-list* (append *external-format-list* (list ,name)))
132 (dolist (,$alias$ ',aliases)
133 (assert (keywordp ,$alias$))
134 (setf (gethash ,$alias$ *external-format-aliases*) ,name)))))
136 (defmacro define-external-format (name aliases octet-size octets-to-char char-to-octets
137 &key (line-terminator *default-line-terminator*))
138 (let (($ef$ (gensym "EF")))
139 `(macrolet ((to-char (&body body)
140 `(lambda (input output error-fn bytes-left)
141 (declare (type (function () octet) input)
142 (type (function (character) t) output)
143 (type (function (symbol) character) error-fn)
144 (type buffer-index bytes-left)
145 (ignorable input output error-fn bytes-left))
146 ,@body))
147 (to-octets (&body body)
148 `(lambda (input output error-fn chars-left)
149 (declare (type (function () character) input)
150 (type (function (octet) t) output)
151 (type (function (symbol) character) error-fn)
152 (type buffer-index chars-left)
153 (ignorable input output error-fn chars-left))
154 ,@body)))
155 (let ((,$ef$ (%make-external-format ,name ,line-terminator ,octet-size
156 ,octets-to-char ,char-to-octets)))
157 (add-external-format ,name ,aliases ,$ef$)))))
159 (defun find-external-format (name &optional (error-p t))
160 (when (external-format-p name)
161 (return-from find-external-format name))
163 (when (eq name :default)
164 (setq name *default-external-format*))
165 (when (stringp name)
166 (setf name (iolib-utils:ensure-keyword name)))
168 (or (gethash name *external-formats*)
169 (gethash (gethash name *external-format-aliases*)
170 *external-formats*)
171 (if error-p (error "External format ~S not found." name) nil)))
175 ;;; EXTERNAL FORMATS
179 (define-condition void-external-format (error) ()
180 (:report
181 (lambda (condition stream)
182 (declare (ignore condition))
183 (format stream "Attempting I/O through void external-format."))))
185 (define-external-format :void () 0
186 (to-char
187 (error 'void-external-format))
188 (to-octets
189 (error 'void-external-format)))
191 (define-external-format :ascii (:us-ascii) 1
192 (to-char
193 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
194 (let ((code (funcall input)))
195 (if (< code 128)
196 (funcall output (aref +iso-8859-1-table+ code))
197 (funcall output (funcall error-fn 'illegal-code-point)))))
198 (to-octets
199 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
200 (let ((code (char-code (funcall input))))
201 (if (< code 128)
202 (funcall output code)
203 (funcall output (char-code (funcall error-fn 'illegal-character)))))))
205 (define-external-format :iso-8859-1 (:iso8859-1 :ISO_8859-1 :latin1 :l1
206 :csISOLatin1 :iso-ir-100 :CP819) 1
207 (to-char
208 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
209 (let ((code (funcall input)))
210 (funcall output (aref +iso-8859-1-table+ code))))
211 (to-octets
212 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
213 (let ((code (char-code (funcall input))))
214 (if (< code 256)
215 (funcall output code)
216 (funcall output (char-code (funcall error-fn 'illegal-character)))))))
218 (defmacro define-iso-8859-external-formats (indexes)
219 (flet ((get-name-and-aliases (index)
220 (if (endp index)
221 (values index nil)
222 (values (car index)
223 (cdr index)))))
224 `(progn
225 ,@(loop :for i :in indexes
226 :collect
227 (multiple-value-bind (index aliases) (get-name-and-aliases i)
228 (let ((table (iolib-utils:concat-symbol "+iso-8859-" index "-table+"))
229 (name (iolib-utils:ensure-keyword
230 (concatenate 'string "ISO-8859-" index))))
231 (push (iolib-utils:ensure-keyword
232 (concatenate 'string "ISO8859-" index))
233 aliases)
234 `(define-external-format ,name ,aliases 1
235 (to-char
236 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
237 (let ((code (funcall input)))
238 (funcall output (aref ,table code))))
239 (to-octets
240 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
241 (let ((code (position (funcall input) ,table)))
242 (if code
243 (funcall output code)
244 (funcall output
245 (position (funcall error-fn 'illegal-character)
246 ,table))))))))))))
248 (define-iso-8859-external-formats
249 (("2" :ISO_8859-2 :latin2 :l2 :csISOLatin2 :iso-ir-101)
250 ("3" :ISO_8859-3 :latin3 :l3 :csISOLatin3 :iso-ir-109)
251 ("4" :ISO_8859-4 :latin4 :l4 :csISOLatin4 :iso-ir-110)
252 ("5" :ISO_8859-5 :cyrillic :csISOLatinCyrillic :iso-ir-144)
253 ("6" :ISO_8859-6 :arabic :csISOLatinArabic :iso-ir-127)
254 ("7" :ISO_8859-7 :greek :greek8 :csISOLatinGreek :iso-ir-126)
255 ("8" :ISO_8859-8 :hebrew :csISOLatinHebrew :iso-ir-138)
256 ("9" :ISO_8859-9 :latin5 :l5 :csISOLatin5 :iso-ir-148)
257 ("10" :ISO_8859-10 :latin6 :l6 :csISOLatin6 :iso-ir-157)
258 ("11" :ISO_8859-11 :thai :csISOLatinThai :iso-ir-166)
259 ("13" :ISO_8859-13 :baltic :csISOLatinBaltic :iso-ir-179)
260 ("14" :ISO_8859-14 :iso-celtic :latin8 :l8 :csISOLatinCeltic :iso-ir-199)
261 ("15" :ISO_8859-15 :latin9 :l9 :csISOLatin9 :iso-ir-203)
262 ("16" :ISO_8859-16 :latin10 :l10 :csISOLatin10 :iso-ir-226)))
264 (iolib-utils:define-constant +max-unicode-code-point+ #x10FFFF)
266 (declaim (inline illegal-unicode-code-point))
267 (defun illegal-unicode-code-point (code)
268 (declare (type (unsigned-byte 32) code))
269 (or (<= #xD800 code #xDFFF)
270 (= code #xFFFE)
271 (= code #xFFFF)
272 (> code +max-unicode-code-point+)))
274 (define-external-format :utf-8 (:utf8) 2
275 (to-char
276 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
277 (block utf-8-decode
278 (let ((code 0) (bytes-needed nil)
279 (byte0 0) (byte1 0)
280 (byte2 0) (byte3 0))
281 (declare (type octet byte0 byte1 byte2 byte3))
282 (labels ((decode-err (sym)
283 (return-from utf-8-decode
284 (funcall output (funcall error-fn sym))))
285 (utf-8-byte-len (code)
286 (declare (type octet code))
287 (cond
288 ((not (logbitp 7 code)) 1)
289 ((= (logand code #b11100000) #b11000000) 2)
290 ((= (logand code #b11110000) #b11100000) 3)
291 ((= (logand code #b11111000) #b11110000) 4)
292 (t (decode-err 'invalid-starter-octet))))
293 (valid-secondary-check (byte)
294 (or (= (logand byte #b11000000) #b10000000)
295 (decode-err 'invalid-continuation-octet)))
296 (overlong-check (starter mask)
297 (or (/= starter byte0)
298 (/= (logior byte1 mask) mask)
299 (decode-err 'overlong-octet-sequence))))
300 (macrolet ((put-and-check-valid-secondary-bytes (&rest places)
301 `(progn ,@(reduce #'append places
302 :key #'(lambda (x) `((setf ,x (funcall input))
303 (valid-secondary-check ,x)))))))
304 (setf byte0 (funcall input)
305 bytes-needed (utf-8-byte-len byte0))
306 (when (< bytes-left bytes-needed)
307 (decode-err 'end-of-input-in-character))
308 (case bytes-needed
309 (1 (setf code byte0))
310 (2 (put-and-check-valid-secondary-bytes byte1)
311 (overlong-check #b11000000 #b10111111)
312 (overlong-check #b11000001 #b10111111)
313 (setf code (logior (ash (ldb (byte 5 0) byte0) 6)
314 (ldb (byte 6 0) byte1))))
315 (3 (put-and-check-valid-secondary-bytes byte1 byte2)
316 (overlong-check #b11100000 #b10011111)
317 (setf code (logior (ash (ldb (byte 4 0) byte0) 12)
318 (ash (ldb (byte 6 0) byte1) 6)
319 (ldb (byte 6 0) byte2)))
320 (when (illegal-unicode-code-point code)
321 (decode-err 'illegal-code-point)))
322 (4 (put-and-check-valid-secondary-bytes byte1 byte2 byte3)
323 (overlong-check #b11110000 #b10001111)
324 (setf code (logior (ash (ldb (byte 3 0) byte0) 18)
325 (ash (ldb (byte 6 0) byte1) 12)
326 (ash (ldb (byte 6 0) byte2) 6)
327 (ldb (byte 6 0) byte3)))))
328 (funcall output (code-char code)))))))
329 (to-octets
330 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
331 (let ((code (char-code (funcall input))))
332 (when (illegal-unicode-code-point code)
333 (setf code (char-code (funcall error-fn 'illegal-character))))
334 (cond
335 ((< code #x80)
336 (funcall output code))
337 ((< code #x800)
338 (funcall output (logior #xC0 (ldb (byte 5 6) code)))
339 (funcall output (logior #x80 (ldb (byte 6 0) code))))
340 ((< code #x10000)
341 (funcall output (logior #xE0 (ldb (byte 4 12) code)))
342 (funcall output (logior #x80 (ldb (byte 6 6) code)))
343 (funcall output (logior #x80 (ldb (byte 6 0) code))))
344 ((< code #x200000)
345 (funcall output (logior #xF0 (ldb (byte 3 18) code)))
346 (funcall output (logior #x80 (ldb (byte 6 12) code)))
347 (funcall output (logior #x80 (ldb (byte 6 6) code)))
348 (funcall output (logior #x80 (ldb (byte 6 0) code))))))))
350 (define-external-format :utf-16 (:utf16 :utf-16be :utf16be) 2
351 (to-char
352 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
353 (block utf-16-decode
354 (flet ((read-word ()
355 (+ (ash (funcall input) 8) (funcall input)))
356 (decode-err (sym)
357 (return-from utf-16-decode
358 (funcall output (funcall error-fn sym)))))
359 (macrolet ((put-word (word bytes-needed)
360 `(progn (when (> ,bytes-needed bytes-left)
361 (decode-err 'end-of-input-in-character))
362 (setf ,word (read-word)))))
363 (let ((code 0) (w0 0) (w1 0))
364 (declare (type (unsigned-byte 32) code)
365 (type (unsigned-byte 16) w0 w1))
366 (put-word w0 2)
367 (cond ((not (<= #xD800 w0 #xDFFF))
368 (setf code w0))
369 ((> w0 #xDBFF)
370 (decode-err 'invalid-starter-octet))
371 (t (put-word w1 4)
372 (if (<= #xDC00 w1 #xDFFF)
373 (setf code (+ (ash (ldb (byte 10 0) w0) 10)
374 (ldb (byte 10 0) w1)
375 #x10000))
376 (decode-err 'invalid-continuation-octet))))
377 (funcall output (code-char code)))))))
378 (to-octets
379 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
380 (flet ((write-word (word)
381 (funcall output (ldb (byte 8 8) word))
382 (funcall output (ldb (byte 8 0) word))))
383 (let ((code (char-code (funcall input))))
384 (when (illegal-unicode-code-point code)
385 (setf code (char-code (funcall error-fn 'illegal-character))))
386 (cond ((< code #x10000)
387 (write-word code))
388 (t (decf code #x10000)
389 (write-word (logior #xD800 (ldb (byte 10 10) code)))
390 (write-word (logior #xDC00 (ldb (byte 10 0) code)))))))))
392 (define-external-format :utf-16le (:utf16le) 2
393 (to-char
394 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
395 (block utf-16-decode
396 (flet ((read-word ()
397 (+ (funcall input) (ash (funcall input) 8)))
398 (decode-err (sym)
399 (return-from utf-16-decode
400 (funcall output (funcall error-fn sym)))))
401 (macrolet ((put-word (word bytes-needed)
402 `(progn (when (> ,bytes-needed bytes-left)
403 (decode-err 'end-of-input-in-character))
404 (setf ,word (read-word)))))
405 (let ((code 0) (w0 0) (w1 0))
406 (declare (type (unsigned-byte 32) code)
407 (type (unsigned-byte 16) w0 w1))
408 (put-word w0 2)
409 (cond ((not (<= #xD800 w0 #xDFFF))
410 (setf code w0))
411 ((> w0 #xDBFF)
412 (decode-err 'invalid-starter-octet))
413 (t (put-word w1 4)
414 (if (<= #xDC00 w1 #xDFFF)
415 (setf code (+ (ash (ldb (byte 10 0) w0) 10)
416 (ldb (byte 10 0) w1)
417 #x10000))
418 (decode-err 'invalid-continuation-octet))))
419 (funcall output (code-char code)))))))
420 (to-octets
421 (declare (optimize (speed 3) (space 0) (safety 0) (debug 0)))
422 (flet ((write-word (word)
423 (funcall output (ldb (byte 8 0) word))
424 (funcall output (ldb (byte 8 8) word))))
425 (let ((code (char-code (funcall input))))
426 (when (illegal-unicode-code-point code)
427 (setf code (char-code (funcall error-fn 'illegal-character))))
428 (cond ((< code #x10000)
429 (write-word code))
430 (t (decf code #x10000)
431 (write-word (logior #xD800 (ldb (byte 10 10) code)))
432 (write-word (logior #xDC00 (ldb (byte 10 0) code)))))))))
436 ;;; CONVERSION FUNCTIONS
441 ;; OCTETS-TO-CHAR
444 (defmacro octets-to-char (external-format input output error-fn bytes-left)
445 `(funcall (ef-octets-to-char ,external-format) ,input ,output ,error-fn
446 ,bytes-left))
448 (defun read-replacement-char ()
449 (format *query-io* "Enter a replacement character(evaluated): ")
450 (finish-output *query-io*)
451 (list (eval (read *query-io*))))
453 (defun %octets-to-string (buffer string start end ef &optional max-char-num)
454 (declare (type et:foreign-pointer buffer)
455 (type buffer-index start end)
456 (type external-format ef)
457 (type (or null signed-byte) max-char-num)
458 (optimize (speed 3) (space 0) (safety 0) (debug 0)))
459 (unless max-char-num (setf max-char-num -1))
460 (let ((ptr start)
461 (pos -1)
462 (char-count -1)
463 oldpos oldptr)
464 (tagbody
465 (flet ((input ()
466 (prog1 (cffi:mem-aref buffer :uint8 ptr) (incf ptr)))
467 (output (char)
468 (setf (char string (incf pos)) char))
469 (error-fn (symbol)
470 (restart-case
471 (error symbol :array buffer
472 :start start :end end
473 :position oldptr
474 :external-format (ef-name ef))
475 (use-value (s)
476 :report "Supply a replacement character."
477 :interactive read-replacement-char
479 (use-standard-unicode-replacement ()
480 :report "Use standard UCS replacement character"
481 (code-char #xFFFD))
482 (stop-decoding ()
483 :report "Stop decoding and return to last good offset."
484 (setf pos oldpos)
485 (go :exit)))))
486 (loop :while (and (< ptr end)
487 (/= (incf char-count) max-char-num))
488 :do (setf oldpos pos
489 oldptr ptr)
490 (octets-to-char ef #'input #'output #'error-fn (- end ptr))))
491 :exit
492 (return-from %octets-to-string (values (1+ pos) (- ptr start))))))
494 (defun octets-to-string (octets
495 &key (start 0) end
496 (external-format :default)
497 (auto-correct nil))
498 (setf octets (coerce octets '(simple-array octet (*))))
499 (check-type start buffer-index)
500 (check-type end (or null buffer-index))
501 (let ((ef (find-external-format external-format))
502 (end (or end (length octets)))
503 (string nil))
504 (assert (<= start end))
505 (setf string (make-string (- end start)))
506 (cffi:with-pointer-to-vector-data (octets-ptr octets)
507 (let ((pos (if auto-correct
508 (handler-bind ((octet-decoding-error
509 #'(lambda (error)
510 (declare (ignore error))
511 (invoke-restart 'use-value #\?))))
512 (%octets-to-string octets-ptr string start end ef))
513 (%octets-to-string octets-ptr string start end ef))))
514 (shrink-vector string pos)))))
517 ;; CHAR-TO-OCTETS
520 (defmacro char-to-octets (ef input output error-fn chars-left)
521 `(funcall (ef-char-to-octets ,ef) ,input ,output ,error-fn
522 ,chars-left))
524 (defun string-to-octets (string &key (start 0) end
525 (external-format :default)
526 adjust-factor)
527 (declare (type string string)
528 (type buffer-index start)
529 (type (or null buffer-index) end)
530 (type (or null real) adjust-factor)
531 (optimize (speed 3) (space 0) (safety 0) (debug 0)))
532 (let* ((ef (find-external-format external-format))
533 (buffer (make-array (1+ (length string))
534 :element-type 'octet
535 :adjustable t))
536 (adjust-threshold (length string))
537 (ptr start)
538 (pos -1)
539 oldpos oldptr)
540 (setf adjust-factor (if (and adjust-factor (<= 1 adjust-factor 4))
541 adjust-factor
542 (ef-octet-size ef))
543 end (or end (length string)))
544 (tagbody
545 (flet ((input ()
546 (prog1 (char string ptr) (incf ptr)))
547 (output (octet)
548 (setf (aref buffer (incf pos)) octet)
549 (when (= pos adjust-threshold)
550 (setf adjust-threshold (truncate (* adjust-factor (1+ pos))))
551 (setf buffer (adjust-array buffer adjust-threshold))))
552 (error-fn (symbol)
553 (restart-case
554 (error symbol :array buffer
555 :start start :end end
556 :position oldptr
557 :external-format (ef-name ef))
558 (use-value (s)
559 :report "Supply a replacement character."
560 :interactive read-replacement-char
562 (use-standard-unicode-replacement ()
563 :report "Use standard UCS replacement character"
564 (code-char #xFFFD))
565 (stop-decoding ()
566 :report "Stop decoding and return to last good offset."
567 (setf pos oldpos)
568 (go :exit)))))
569 (loop :while (< ptr end)
570 :do (setf oldpos pos
571 oldptr ptr)
572 (char-to-octets ef #'input #'output #'error-fn (- end ptr))))
573 :exit (return-from string-to-octets (shrink-vector buffer (1+ pos))))))