From 3cfadca790a8817b9a02aa7fc29df3cf6a4658c8 Mon Sep 17 00:00:00 2001 From: Stelian Ionescu Date: Sat, 24 Feb 2007 01:20:50 +0100 Subject: [PATCH] Added IO.ENCODINGS --- io.encodings.asd | 24 ++ io.encodings/common.lisp | 33 +++ io.encodings/defpackage.lisp | 21 ++ io.encodings/external-format.lisp | 573 ++++++++++++++++++++++++++++++++++++++ io.encodings/iso-8859-tables.lisp | 44 +++ io.encodings/parse-mappings.lisp | 49 ++++ 6 files changed, 744 insertions(+) create mode 100644 io.encodings.asd create mode 100644 io.encodings/common.lisp create mode 100644 io.encodings/defpackage.lisp create mode 100644 io.encodings/external-format.lisp create mode 100644 io.encodings/iso-8859-tables.lisp create mode 100644 io.encodings/parse-mappings.lisp diff --git a/io.encodings.asd b/io.encodings.asd new file mode 100644 index 0000000..f9579db --- /dev/null +++ b/io.encodings.asd @@ -0,0 +1,24 @@ +;; -*- Mode: Lisp; Syntax: ANSI-Common-Lisp -*- + +(in-package :common-lisp-user) + +(defpackage #:io.encodings.system + (:use #:common-lisp #:asdf)) + +(in-package #:io.encodings.system) + +(defsystem :io.encodings + :description "Charset encoding/decoding library." + :maintainer "Stelian Ionescu " + :licence "LLGPL" + :depends-on (:iolib-utils-symbols + :iolib-utils-misc + :cffi + :iolib-posix) + :pathname (merge-pathnames (make-pathname :directory '(:relative "io.encodings")) + *load-truename*) + :components ((:file "defpackage") + (:file "common" :depends-on ("defpackage")) + (:file "iso-8859-tables" :depends-on ("defpackage")) + (:file "external-format" + :depends-on ("defpackage" "common" "iso-8859-tables")))) diff --git a/io.encodings/common.lisp b/io.encodings/common.lisp new file mode 100644 index 0000000..bc82d48 --- /dev/null +++ b/io.encodings/common.lisp @@ -0,0 +1,33 @@ +;; -*- Mode: Lisp; Syntax: ANSI-Common-Lisp -*- + +;; This code is free software; you can redistribute it and/or +;; modify it under the terms of the version 2.1 of +;; the GNU Lesser General Public License as published by +;; the Free Software Foundation, as clarified by the +;; preamble found here: +;; http://opensource.franz.com/preamble.html +;; +;; This program is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. +;; +;; You should have received a copy of the GNU Lesser General +;; Public License along with this library; if not, write to the +;; Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, +;; Boston, MA 02110-1301, USA + +(in-package :io.encodings) + +;; taken from kmrcl +(defmacro shrink-vector (str size) + #+allegro `(excl::.primcall 'sys::shrink-svector ,str ,size) + #+cmu `(lisp::shrink-vector ,str ,size) + #+lispworks `(system::shrink-vector$vector ,str ,size) + #+sbcl `(sb-kernel:shrink-vector ,str ,size) + #+scl `(common-lisp::shrink-vector ,str ,size) + #-(or allegro cmu lispworks sbcl scl) `(subseq ,str 0 ,size)) + +(defun missing-arg () + (error "A required &KEY or &OPTIONAL argument was not supplied.")) + diff --git a/io.encodings/defpackage.lisp b/io.encodings/defpackage.lisp new file mode 100644 index 0000000..ddfc35c --- /dev/null +++ b/io.encodings/defpackage.lisp @@ -0,0 +1,21 @@ +;; -*- Mode: Lisp; Syntax: ANSI-Common-Lisp -*- + +(in-package :common-lisp-user) + +(defpackage :io.encodings + (:nicknames #:ioenc) + (:use #:common-lisp) + (:export ;; Stream classes + ;; External-format handling + #:external-format #:make-external-format #:find-external-format + #:ef-name #:ef-line-terminator #:ef-octet-size + #:*external-format-list* + #:*default-external-format* #:*default-line-terminator* + #:octets-to-string #:string-to-octets + ;; External format error conditions + #:void-external-format + #:octets-encoding-error + #:octet-decoding-error #:end-of-input-in-character + #:malformed-multibyte-sequence #:invalid-utf-8-starter-byte + #:invalid-utf-8-continuation-byte #:overlong-utf-8-sequence + #:illegal-code-point)) diff --git a/io.encodings/external-format.lisp b/io.encodings/external-format.lisp new file mode 100644 index 0000000..5c9f378 --- /dev/null +++ b/io.encodings/external-format.lisp @@ -0,0 +1,573 @@ +;; -*- Mode: Lisp; Syntax: ANSI-Common-Lisp -*- + +;; This code is free software; you can redistribute it and/or +;; modify it under the terms of the version 2.1 of +;; the GNU Lesser General Public License as published by +;; the Free Software Foundation, as clarified by the +;; preamble found here: +;; http://opensource.franz.com/preamble.html +;; +;; This program is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. +;; +;; You should have received a copy of the GNU Lesser General +;; Public License along with this library; if not, write to the +;; Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, +;; Boston, MA 02110-1301, USA + +(in-package :io.encodings) + +(declaim (optimize (speed 0) (space 0) (safety 3) (debug 3))) + +;; Mostly taken from SBCL's sb-simple-streams contrib +;;; ********************************************************************** +;;; This code was written by Paul Foley +;;; + +;;; Sbcl port by Rudi Schlatte. + +(define-condition octet-encoding-error (error) + ((string :initarg :string :reader octets-encoding-error-string) + (position :initarg :position :reader octets-encoding-error-position) + (external-format :initarg :external-format + :reader octets-encoding-error-external-format)) + (:report (lambda (c s) + (format s "Unable to encode character ~A as ~S." + (char-code (char (octets-encoding-error-string c) + (octets-encoding-error-position c))) + (octets-encoding-error-external-format c))))) + +(define-condition illegal-character (octet-encoding-error) ()) + + +(define-condition octet-decoding-error (error) + ((array :initarg :array :accessor octet-decoding-error-array) + (start :initarg :start :accessor octet-decoding-error-start) + (end :initarg :end :accessor octet-decoding-error-end) + (position :initarg :position :accessor octet-decoding-bad-byte-position) + (external-format :initarg :external-format + :accessor octet-decoding-error-external-format)) + (:report + (lambda (c s) + (format s "Illegal ~A character starting at byte position ~D: ~A." + (octet-decoding-error-external-format c) + (octet-decoding-bad-byte-position c) + (cffi:mem-aref (octet-decoding-error-array c) :uint8 + (octet-decoding-bad-byte-position c)))))) + +(define-condition end-of-input-in-character (octet-decoding-error) ()) +(define-condition malformed-multibyte-sequence (octet-decoding-error) ()) +(define-condition invalid-starter-octet (malformed-multibyte-sequence) ()) +(define-condition invalid-continuation-octet (malformed-multibyte-sequence) ()) +(define-condition overlong-octet-sequence (malformed-multibyte-sequence) ()) +(define-condition illegal-code-point (octet-decoding-error) ()) + +;;; +;;; +;;; EXTERNAL-FORMAT +;;; +;;; + +(deftype line-terminator () + '(member :unix :mac :dos)) + +(defvar *default-external-format* :utf-8) +(eval-when (:compile-toplevel :load-toplevel :execute) + (defvar *default-line-terminator* :unix)) + +(defvar *external-formats* (make-hash-table)) +(defvar *external-format-aliases* (make-hash-table)) +(defvar *external-format-list* nil) + +(defstruct (external-format + (:conc-name ef-) + (:print-function %print-external-format) + (:constructor %make-external-format (name + line-terminator + octet-size + octets-to-char + char-to-octets))) + (name (missing-arg) :type keyword :read-only t) + (line-terminator (missing-arg) :type keyword) + (octets-to-char (missing-arg) :type function :read-only t) + (char-to-octets (missing-arg) :type function :read-only t) + (octet-size (missing-arg) :type real)) + +(defun %print-external-format (ef stream depth) + (declare (ignore depth)) + (print-unreadable-object (ef stream :type t :identity nil) + (format stream "~A ~S" + (ef-name ef) (ef-line-terminator ef)))) + +(defun make-external-format (name &key new-name + (line-terminator *default-line-terminator*) + (octet-size 1.5)) + (check-type line-terminator line-terminator) + (let ((ef (find-external-format name))) + (%make-external-format + (or new-name (ef-name ef)) + (or line-terminator (ef-line-terminator ef)) + (if (and octet-size (<= 1 octet-size 4)) + octet-size + (ef-octet-size ef)) + (ef-octets-to-char ef) + (ef-char-to-octets ef)))) + +;;; +;;; UTILS +;;; +(deftype octet () + '(unsigned-byte 8)) + +(deftype buffer-index () + 'fixnum) + +(defmacro add-external-format (name aliases ef) + (let (($alias$ (gensym "ALIAS"))) + `(progn + (setf (gethash ,name *external-formats*) ,ef) + (setf *external-format-list* (append *external-format-list* (list ,name))) + (dolist (,$alias$ ',aliases) + (assert (keywordp ,$alias$)) + (setf (gethash ,$alias$ *external-format-aliases*) ,name))))) + +(defmacro define-external-format (name aliases octet-size octets-to-char char-to-octets + &key (line-terminator *default-line-terminator*)) + (let (($ef$ (gensym "EF"))) + `(macrolet ((to-char (&body body) + `(lambda (input output error-fn bytes-left) + (declare (type (function () octet) input) + (type (function (character) t) output) + (type (function (symbol) character) error-fn) + (type buffer-index bytes-left) + (ignorable input output error-fn bytes-left)) + ,@body)) + (to-octets (&body body) + `(lambda (input output error-fn chars-left) + (declare (type (function () character) input) + (type (function (octet) t) output) + (type (function (symbol) character) error-fn) + (type buffer-index chars-left) + (ignorable input output error-fn chars-left)) + ,@body))) + (let ((,$ef$ (%make-external-format ,name ,line-terminator ,octet-size + ,octets-to-char ,char-to-octets))) + (add-external-format ,name ,aliases ,$ef$))))) + +(defun find-external-format (name &optional (error-p t)) + (when (external-format-p name) + (return-from find-external-format name)) + + (when (eq name :default) + (setq name *default-external-format*)) + (when (stringp name) + (setf name (iolib-utils:ensure-keyword name))) + + (or (gethash name *external-formats*) + (gethash (gethash name *external-format-aliases*) + *external-formats*) + (if error-p (error "External format ~S not found." name) nil))) + +;;; +;;; +;;; EXTERNAL FORMATS +;;; +;;; + +(define-condition void-external-format (error) () + (:report + (lambda (condition stream) + (declare (ignore condition)) + (format stream "Attempting I/O through void external-format.")))) + +(define-external-format :void () 0 + (to-char + (error 'void-external-format)) + (to-octets + (error 'void-external-format))) + +(define-external-format :ascii (:us-ascii) 1 + (to-char + (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))) + (let ((code (funcall input))) + (if (< code 128) + (funcall output (aref +iso-8859-1-table+ code)) + (funcall output (funcall error-fn 'illegal-code-point))))) + (to-octets + (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))) + (let ((code (char-code (funcall input)))) + (if (< code 128) + (funcall output code) + (funcall output (char-code (funcall error-fn 'illegal-character))))))) + +(define-external-format :iso-8859-1 (:iso8859-1 :ISO_8859-1 :latin1 :l1 + :csISOLatin1 :iso-ir-100 :CP819) 1 + (to-char + (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))) + (let ((code (funcall input))) + (funcall output (aref +iso-8859-1-table+ code)))) + (to-octets + (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))) + (let ((code (char-code (funcall input)))) + (if (< code 256) + (funcall output code) + (funcall output (char-code (funcall error-fn 'illegal-character))))))) + +(defmacro define-iso-8859-external-formats (indexes) + (flet ((get-name-and-aliases (index) + (if (endp index) + (values index nil) + (values (car index) + (cdr index))))) + `(progn + ,@(loop :for i :in indexes + :collect + (multiple-value-bind (index aliases) (get-name-and-aliases i) + (let ((table (iolib-utils:concat-symbol "+iso-8859-" index "-table+")) + (name (iolib-utils:ensure-keyword + (concatenate 'string "ISO-8859-" index)))) + (push (iolib-utils:ensure-keyword + (concatenate 'string "ISO8859-" index)) + aliases) + `(define-external-format ,name ,aliases 1 + (to-char + (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))) + (let ((code (funcall input))) + (funcall output (aref ,table code)))) + (to-octets + (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))) + (let ((code (position (funcall input) ,table))) + (if code + (funcall output code) + (funcall output + (position (funcall error-fn 'illegal-character) + ,table)))))))))))) + +(define-iso-8859-external-formats + (("2" :ISO_8859-2 :latin2 :l2 :csISOLatin2 :iso-ir-101) + ("3" :ISO_8859-3 :latin3 :l3 :csISOLatin3 :iso-ir-109) + ("4" :ISO_8859-4 :latin4 :l4 :csISOLatin4 :iso-ir-110) + ("5" :ISO_8859-5 :cyrillic :csISOLatinCyrillic :iso-ir-144) + ("6" :ISO_8859-6 :arabic :csISOLatinArabic :iso-ir-127) + ("7" :ISO_8859-7 :greek :greek8 :csISOLatinGreek :iso-ir-126) + ("8" :ISO_8859-8 :hebrew :csISOLatinHebrew :iso-ir-138) + ("9" :ISO_8859-9 :latin5 :l5 :csISOLatin5 :iso-ir-148) + ("10" :ISO_8859-10 :latin6 :l6 :csISOLatin6 :iso-ir-157) + ("11" :ISO_8859-11 :thai :csISOLatinThai :iso-ir-166) + ("13" :ISO_8859-13 :baltic :csISOLatinBaltic :iso-ir-179) + ("14" :ISO_8859-14 :iso-celtic :latin8 :l8 :csISOLatinCeltic :iso-ir-199) + ("15" :ISO_8859-15 :latin9 :l9 :csISOLatin9 :iso-ir-203) + ("16" :ISO_8859-16 :latin10 :l10 :csISOLatin10 :iso-ir-226))) + +(iolib-utils:define-constant +max-unicode-code-point+ #x10FFFF) + +(declaim (inline illegal-unicode-code-point)) +(defun illegal-unicode-code-point (code) + (declare (type (unsigned-byte 32) code)) + (or (<= #xD800 code #xDFFF) + (= code #xFFFE) + (= code #xFFFF) + (> code +max-unicode-code-point+))) + +(define-external-format :utf-8 (:utf8) 2 + (to-char + (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))) + (block utf-8-decode + (let ((code 0) (bytes-needed nil) + (byte0 0) (byte1 0) + (byte2 0) (byte3 0)) + (declare (type octet byte0 byte1 byte2 byte3)) + (labels ((decode-err (sym) + (return-from utf-8-decode + (funcall output (funcall error-fn sym)))) + (utf-8-byte-len (code) + (declare (type octet code)) + (cond + ((not (logbitp 7 code)) 1) + ((= (logand code #b11100000) #b11000000) 2) + ((= (logand code #b11110000) #b11100000) 3) + ((= (logand code #b11111000) #b11110000) 4) + (t (decode-err 'invalid-starter-octet)))) + (valid-secondary-check (byte) + (or (= (logand byte #b11000000) #b10000000) + (decode-err 'invalid-continuation-octet))) + (overlong-check (starter mask) + (or (/= starter byte0) + (/= (logior byte1 mask) mask) + (decode-err 'overlong-octet-sequence)))) + (macrolet ((put-and-check-valid-secondary-bytes (&rest places) + `(progn ,@(reduce #'append places + :key #'(lambda (x) `((setf ,x (funcall input)) + (valid-secondary-check ,x))))))) + (setf byte0 (funcall input) + bytes-needed (utf-8-byte-len byte0)) + (when (< bytes-left bytes-needed) + (decode-err 'end-of-input-in-character)) + (case bytes-needed + (1 (setf code byte0)) + (2 (put-and-check-valid-secondary-bytes byte1) + (overlong-check #b11000000 #b10111111) + (overlong-check #b11000001 #b10111111) + (setf code (logior (ash (ldb (byte 5 0) byte0) 6) + (ldb (byte 6 0) byte1)))) + (3 (put-and-check-valid-secondary-bytes byte1 byte2) + (overlong-check #b11100000 #b10011111) + (setf code (logior (ash (ldb (byte 4 0) byte0) 12) + (ash (ldb (byte 6 0) byte1) 6) + (ldb (byte 6 0) byte2))) + (when (illegal-unicode-code-point code) + (decode-err 'illegal-code-point))) + (4 (put-and-check-valid-secondary-bytes byte1 byte2 byte3) + (overlong-check #b11110000 #b10001111) + (setf code (logior (ash (ldb (byte 3 0) byte0) 18) + (ash (ldb (byte 6 0) byte1) 12) + (ash (ldb (byte 6 0) byte2) 6) + (ldb (byte 6 0) byte3))))) + (funcall output (code-char code))))))) + (to-octets + (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))) + (let ((code (char-code (funcall input)))) + (when (illegal-unicode-code-point code) + (setf code (char-code (funcall error-fn 'illegal-character)))) + (cond + ((< code #x80) + (funcall output code)) + ((< code #x800) + (funcall output (logior #xC0 (ldb (byte 5 6) code))) + (funcall output (logior #x80 (ldb (byte 6 0) code)))) + ((< code #x10000) + (funcall output (logior #xE0 (ldb (byte 4 12) code))) + (funcall output (logior #x80 (ldb (byte 6 6) code))) + (funcall output (logior #x80 (ldb (byte 6 0) code)))) + ((< code #x200000) + (funcall output (logior #xF0 (ldb (byte 3 18) code))) + (funcall output (logior #x80 (ldb (byte 6 12) code))) + (funcall output (logior #x80 (ldb (byte 6 6) code))) + (funcall output (logior #x80 (ldb (byte 6 0) code)))))))) + +(define-external-format :utf-16 (:utf16 :utf-16be :utf16be) 2 + (to-char + (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))) + (block utf-16-decode + (flet ((read-word () + (+ (ash (funcall input) 8) (funcall input))) + (decode-err (sym) + (return-from utf-16-decode + (funcall output (funcall error-fn sym))))) + (macrolet ((put-word (word bytes-needed) + `(progn (when (> ,bytes-needed bytes-left) + (decode-err 'end-of-input-in-character)) + (setf ,word (read-word))))) + (let ((code 0) (w0 0) (w1 0)) + (declare (type (unsigned-byte 32) code) + (type (unsigned-byte 16) w0 w1)) + (put-word w0 2) + (cond ((not (<= #xD800 w0 #xDFFF)) + (setf code w0)) + ((> w0 #xDBFF) + (decode-err 'invalid-starter-octet)) + (t (put-word w1 4) + (if (<= #xDC00 w1 #xDFFF) + (setf code (+ (ash (ldb (byte 10 0) w0) 10) + (ldb (byte 10 0) w1) + #x10000)) + (decode-err 'invalid-continuation-octet)))) + (funcall output (code-char code))))))) + (to-octets + (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))) + (flet ((write-word (word) + (funcall output (ldb (byte 8 8) word)) + (funcall output (ldb (byte 8 0) word)))) + (let ((code (char-code (funcall input)))) + (when (illegal-unicode-code-point code) + (setf code (char-code (funcall error-fn 'illegal-character)))) + (cond ((< code #x10000) + (write-word code)) + (t (decf code #x10000) + (write-word (logior #xD800 (ldb (byte 10 10) code))) + (write-word (logior #xDC00 (ldb (byte 10 0) code))))))))) + +(define-external-format :utf-16le (:utf16le) 2 + (to-char + (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))) + (block utf-16-decode + (flet ((read-word () + (+ (funcall input) (ash (funcall input) 8))) + (decode-err (sym) + (return-from utf-16-decode + (funcall output (funcall error-fn sym))))) + (macrolet ((put-word (word bytes-needed) + `(progn (when (> ,bytes-needed bytes-left) + (decode-err 'end-of-input-in-character)) + (setf ,word (read-word))))) + (let ((code 0) (w0 0) (w1 0)) + (declare (type (unsigned-byte 32) code) + (type (unsigned-byte 16) w0 w1)) + (put-word w0 2) + (cond ((not (<= #xD800 w0 #xDFFF)) + (setf code w0)) + ((> w0 #xDBFF) + (decode-err 'invalid-starter-octet)) + (t (put-word w1 4) + (if (<= #xDC00 w1 #xDFFF) + (setf code (+ (ash (ldb (byte 10 0) w0) 10) + (ldb (byte 10 0) w1) + #x10000)) + (decode-err 'invalid-continuation-octet)))) + (funcall output (code-char code))))))) + (to-octets + (declare (optimize (speed 3) (space 0) (safety 0) (debug 0))) + (flet ((write-word (word) + (funcall output (ldb (byte 8 0) word)) + (funcall output (ldb (byte 8 8) word)))) + (let ((code (char-code (funcall input)))) + (when (illegal-unicode-code-point code) + (setf code (char-code (funcall error-fn 'illegal-character)))) + (cond ((< code #x10000) + (write-word code)) + (t (decf code #x10000) + (write-word (logior #xD800 (ldb (byte 10 10) code))) + (write-word (logior #xDC00 (ldb (byte 10 0) code))))))))) + +;;; +;;; +;;; CONVERSION FUNCTIONS +;;; +;;; + +;; +;; OCTETS-TO-CHAR +;; + +(defmacro octets-to-char (external-format input output error-fn bytes-left) + `(funcall (ef-octets-to-char ,external-format) ,input ,output ,error-fn + ,bytes-left)) + +(defun read-replacement-char () + (format *query-io* "Enter a replacement character(evaluated): ") + (finish-output *query-io*) + (list (eval (read *query-io*)))) + +(defun %octets-to-string (buffer string start end ef &optional max-char-num) + (declare (type et:foreign-pointer buffer) + (type buffer-index start end) + (type external-format ef) + (type (or null signed-byte) max-char-num) + (optimize (speed 3) (space 0) (safety 0) (debug 0))) + (unless max-char-num (setf max-char-num -1)) + (let ((ptr start) + (pos -1) + (char-count -1) + oldpos oldptr) + (tagbody + (flet ((input () + (prog1 (cffi:mem-aref buffer :uint8 ptr) (incf ptr))) + (output (char) + (setf (char string (incf pos)) char)) + (error-fn (symbol) + (restart-case + (error symbol :array buffer + :start start :end end + :position oldptr + :external-format (ef-name ef)) + (use-value (s) + :report "Supply a replacement character." + :interactive read-replacement-char + s) + (use-standard-unicode-replacement () + :report "Use standard UCS replacement character" + (code-char #xFFFD)) + (stop-decoding () + :report "Stop decoding and return to last good offset." + (setf pos oldpos) + (go :exit))))) + (loop :while (and (< ptr end) + (/= (incf char-count) max-char-num)) + :do (setf oldpos pos + oldptr ptr) + (octets-to-char ef #'input #'output #'error-fn (- end ptr)))) + :exit + (return-from %octets-to-string (values (1+ pos) (- ptr start)))))) + +(defun octets-to-string (octets + &key (start 0) end + (external-format :default) + (auto-correct nil)) + (setf octets (coerce octets '(simple-array octet (*)))) + (check-type start buffer-index) + (check-type end (or null buffer-index)) + (let ((ef (find-external-format external-format)) + (end (or end (length octets))) + (string nil)) + (assert (<= start end)) + (setf string (make-string (- end start))) + (cffi:with-pointer-to-vector-data (octets-ptr octets) + (let ((pos (if auto-correct + (handler-bind ((octet-decoding-error + #'(lambda (error) + (declare (ignore error)) + (invoke-restart 'use-value #\?)))) + (%octets-to-string octets-ptr string start end ef)) + (%octets-to-string octets-ptr string start end ef)))) + (shrink-vector string pos))))) + +;; +;; CHAR-TO-OCTETS +;; + +(defmacro char-to-octets (ef input output error-fn chars-left) + `(funcall (ef-char-to-octets ,ef) ,input ,output ,error-fn + ,chars-left)) + +(defun string-to-octets (string &key (start 0) end + (external-format :default) + adjust-factor) + (declare (type string string) + (type buffer-index start) + (type (or null buffer-index) end) + (type (or null real) adjust-factor) + (optimize (speed 3) (space 0) (safety 0) (debug 0))) + (let* ((ef (find-external-format external-format)) + (buffer (make-array (1+ (length string)) + :element-type 'octet + :adjustable t)) + (adjust-threshold (length string)) + (ptr start) + (pos -1) + oldpos oldptr) + (setf adjust-factor (if (and adjust-factor (<= 1 adjust-factor 4)) + adjust-factor + (ef-octet-size ef)) + end (or end (length string))) + (tagbody + (flet ((input () + (prog1 (char string ptr) (incf ptr))) + (output (octet) + (setf (aref buffer (incf pos)) octet) + (when (= pos adjust-threshold) + (setf adjust-threshold (truncate (* adjust-factor (1+ pos)))) + (setf buffer (adjust-array buffer adjust-threshold)))) + (error-fn (symbol) + (restart-case + (error symbol :array buffer + :start start :end end + :position oldptr + :external-format (ef-name ef)) + (use-value (s) + :report "Supply a replacement character." + :interactive read-replacement-char + s) + (use-standard-unicode-replacement () + :report "Use standard UCS replacement character" + (code-char #xFFFD)) + (stop-decoding () + :report "Stop decoding and return to last good offset." + (setf pos oldpos) + (go :exit))))) + (loop :while (< ptr end) + :do (setf oldpos pos + oldptr ptr) + (char-to-octets ef #'input #'output #'error-fn (- end ptr)))) + :exit (return-from string-to-octets (shrink-vector buffer (1+ pos)))))) diff --git a/io.encodings/iso-8859-tables.lisp b/io.encodings/iso-8859-tables.lisp new file mode 100644 index 0000000..24825c1 --- /dev/null +++ b/io.encodings/iso-8859-tables.lisp @@ -0,0 +1,44 @@ +;; -*- Mode: Lisp; Syntax: ANSI-Common-Lisp -*- + +;; This code is free software; you can redistribute it and/or +;; modify it under the terms of the version 2.1 of +;; the GNU Lesser General Public License as published by +;; the Free Software Foundation, as clarified by the +;; preamble found here: +;; http://opensource.franz.com/preamble.html +;; +;; This program is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. +;; +;; You should have received a copy of the GNU Lesser General +;; Public License along with this library; if not, write to the +;; Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, +;; Boston, MA 02110-1301, USA + +(in-package :io.encodings) + +(defmacro define-iso-8859-tables (tables) + `(progn + ,@(loop :for (index table) :in tables + :collect + (let ((name (iolib-utils:concat-symbol "+iso-8859-" index "-table+"))) + `(iolib-utils:define-constant ,name (map 'string #'code-char ,table)))))) + +(define-iso-8859-tables + (("1" #(0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255)) + ("2" #(0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 260 728 321 164 317 346 167 168 352 350 356 377 173 381 379 176 261 731 322 180 318 347 711 184 353 351 357 378 733 382 380 340 193 194 258 196 313 262 199 268 201 280 203 282 205 206 270 272 323 327 211 212 336 214 215 344 366 218 368 220 221 354 223 341 225 226 259 228 314 263 231 269 233 281 235 283 237 238 271 273 324 328 243 244 337 246 247 345 367 250 369 252 253 355 729)) + ("3" #(0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 294 728 163 164 65533 292 167 168 304 350 286 308 173 65533 379 176 295 178 179 180 181 293 183 184 305 351 287 309 189 65533 380 192 193 194 65533 196 266 264 199 200 201 202 203 204 205 206 207 65533 209 210 211 212 288 214 215 284 217 218 219 220 364 348 223 224 225 226 65533 228 267 265 231 232 233 234 235 236 237 238 239 65533 241 242 243 244 289 246 247 285 249 250 251 252 365 349 729)) + ("4" #(0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 260 312 342 164 296 315 167 168 352 274 290 358 173 381 175 176 261 731 343 180 297 316 711 184 353 275 291 359 330 382 331 256 193 194 195 196 197 198 302 268 201 280 203 278 205 206 298 272 325 332 310 212 213 214 215 216 370 218 219 220 360 362 223 257 225 226 227 228 229 230 303 269 233 281 235 279 237 238 299 273 326 333 311 244 245 246 247 248 371 250 251 252 361 363 729)) + ("5" #(0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 173 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 8470 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 167 1118 1119)) + ("6" #(0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 65533 65533 65533 164 65533 65533 65533 65533 65533 65533 65533 1548 173 65533 65533 65533 65533 65533 65533 65533 65533 65533 65533 65533 65533 65533 1563 65533 65533 65533 1567 65533 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 65533 65533 65533 65533 65533 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 65533 65533 65533 65533 65533 65533 65533 65533 65533 65533 65533 65533 65533)) + ("7" #(0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 8216 8217 163 8364 8367 166 167 168 169 890 171 172 173 65533 8213 176 177 178 179 900 901 902 183 904 905 906 187 908 189 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 65533 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 65533)) + ("8" #(0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 65533 162 163 164 165 166 167 168 169 215 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 247 187 188 189 190 65533 65533 65533 65533 65533 65533 65533 65533 65533 65533 65533 65533 65533 65533 65533 65533 65533 65533 65533 65533 65533 65533 65533 65533 65533 65533 65533 65533 65533 65533 65533 65533 8215 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 65533 65533 8206 8207 65533)) + ("9" #(0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 286 209 210 211 212 213 214 215 216 217 218 219 220 304 350 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 287 241 242 243 244 245 246 247 248 249 250 251 252 305 351 255)) + ("10" #(0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 260 274 290 298 296 310 167 315 272 352 358 381 173 362 330 176 261 275 291 299 297 311 183 316 273 353 359 382 8213 363 331 256 193 194 195 196 197 198 302 268 201 280 203 278 205 206 207 208 325 332 211 212 213 214 360 216 370 218 219 220 221 222 223 257 225 226 227 228 229 230 303 269 233 281 235 279 237 238 239 240 326 333 243 244 245 246 361 248 371 250 251 252 253 254 312)) + ("11" #(0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 65533 65533 65533 65533 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 65533 65533 65533 65533)) + ("13" #(0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 8221 162 163 164 8222 166 167 216 169 342 171 172 173 174 198 176 177 178 179 8220 181 182 183 248 185 343 187 188 189 190 230 260 302 256 262 196 197 280 274 268 201 377 278 290 310 298 315 352 323 325 211 332 213 214 215 370 321 346 362 220 379 381 223 261 303 257 263 228 229 281 275 269 233 378 279 291 311 299 316 353 324 326 243 333 245 246 247 371 322 347 363 252 380 382 8217)) + ("14" #(0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 7682 7683 163 266 267 7690 167 7808 169 7810 7691 7922 173 174 376 7710 7711 288 289 7744 7745 182 7766 7809 7767 7811 7776 7923 7812 7813 7777 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 372 209 210 211 212 213 214 7786 216 217 218 219 220 221 374 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 373 241 242 243 244 245 246 7787 248 249 250 251 252 253 375 255)) + ("15" #(0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 8364 165 352 167 353 169 170 171 172 173 174 175 176 177 178 179 381 181 182 183 382 185 186 187 338 339 376 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255)) + ("16" #(0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 260 261 321 8364 8222 352 167 353 169 536 171 377 173 378 379 176 177 268 322 381 8221 182 183 382 269 537 187 338 339 376 380 192 193 194 258 196 262 198 199 200 201 202 203 204 205 206 207 272 323 210 211 212 336 214 346 368 217 218 219 220 280 538 223 224 225 226 259 228 263 230 231 232 233 234 235 236 237 238 239 273 324 242 243 244 337 246 347 369 249 250 251 252 281 539 255)))) diff --git a/io.encodings/parse-mappings.lisp b/io.encodings/parse-mappings.lisp new file mode 100644 index 0000000..79e3850 --- /dev/null +++ b/io.encodings/parse-mappings.lisp @@ -0,0 +1,49 @@ +;; -*- Mode: Lisp; Syntax: ANSI-Common-Lisp -*- + +;; This code is free software; you can redistribute it and/or +;; modify it under the terms of the version 2.1 of +;; the GNU Lesser General Public License as published by +;; the Free Software Foundation, as clarified by the +;; preamble found here: +;; http://opensource.franz.com/preamble.html +;; +;; This program is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. +;; +;; You should have received a copy of the GNU Lesser General +;; Public License along with this library; if not, write to the +;; Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, +;; Boston, MA 02110-1301, USA + +(eval-when (:compile-toplevel :load-toplevel :execute) + (asdf:oos 'asdf:load-op :split-sequence)) + +(defpackage :parse-encodings + (:use :cl :split-sequence) + (:export #:parse-unicode-mapping)) + +(in-package :parse-encodings) + +(defvar +unicode-replacement-char-code+ #xFFFD) + +(defun parse-unicode-mapping (file &optional (array-size 256)) + (flet ((parse-hex (str) + (assert (string-equal "0x" (string-downcase (subseq str 0 2)))) + (parse-integer (subseq str 2) :radix 16)) + (sanitize-line (line) + (when line + (string-left-trim '(#\space) + (nsubstitute #\space #\tab line))))) + (let ((arr (make-array array-size :element-type '(unsigned-byte 32) + :initial-element +unicode-replacement-char-code+))) + (with-open-file (fin file) + (loop :for line = (sanitize-line (read-line fin nil nil)) + :while line :do + (when (and (plusp (length line)) (char= #\0 (char line 0))) + (let* ((split (split-sequence #\space line :remove-empty-subseqs t)) + (index (parse-hex (first split))) + (code (parse-hex (second split)))) + (setf (aref arr index) code))))) + arr))) -- 2.11.4.GIT