Draft NEWS for sbcl-2.4.9
[sbcl.git] / src / code / octets.lisp
blobe00016a823fa9ba79aa40c319159d4be1d3180ba
1 ;;;; code for string to octet conversion
3 ;;;; This software is part of the SBCL system. See the README file for
4 ;;;; more information.
5 ;;;;
6 ;;;; This software is derived from the CMU CL system, which was
7 ;;;; written at Carnegie Mellon University and released into the
8 ;;;; public domain. The software is in the public domain and is
9 ;;;; provided with absolutely no warranty. See the COPYING and CREDITS
10 ;;;; files for more information.
12 ;;; FIXME: The latin9 stuff is currently #+sb-unicode, because I
13 ;;; don't like the idea of trying to do CODE-CHAR #x<big>. Is that a
14 ;;; justified fear? Can we arrange that it's caught and converted to
15 ;;; a decoding error error? Or should we just give up on non-Unicode
16 ;;; builds?
18 (in-package "SB-IMPL")
21 ;;;; conditions
23 ;;; encoding condition
25 (eval-when (:compile-toplevel :load-toplevel :execute)
26 (defparameter *safety-0* '(safety 0)))
28 (define-condition octets-encoding-error (character-encoding-error)
29 ((string :initarg :string :reader octets-encoding-error-string)
30 (position :initarg :position :reader octets-encoding-error-position)
31 (external-format :initarg :external-format
32 :reader octets-encoding-error-external-format))
33 (:report (lambda (c s)
34 (format s "Unable to encode character ~A as ~S."
35 (char-code (char (octets-encoding-error-string c)
36 (octets-encoding-error-position c)))
37 (octets-encoding-error-external-format c)))))
39 (defun encoding-replacement-octetify (thing external-format)
40 (etypecase thing
41 ;; TODO: make sure EXTERNAL-FORMAT here does not have a REPLACEMENT, and handle
42 ;; encoding errors explicitly
43 (character
44 (string-to-octets (string thing) :external-format external-format))
45 (string
46 (string-to-octets thing :external-format external-format))
47 ((unsigned-byte 8) (make-array 1 :element-type '(unsigned-byte 8) :initial-element thing))
48 (sequence (coerce thing '(simple-array (unsigned-byte 8) 1)))))
50 (declaim (ftype (sfunction (t t t t) (simple-array (unsigned-byte 8) 1))
51 encoding-error))
52 (defun encoding-error (external-format replacement string pos)
53 (flet ((replacement (replacement) (encoding-replacement-octetify replacement external-format)))
54 (if replacement
55 (replacement replacement)
56 (restart-case
57 (error 'octets-encoding-error
58 :external-format external-format
59 :string string
60 :position pos)
61 (use-value (replacement)
62 :report "Supply a set of bytes to use in place of the invalid one."
63 :interactive
64 (lambda ()
65 (read-evaluated-form
66 "Replacement byte, bytes, character, or string (evaluated): "))
67 (replacement replacement))))))
69 ;;; decoding condition
71 ;;; for UTF8, the specific condition signalled will be a generalized
72 ;;; instance of one of the following:
73 ;;;
74 ;;; end-of-input-in-character
75 ;;; character-out-of-range
76 ;;; invalid-utf8-starter-byte
77 ;;; invalid-utf8-continuation-byte
78 ;;;
79 ;;; Of these, the only one truly likely to be of interest to calling
80 ;;; code is end-of-input-in-character (in which case it's likely to
81 ;;; want to make a note of octet-decoding-error-start, supply "" as a
82 ;;; replacement string, and then move that last chunk of bytes to the
83 ;;; beginning of its buffer for the next go round) but they're all
84 ;;; provided on the off chance they're of interest.
86 (define-condition octet-decoding-error (character-decoding-error)
87 ((array :initarg :array :accessor octet-decoding-error-array)
88 (start :initarg :start :accessor octet-decoding-error-start)
89 (end :initarg :end :accessor octet-decoding-error-end)
90 (position :initarg :pos :accessor octet-decoding-bad-byte-position)
91 (external-format :initarg :external-format
92 :accessor octet-decoding-error-external-format))
93 (:report
94 (lambda (condition stream)
95 (format stream "Illegal ~S character starting at byte position ~D."
96 (octet-decoding-error-external-format condition)
97 (octet-decoding-error-start condition)))))
99 (define-condition end-of-input-in-character (octet-decoding-error) ())
100 (define-condition character-out-of-range (octet-decoding-error) ())
101 (define-condition invalid-utf8-starter-byte (octet-decoding-error) ())
102 (define-condition invalid-utf8-continuation-byte (octet-decoding-error) ())
103 (define-condition overlong-utf8-sequence (octet-decoding-error) ())
105 (defun decoding-replacement-stringify (thing external-format)
106 (etypecase thing
107 (character (string thing))
108 (string thing)
109 ;; TODO: make sure EXTERNAL-FORMAT here does not have a REPLACEMENT, and handle
110 ;; decoding errors explicitly
111 ((unsigned-byte 8)
112 (let ((octets (make-array 1 :element-type '(unsigned-byte 8) :initial-element thing)))
113 (octets-to-string octets :external-format external-format)))
114 (sequence
115 (let ((octets (coerce thing '(simple-array (unsigned-byte 8) 1))))
116 (octets-to-string octets :external-format external-format)))))
118 (defun decoding-error (array start end external-format replacement reason pos)
119 (flet ((replacement (thing) (decoding-replacement-stringify thing external-format)))
120 (if replacement
121 (replacement replacement)
122 (restart-case
123 (error reason
124 :external-format external-format
125 :array array
126 :start start
127 :end end
128 :pos pos)
129 (use-value (replacement)
130 :report "Supply a replacement string."
131 :interactive
132 (lambda ()
133 (read-evaluated-form
134 "Replacement byte, bytes, character, or string (evaluated): "))
135 (replacement replacement))))))
137 ;;; Utilities used in both to-string and to-octet conversions
139 (defmacro instantiate-octets-definition (definer)
140 `(progn
141 (,definer aref (simple-array (unsigned-byte 8) (*)))
142 (,definer sap-ref-8 system-area-pointer)))
144 (eval-when (:compile-toplevel :load-toplevel :execute)
145 (defun make-od-name (sym1 sym2)
146 (package-symbolicate (cl:symbol-package sym1) sym1 "-" sym2)))
148 ;;;; to-octets conversions
150 ;;; to latin (including ascii)
152 ;;; Converting bytes to character codes is easy: just use a 256-element
153 ;;; lookup table that maps each possible byte to its corresponding
154 ;;; character code.
156 ;;; Converting character codes to bytes is a little harder, since the
157 ;;; codes may be spare (e.g. we use codes 0-127, 3490, and 4598). The
158 ;;; previous version of this macro utilized a gigantic CASE expression
159 ;;; to do the hard work, with the result that the code was huge (since
160 ;;; SBCL's then-current compilation strategy for CASE expressions was
161 ;;; (and still is) converting CASE into COND into if-the-elses--which is
162 ;;; also inefficient unless your code happens to occur very early in the
163 ;;; chain.
165 ;;; The current strategy is to build a table:
167 ;;; [ ... code_1 byte_1 code_2 byte_2 ... code_n byte_n ... ]
169 ;;; such that the codes are sorted in order from lowest to highest. We
170 ;;; can then binary search the table to discover the appropriate byte
171 ;;; for a character code. We also implement an optimization: all unibyte
172 ;;; mappings do not remap ASCII (0-127) and some do not remap part of
173 ;;; the range beyond character code 127. So we check to see if the
174 ;;; character code falls into that range first (a quick check, since
175 ;;; character codes are guaranteed to be positive) and then do the binary
176 ;;; search if not. This optimization also enables us to cut down on the
177 ;;; size of our lookup table.
178 (defmacro define-unibyte-mapper (byte-char-name code-byte-name &rest exceptions)
179 (let* (;; Build a list of (CODE BYTE) pairs
180 (pairs (loop for byte below 256
181 for code = (let ((exception (cdr (assoc byte exceptions))))
182 (cond
183 ((car exception) (car exception))
184 ((null exception) byte)
185 (t nil)))
186 when code collect (list code byte) into elements
187 finally (return elements)))
188 ;; Find the smallest character code such that the corresponding
189 ;; byte is != to the code.
190 (lowest-non-equivalent-code
191 (caar (sort (copy-seq exceptions) #'< :key #'car)))
192 ;; Sort them for our lookup table.
193 (sorted-pairs (sort (subseq pairs lowest-non-equivalent-code)
194 #'< :key #'car))
195 ;; Create the lookup table.
196 (sorted-lookup-table
197 (reduce #'append sorted-pairs :from-end t :initial-value nil)))
198 `(progn
199 ;; We *could* inline this, but it's not obviously the right thing,
200 ;; because each use of the inlined function in a different file
201 ;; would be forced to dump the large-ish array. To do things like
202 ;; this, you generally want a load-time ref to a global constant.
203 ;(declaim (inline ,byte-char-name))
204 (defun ,byte-char-name (byte)
205 (declare (optimize speed #.*safety-0*)
206 (type (unsigned-byte 8) byte))
207 ,(let ((byte-to-code
208 (loop for byte below 256
209 collect (let ((exception (cdr (assoc byte exceptions))))
210 (if exception
211 (car exception)
212 byte)))))
213 (if (position nil byte-to-code)
214 ;; There are bytes with no translation. Represent "missing"
215 ;; as -1 when stored, convert to NIL when accessed.
216 ;; We could use a single otherwise-unused point to mean NIL,
217 ;; but it would be confusing if in one table #xFFFF represents
218 ;; NIL and another #xF00D represents NIL.
219 `(let ((code (aref ,(sb-c::coerce-to-smallest-eltype
220 (substitute -1 nil byte-to-code))
221 byte)))
222 (if (>= code 0) code))
223 ;; Every byte has a translation
224 `(aref ,(sb-c::coerce-to-smallest-eltype byte-to-code)
225 byte))))
226 (defun ,code-byte-name (code)
227 (declare (optimize speed #.*safety-0*)
228 (%char-code code))
229 (if (< code ,lowest-non-equivalent-code)
230 code
231 (loop with code-to-byte-table =
232 ,(sb-c::coerce-to-smallest-eltype sorted-lookup-table)
233 with low = 0
234 with high = (- (length code-to-byte-table) 2)
235 while (< low high)
236 do (let ((mid (logandc2 (truncate (+ low high 2) 2) 1)))
237 (if (< code (aref code-to-byte-table mid))
238 (setf high (- mid 2))
239 (setf low mid)))
240 finally (return (if (eql code (aref code-to-byte-table low))
241 (aref code-to-byte-table (1+ low))
242 nil))))))))
244 (declaim (inline get-latin-bytes))
245 (defun get-latin-bytes (mapper external-format replacement string pos)
246 (let ((code (funcall mapper (char-code (char string pos)))))
247 (declare (type (or null %char-code) code))
248 (values (cond
249 ((and code (< code 256)) code)
251 (encoding-error external-format replacement string pos)))
252 1)))
254 (declaim (inline string->latin%))
255 (defun string->latin% (string sstart send get-bytes null-padding replacement)
256 (declare (optimize speed)
257 (type simple-string string)
258 (type index sstart send)
259 (type (integer 0 1) null-padding)
260 (type function get-bytes))
261 ;; The latin encodings are all unibyte encodings, so just directly
262 ;; compute the number of octets we're going to generate.
263 (let ((octets (make-array (+ (- send sstart) null-padding)
264 ;; This takes care of any null padding the
265 ;; caller requests.
266 :initial-element 0
267 :element-type '(unsigned-byte 8)))
268 (index 0)
269 (error-position 0)
270 (error-replacement))
271 (tagbody
272 :no-error
273 (loop for pos of-type index from sstart below send
274 do (let ((byte (funcall get-bytes string pos replacement)))
275 (typecase byte
276 ((unsigned-byte 8)
277 (locally (declare (optimize (sb-c:insert-array-bounds-checks 0)))
278 (setf (aref octets index) byte)))
279 ((simple-array (unsigned-byte 8) (*))
280 ;; KLUDGE: We ran into encoding errors. Bail and do
281 ;; things the slow way (does anybody actually use this
282 ;; functionality besides our own test suite?).
283 (setf error-position pos error-replacement byte)
284 (go :error)))
285 (incf index))
286 finally (return-from string->latin% octets))
287 :error
288 ;; We have encoded INDEX octets so far and we ran into an
289 ;; encoding error at ERROR-POSITION; the user has asked us to
290 ;; replace the expected output with ERROR-REPLACEMENT.
291 (let ((new-octets (make-array (* index 2)
292 :element-type '(unsigned-byte 8)
293 :adjustable t :fill-pointer index)))
294 (replace new-octets octets)
295 (flet ((extend (thing)
296 (typecase thing
297 ((unsigned-byte 8) (vector-push-extend thing new-octets))
298 ((simple-array (unsigned-byte 8) (*))
299 (dotimes (i (length thing))
300 (vector-push-extend (aref thing i) new-octets))))))
301 (extend error-replacement)
302 (loop for pos of-type index from (1+ error-position) below send
303 do (extend (funcall get-bytes string pos replacement))
304 finally (return-from string->latin%
305 (progn
306 (unless (zerop null-padding)
307 (vector-push-extend 0 new-octets))
308 (copy-seq new-octets)))))))))
310 ;;;; to-string conversions
312 ;;; from latin (including ascii)
314 (defmacro define-latin->string* (accessor type)
315 (let ((name (make-od-name 'latin->string* accessor)))
316 `(progn
317 (declaim (inline ,name))
318 (defun ,name (string sstart send array astart aend mapper)
319 (declare (optimize speed #.*safety-0*)
320 (type simple-string string)
321 (type ,type array)
322 (type array-range sstart send astart aend)
323 (function mapper))
324 (loop for spos from sstart below send
325 for apos from astart below aend
326 do (setf (char string spos)
327 (code-char (funcall mapper (,accessor array apos))))
328 finally (return (values string spos apos)))))))
329 (instantiate-octets-definition define-latin->string*)
331 (defmacro define-latin->string (accessor type)
332 (let ((name (make-od-name 'latin->string accessor)))
333 `(progn
334 (declaim (inline ,name))
335 (defun ,name (array astart aend mapper)
336 (declare (optimize speed #.*safety-0*)
337 (type ,type array)
338 (type array-range astart aend)
339 (type function mapper))
340 (let ((length (the array-range (- aend astart))))
341 (values (,(make-od-name 'latin->string* accessor) (make-string length) 0 length
342 array astart aend
343 mapper)))))))
344 (instantiate-octets-definition define-latin->string)
346 ;;;; external formats
348 (defvar *default-external-format* :utf-8)
350 (defun default-external-format ()
351 (/show0 "/getting default external format")
352 *default-external-format*)
355 ;;;; public interface
357 (defun maybe-defaulted-external-format (external-format)
358 (get-external-format-or-lose (if (eq external-format :default)
359 (default-external-format)
360 external-format)))
362 (declaim (inline %octets-to-string))
363 (declaim (ftype (sfunction (function (vector (unsigned-byte 8)) index sequence-end t)
364 (or (simple-array character (*))
365 (simple-array base-char (*))))
366 %octets-to-string))
367 (defun %octets-to-string (fun vector start end replacement)
368 (declare (explicit-check start end :result))
369 (with-array-data ((vector vector)
370 (start start)
371 (end end)
372 :check-fill-pointer t)
373 (declare (type (simple-array (unsigned-byte 8) (*)) vector))
374 (funcall fun vector start end replacement)))
375 (declaim (notinline %octets-to-string))
376 (defun octets-to-string (vector &key (external-format :default) (start 0) end)
377 "Return a string obtained by decoding VECTOR according to EXTERNAL-FORMAT.
379 If EXTERNAL-FORMAT is given, it must designate an external format.
381 If given, START and END must be bounding index designators and
382 designate a subsequence of VECTOR that should be decoded.
384 If some of the octets of VECTOR (or the subsequence bounded by START
385 and END) cannot be decoded by EXTERNAL-FORMAT an error of a subtype of
386 SB-INT:CHARACTER-DECODING-ERROR is signaled.
388 Note that for some values of EXTERNAL-FORMAT the length of the
389 returned string may be different from the length of VECTOR (or the
390 subsequence bounded by START and END)."
391 (let* ((ef (maybe-defaulted-external-format external-format))
392 (replacement (ef-replacement ef)))
393 (declare (inline %octets-to-string))
394 (%octets-to-string (ef-octets-to-string-fun ef) vector start end replacement)))
396 (declaim (inline %string-to-octets))
397 (declaim (ftype (sfunction (function string index sequence-end t t)
398 (simple-array (unsigned-byte 8) (*)))
399 %string-to-octets))
400 (defun %string-to-octets (fun string start end null-terminate replacement)
401 (declare (explicit-check start end :result))
402 (with-array-data ((string string)
403 (start start)
404 (end end)
405 :check-fill-pointer t)
406 (declare (type simple-string string))
407 (funcall fun string start end null-terminate replacement)))
408 (declaim (notinline %string-to-octets))
409 (defun string-to-octets (string &key (external-format :default)
410 (start 0) end null-terminate)
411 "Return an octet vector that is STRING encoded according to EXTERNAL-FORMAT.
413 If EXTERNAL-FORMAT is given, it must designate an external format.
415 If given, START and END must be bounding index designators and
416 designate a subsequence of STRING that should be encoded.
418 If NULL-TERMINATE is true, the returned octet vector ends with an
419 additional 0 element that does not correspond to any part of STRING.
421 If some of the characters of STRING (or the subsequence bounded by
422 START and END) cannot be encoded by EXTERNAL-FORMAT an error of a
423 subtype of SB-INT:CHARACTER-ENCODING-ERROR is signaled.
425 Note that for some values of EXTERNAL-FORMAT and NULL-TERMINATE the
426 length of the returned vector may be different from the length of
427 STRING (or the subsequence bounded by START and END)."
428 (declare (explicit-check start end :result))
429 (let* ((ef (maybe-defaulted-external-format external-format))
430 (replacement (ef-replacement ef)))
431 (declare (inline %string-to-octets))
432 (%string-to-octets (ef-string-to-octets-fun ef) string start end
433 (if null-terminate 1 0) replacement)))
435 ;;; Vector of all available EXTERNAL-FORMAT instances. Each format is named
436 ;;; by one or more keyword symbols. The mapping from symbol to index into this
437 ;;; vector is memoized into the symbol's :EXTERNAL-FORMAT property.
438 (define-load-time-global *external-formats* (make-array 60 :initial-element nil))
440 (defun register-external-format (names &rest args)
441 ;; TODO: compare-and-swap the entry if NAME already has an index
442 ;; specifying to demand-load this format from a fasl.
443 ;; All synonyms of that name will also references the loaded format.
444 (let* ((table *external-formats*)
445 (newline-variant (getf args :newline-variant))
446 (index (get (car names) :external-format))
447 (current-entry (and index (aref table index)))
448 (canonical-ef (if (consp current-entry) (car current-entry) current-entry))
449 (names (if canonical-ef (ef-names canonical-ef) names))
450 (ef (apply #'%make-external-format :names names args))
451 (free-index (or index (position nil table))))
452 (unless (eql newline-variant :lf)
453 (aver index)
454 (aver current-entry)
455 (unless (consp current-entry)
456 (setf current-entry (list current-entry)))
457 (setf (aref table index)
458 (cons (car current-entry)
459 (acons (list newline-variant) ef (cdr current-entry))))
460 (return-from register-external-format))
461 (dolist (name names)
462 (setf (get name :external-format) free-index))
463 (setf (aref table free-index) ef)))
465 ;;; This function was moved from 'fd-stream' because it depends on
466 ;;; the various error classes, two of which are defined just above.
467 ;;; XXX: Why does this get called with :DEFAULT and NIL when neither is
468 ;;; the name of any format? Shouldn't those be handled higher up,
469 ;;; or else this should return the actual default?
470 (defun get-external-format (external-format)
471 (let* ((external-format (ensure-list external-format))
472 (options (cdr external-format)))
473 (unless (symbolp (car external-format))
474 (return-from get-external-format nil))
475 (loop for (option value) on options by 'cddr
476 unless (or (eql option :newline) (eql option :replacement))
477 do (return-from get-external-format nil))
478 (binding*
479 (((format-name newline replacement)
480 (values (car external-format)
481 (getf options :newline :lf)
482 (getf options :replacement)))
483 (table-index (get format-name :external-format) :exit-if-null)
484 (formats *external-formats*)
485 (table-entry
486 ;; The table entry can be one of:
487 ;; 1. #<external-format>
488 ;; 2. (#<external-format> ((:crlf #\char) . #<modified-ef>) ...)
489 ;; for a list of modified formats that alter the
490 ;; newline encoding and choice of replacement character.
491 ;; 3. "namestring" - to autoload from a fasl containing
492 ;; the named format.
493 (let ((ef (svref formats table-index)))
494 (etypecase ef
495 ((or instance list) ef)
496 #+nil
497 (string
498 ;; Theoretically allow demand-loading the external-format
499 ;; from a fasl of this name. (Not done yet)
500 ;; (module-provide-contrib ef)
501 (let ((ef (svref formats table-index)))
502 (aver (external-format-p ef))
503 ef)))))
504 ((base-format variations)
505 (if (listp table-entry)
506 (values (car table-entry) (cdr table-entry))
507 (values table-entry nil)))
508 (newline-base-format
509 (if (eql newline :lf)
510 base-format
511 (cdr (assoc (list newline) variations :test #'equal)))
512 :exit-if-null))
513 (unless (typep replacement '(or null character string (unsigned-byte 8) (simple-array (unsigned-byte 8) 1)))
514 (return-from get-external-format nil))
515 (when (or (not newline-base-format) (not replacement))
516 (return-from get-external-format newline-base-format))
517 (loop
518 (let ((key (cons newline replacement)))
519 (awhen (assoc key variations :test #'equal)
520 (return (cdr it)))
521 (let* ((new-ef (let ((copy (copy-structure newline-base-format)))
522 (setf (ef-replacement copy) replacement)
523 copy))
524 (new-table-entry
525 (cons base-format (acons key new-ef variations)))
526 (old (cas (svref formats table-index) table-entry new-table-entry)))
527 (when (eq old table-entry)
528 (return new-ef))
529 ;; CAS failure -> some other thread added an entry. It's probably
530 ;; for the same replacement char which is usually #\ufffd.
531 ;; So try again. At worst this conses some more garbage.
532 (setq table-entry old)))))))
534 (push
535 `("SB-IMPL"
536 char-class char-class2 char-class3
537 ,@(let (macros)
538 (flet ((ends-with-p (s1 s2)
539 (let ((diff (- (length s1) (length s2))))
540 (and (>= diff 0) (string= s1 s2 :start1 diff)))))
541 (do-symbols (s "SB-IMPL" macros)
542 (let ((name (symbol-name s)))
543 (when (and (macro-function s)
544 (eql (mismatch name "DEFINE-") 7)
545 (or (ends-with-p name "->STRING")
546 (ends-with-p name "->STRING*")))
547 (push s macros)))))))
548 *!removable-symbols*)