Fix FORMAT compilation on non-simple strings.
[sbcl.git] / src / code / octets.lisp
blob7d6dcf838dbd9712fd9337d5760b3c90f72876b9
1 ;;;; code for string to octet conversion
3 ;;;; This software is part of the SBCL system. See the README file for
4 ;;;; more information.
5 ;;;;
6 ;;;; This software is derived from the CMU CL system, which was
7 ;;;; written at Carnegie Mellon University and released into the
8 ;;;; public domain. The software is in the public domain and is
9 ;;;; provided with absolutely no warranty. See the COPYING and CREDITS
10 ;;;; files for more information.
12 ;;; FIXME: The latin9 stuff is currently #!+sb-unicode, because I
13 ;;; don't like the idea of trying to do CODE-CHAR #x<big>. Is that a
14 ;;; justified fear? Can we arrange that it's caught and converted to
15 ;;; a decoding error error? Or should we just give up on non-Unicode
16 ;;; builds?
18 (in-package "SB!IMPL")
21 ;;;; conditions
23 ;;; encoding condition
25 (define-condition octets-encoding-error (character-encoding-error)
26 ((string :initarg :string :reader octets-encoding-error-string)
27 (position :initarg :position :reader octets-encoding-error-position)
28 (external-format :initarg :external-format
29 :reader octets-encoding-error-external-format))
30 (:report (lambda (c s)
31 (format s "Unable to encode character ~A as ~S."
32 (char-code (char (octets-encoding-error-string c)
33 (octets-encoding-error-position c)))
34 (octets-encoding-error-external-format c)))))
36 (declaim (ftype (sfunction (t t t) (simple-array (unsigned-byte 8) 1))
37 encoding-error))
38 (defun encoding-error (external-format string pos)
39 (restart-case
40 (error 'octets-encoding-error
41 :external-format external-format
42 :string string
43 :position pos)
44 (use-value (replacement)
45 :report "Supply a set of bytes to use in place of the invalid one."
46 :interactive
47 (lambda ()
48 (read-evaluated-form
49 "Replacement byte, bytes, character, or string (evaluated): "))
50 (typecase replacement
51 ((unsigned-byte 8)
52 (make-array 1 :element-type '(unsigned-byte 8) :initial-element replacement))
53 (character
54 (string-to-octets (string replacement)
55 :external-format external-format))
56 (string
57 (string-to-octets replacement
58 :external-format external-format))
60 (coerce replacement '(simple-array (unsigned-byte 8) (*))))))))
62 ;;; decoding condition
64 ;;; for UTF8, the specific condition signalled will be a generalized
65 ;;; instance of one of the following:
66 ;;;
67 ;;; end-of-input-in-character
68 ;;; character-out-of-range
69 ;;; invalid-utf8-starter-byte
70 ;;; invalid-utf8-continuation-byte
71 ;;;
72 ;;; Of these, the only one truly likely to be of interest to calling
73 ;;; code is end-of-input-in-character (in which case it's likely to
74 ;;; want to make a note of octet-decoding-error-start, supply "" as a
75 ;;; replacement string, and then move that last chunk of bytes to the
76 ;;; beginning of its buffer for the next go round) but they're all
77 ;;; provided on the off chance they're of interest.
79 (define-condition octet-decoding-error (character-decoding-error)
80 ((array :initarg :array :accessor octet-decoding-error-array)
81 (start :initarg :start :accessor octet-decoding-error-start)
82 (end :initarg :end :accessor octet-decoding-error-end)
83 (position :initarg :pos :accessor octet-decoding-bad-byte-position)
84 (external-format :initarg :external-format
85 :accessor octet-decoding-error-external-format))
86 (:report
87 (lambda (condition stream)
88 (format stream "Illegal ~S character starting at byte position ~D."
89 (octet-decoding-error-external-format condition)
90 (octet-decoding-error-start condition)))))
92 (define-condition end-of-input-in-character (octet-decoding-error) ())
93 (define-condition character-out-of-range (octet-decoding-error) ())
94 (define-condition invalid-utf8-starter-byte (octet-decoding-error) ())
95 (define-condition invalid-utf8-continuation-byte (octet-decoding-error) ())
96 (define-condition overlong-utf8-sequence (octet-decoding-error) ())
98 (define-condition malformed-ascii (octet-decoding-error) ())
100 (defun decoding-error (array start end external-format reason pos)
101 (restart-case
102 (error reason
103 :external-format external-format
104 :array array
105 :start start
106 :end end
107 :pos pos)
108 (use-value (s)
109 :report "Supply a replacement string designator."
110 :interactive
111 (lambda ()
112 (read-evaluated-form
113 "Enter a replacement string designator (evaluated): "))
114 (string s))))
116 ;;; Utilities used in both to-string and to-octet conversions
118 (defmacro instantiate-octets-definition (definer)
119 `(progn
120 (,definer aref (simple-array (unsigned-byte 8) (*)))
121 (,definer sap-ref-8 system-area-pointer)))
123 ;;; FIXME: find out why the comment about SYMBOLICATE below is true
124 ;;; and fix it, or else replace with SYMBOLICATE.
126 ;;; FIXME: this is cute, but is going to prevent greps for def.*<name>
127 ;;; from working for (defun ,(make-od-name ...) ...)
128 (eval-when (:compile-toplevel :load-toplevel :execute)
129 (defun make-od-name (sym1 sym2)
130 ;; "MAKE-NAME" is too generic, but this doesn't do quite what
131 ;; SYMBOLICATE does; MAKE-OD-NAME ("octets definition") it is
132 ;; then.
133 (intern (concatenate 'string (symbol-name sym1) "-" (symbol-name sym2))
134 (symbol-package sym1))))
136 ;;;; to-octets conversions
138 ;;; to latin (including ascii)
140 ;;; Converting bytes to character codes is easy: just use a 256-element
141 ;;; lookup table that maps each possible byte to its corresponding
142 ;;; character code.
144 ;;; Converting character codes to bytes is a little harder, since the
145 ;;; codes may be spare (e.g. we use codes 0-127, 3490, and 4598). The
146 ;;; previous version of this macro utilized a gigantic CASE expression
147 ;;; to do the hard work, with the result that the code was huge (since
148 ;;; SBCL's then-current compilation strategy for CASE expressions was
149 ;;; (and still is) converting CASE into COND into if-the-elses--which is
150 ;;; also inefficient unless your code happens to occur very early in the
151 ;;; chain.
153 ;;; The current strategy is to build a table:
155 ;;; [ ... code_1 byte_1 code_2 byte_2 ... code_n byte_n ... ]
157 ;;; such that the codes are sorted in order from lowest to highest. We
158 ;;; can then binary search the table to discover the appropriate byte
159 ;;; for a character code. We also implement an optimization: all unibyte
160 ;;; mappings do not remap ASCII (0-127) and some do not remap part of
161 ;;; the range beyond character code 127. So we check to see if the
162 ;;; character code falls into that range first (a quick check, since
163 ;;; character codes are guaranteed to be positive) and then do the binary
164 ;;; search if not. This optimization also enables us to cut down on the
165 ;;; size of our lookup table.
166 (defmacro define-unibyte-mapper (byte-char-name code-byte-name &rest exceptions)
167 (let* (;; Build a list of (CODE BYTE) pairs
168 (pairs (loop for byte below 256
169 for code = (let ((exception (cdr (assoc byte exceptions))))
170 (cond
171 ((car exception) (car exception))
172 ((null exception) byte)
173 (t nil)))
174 when code collect (list code byte) into elements
175 finally (return elements)))
176 ;; Find the smallest character code such that the corresponding
177 ;; byte is != to the code.
178 (lowest-non-equivalent-code
179 (caar (sort (copy-seq exceptions) #'< :key #'car)))
180 ;; Sort them for our lookup table.
181 (sorted-pairs (sort (subseq pairs lowest-non-equivalent-code)
182 #'< :key #'car))
183 ;; Create the lookup table.
184 (sorted-lookup-table
185 (reduce #'append sorted-pairs :from-end t :initial-value nil)))
186 (flet ((pick-type (vector &optional missing-points-p)
187 (if missing-points-p
188 (let ((max (reduce #'max (remove nil vector))))
189 (cond ((<= max #x7F) '(signed-byte 8))
190 ((<= max #x7FFF) '(signed-byte 16))
191 (t '(signed-byte 32))))
192 (let ((max (reduce #'max vector)))
193 (cond ((<= max #xFF) '(unsigned-byte 8))
194 ((<= max #xFFFF) '(unsigned-byte 16))
195 (t '(unsigned-byte 32)))))))
196 `(progn
197 ;; We *could* inline this, but it's not obviously the right thing,
198 ;; because each use of the inlined function in a different file
199 ;; would be forced to dump the large-ish array. To do things like
200 ;; this, you generally want a load-time ref to a global constant.
201 ;(declaim (inline ,byte-char-name))
202 (defun ,byte-char-name (byte)
203 (declare (optimize speed (safety 0))
204 (type (unsigned-byte 8) byte))
205 ,(let ((byte-to-code
206 (loop for byte below 256
207 collect (let ((exception (cdr (assoc byte exceptions))))
208 (if exception
209 (car exception)
210 byte)))))
211 (if (position nil byte-to-code)
212 ;; There are bytes with no translation. Represent "missing"
213 ;; as -1 when stored, convert to NIL when accessed.
214 ;; We could use a single otherwise-unused point to mean NIL,
215 ;; but it would be confusing if in one table #xFFFF represents
216 ;; NIL and another #xF00D represents NIL.
217 `(let ((code (aref ,(!make-specialized-array
218 256 (pick-type byte-to-code t)
219 (substitute -1 nil byte-to-code))
220 byte)))
221 (if (>= code 0) code))
222 ;; Every byte has a translation
223 `(aref ,(!make-specialized-array
224 256 (pick-type byte-to-code) byte-to-code)
225 byte))))
226 (defun ,code-byte-name (code)
227 (declare (optimize speed (safety 0))
228 (type char-code code))
229 (if (< code ,lowest-non-equivalent-code)
230 code
231 (loop with code-to-byte-table =
232 ,(!make-specialized-array
233 (length sorted-lookup-table)
234 (pick-type sorted-lookup-table)
235 sorted-lookup-table)
236 with low = 0
237 with high = (- (length code-to-byte-table) 2)
238 while (< low high)
239 do (let ((mid (logandc2 (truncate (+ low high 2) 2) 1)))
240 (if (< code (aref code-to-byte-table mid))
241 (setf high (- mid 2))
242 (setf low mid)))
243 finally (return (if (eql code (aref code-to-byte-table low))
244 (aref code-to-byte-table (1+ low))
245 nil)))))))))
247 (declaim (inline get-latin-bytes))
248 (defun get-latin-bytes (mapper external-format string pos)
249 (let ((code (funcall mapper (char-code (char string pos)))))
250 (declare (type (or null char-code) code))
251 (values (cond
252 ((and code (< code 256)) code)
254 (encoding-error external-format string pos)))
255 1)))
257 (declaim (inline string->latin%))
258 (defun string->latin% (string sstart send get-bytes null-padding)
259 (declare (optimize speed)
260 (type simple-string string)
261 (type index sstart send)
262 (type (integer 0 1) null-padding)
263 (type function get-bytes))
264 ;; The latin encodings are all unibyte encodings, so just directly
265 ;; compute the number of octets we're going to generate.
266 (let ((octets (make-array (+ (- send sstart) null-padding)
267 ;; This takes care of any null padding the
268 ;; caller requests.
269 :initial-element 0
270 :element-type '(unsigned-byte 8)))
271 (index 0)
272 (error-position 0)
273 (error-replacement))
274 (tagbody
275 :no-error
276 (loop for pos of-type index from sstart below send
277 do (let ((byte (funcall get-bytes string pos)))
278 (typecase byte
279 ((unsigned-byte 8)
280 (locally (declare (optimize (sb!c::insert-array-bounds-checks 0)))
281 (setf (aref octets index) byte)))
282 ((simple-array (unsigned-byte 8) (*))
283 ;; KLUDGE: We ran into encoding errors. Bail and do
284 ;; things the slow way (does anybody actually use this
285 ;; functionality besides our own test suite?).
286 (setf error-position pos error-replacement byte)
287 (go :error)))
288 (incf index))
289 finally (return-from string->latin% octets))
290 :error
291 ;; We have encoded INDEX octets so far and we ran into an
292 ;; encoding error at ERROR-POSITION; the user has asked us to
293 ;; replace the expected output with ERROR-REPLACEMENT.
294 (let ((new-octets (make-array (* index 2)
295 :element-type '(unsigned-byte 8)
296 :adjustable t :fill-pointer index)))
297 (replace new-octets octets)
298 (flet ((extend (thing)
299 (typecase thing
300 ((unsigned-byte 8) (vector-push-extend thing new-octets))
301 ((simple-array (unsigned-byte 8) (*))
302 (dotimes (i (length thing))
303 (vector-push-extend (aref thing i) new-octets))))))
304 (extend error-replacement)
305 (loop for pos of-type index from (1+ error-position) below send
306 do (extend (funcall get-bytes string pos))
307 finally (return-from string->latin%
308 (progn
309 (unless (zerop null-padding)
310 (vector-push-extend 0 new-octets))
311 (copy-seq new-octets)))))))))
313 ;;;; to-string conversions
315 ;;; from latin (including ascii)
317 (defmacro define-latin->string* (accessor type)
318 (let ((name (make-od-name 'latin->string* accessor)))
319 `(progn
320 (declaim (inline ,name))
321 (defun ,name (string sstart send array astart aend mapper)
322 (declare (optimize speed (safety 0))
323 (type simple-string string)
324 (type ,type array)
325 (type array-range sstart send astart aend)
326 (function mapper))
327 (loop for spos from sstart below send
328 for apos from astart below aend
329 do (setf (char string spos)
330 (code-char (funcall mapper (,accessor array apos))))
331 finally (return (values string spos apos)))))))
332 (instantiate-octets-definition define-latin->string*)
334 (defmacro define-latin->string (accessor type)
335 (let ((name (make-od-name 'latin->string accessor)))
336 `(progn
337 (declaim (inline ,name))
338 (defun ,name (array astart aend mapper)
339 (declare (optimize speed (safety 0))
340 (type ,type array)
341 (type array-range astart aend)
342 (type function mapper))
343 (let ((length (the array-range (- aend astart))))
344 (values (,(make-od-name 'latin->string* accessor) (make-string length) 0 length
345 array astart aend
346 mapper)))))))
347 (instantiate-octets-definition define-latin->string)
349 ;;;; external formats
351 (defvar *default-external-format* nil)
353 (defun default-external-format ()
354 (or *default-external-format*
355 ;; On non-unicode, use iso-8859-1 instead of detecting it from
356 ;; the locale settings. Defaulting to an external-format which
357 ;; can represent characters that the CHARACTER type can't
358 ;; doesn't seem very sensible.
359 #!-sb-unicode
360 (setf *default-external-format* :latin-1)
361 (let ((external-format #!-win32 (intern (or #!-android
362 (alien-funcall
363 (extern-alien
364 "nl_langinfo"
365 (function (c-string :external-format :latin-1)
366 int))
367 sb!unix:codeset)
368 "LATIN-1")
369 "KEYWORD")
370 #!+win32 (sb!win32::ansi-codepage)))
371 (let ((entry (get-external-format external-format)))
372 (cond
373 (entry
374 (/show0 "matched"))
376 ;; FIXME! This WARN would try to do printing
377 ;; before the streams have been initialized,
378 ;; causing an infinite erroring loop. We should
379 ;; either print it by calling to C, or delay the
380 ;; warning until later. Since we're in freeze
381 ;; right now, and the warning isn't really
382 ;; essential, I'm doing what's least likely to
383 ;; cause damage, and commenting it out. This
384 ;; should be revisited after 0.9.17. -- JES,
385 ;; 2006-09-21
386 #+nil
387 (warn "Invalid external-format ~A; using LATIN-1"
388 external-format)
389 (setf external-format :latin-1))))
390 (/show0 "/default external format ok")
391 (setf *default-external-format* external-format))))
393 ;;;; public interface
395 (defun maybe-defaulted-external-format (external-format)
396 (get-external-format-or-lose (if (eq external-format :default)
397 (default-external-format)
398 external-format)))
400 (defun octets-to-string (vector &key (external-format :default) (start 0) end)
401 (declare (type (vector (unsigned-byte 8)) vector))
402 (with-array-data ((vector vector)
403 (start start)
404 (end end)
405 :check-fill-pointer t)
406 (declare (type (simple-array (unsigned-byte 8) (*)) vector))
407 (let ((ef (maybe-defaulted-external-format external-format)))
408 (funcall (ef-octets-to-string-fun ef) vector start end))))
410 (defun string-to-octets (string &key (external-format :default)
411 (start 0) end null-terminate)
412 (declare (type string string))
413 (with-array-data ((string string)
414 (start start)
415 (end end)
416 :check-fill-pointer t)
417 (declare (type simple-string string))
418 (let ((ef (maybe-defaulted-external-format external-format)))
419 (funcall (ef-string-to-octets-fun ef) string start end
420 (if null-terminate 1 0)))))
422 #!+sb-unicode
423 (defvar +unicode-replacement-character+ (string (code-char #xfffd)))
424 #!+sb-unicode
425 (defun use-unicode-replacement-char (condition)
426 (use-value +unicode-replacement-character+ condition))
428 ;;; Utilities that maybe should be exported
430 #!+sb-unicode
431 (defmacro with-standard-replacement-character (&body body)
432 `(handler-bind ((octet-encoding-error #'use-unicode-replacement-char))
433 ,@body))
435 (defmacro with-default-decoding-replacement ((c) &body body)
436 (let ((cname (gensym)))
437 `(let ((,cname ,c))
438 (handler-bind
439 ((octet-decoding-error (lambda (c)
440 (use-value ,cname c))))
441 ,@body))))
443 ;;; This function was moved from 'fd-stream' because it depends on
444 ;;; the various error classes, two of which are defined just above.
445 (defun get-external-format (external-format)
446 (flet ((keyword-external-format (keyword)
447 (declare (type keyword keyword))
448 (gethash keyword *external-formats*))
449 (replacement-handlerify (entry replacement)
450 (when entry
451 (wrap-external-format-functions
452 entry
453 (lambda (fun)
454 (and fun
455 (lambda (&rest rest)
456 (declare (dynamic-extent rest))
457 (handler-bind
458 ((stream-decoding-error
459 (lambda (c)
460 (declare (ignore c))
461 (invoke-restart 'input-replacement replacement)))
462 (stream-encoding-error
463 (lambda (c)
464 (declare (ignore c))
465 (invoke-restart 'output-replacement replacement)))
466 (octets-encoding-error
467 (lambda (c) (use-value replacement c)))
468 (octet-decoding-error
469 (lambda (c) (use-value replacement c))))
470 (apply fun rest)))))))))
471 (typecase external-format
472 (keyword (keyword-external-format external-format))
473 ((cons keyword)
474 (let ((entry (keyword-external-format (car external-format)))
475 (replacement (getf (cdr external-format) :replacement)))
476 (if replacement
477 (replacement-handlerify entry replacement)
478 entry))))))
480 (push
481 `("SB-IMPL"
482 char-class char-class2 char-class3
483 ,@(let (macros)
484 (flet ((ends-with-p (s1 s2)
485 (let ((diff (- (length s1) (length s2))))
486 (and (>= diff 0) (string= s1 s2 :start1 diff)))))
487 (do-symbols (s "SB-IMPL" macros)
488 (let ((name (symbol-name s)))
489 (when (and (macro-function s)
490 (eql (mismatch name "DEFINE-") 7)
491 (or (ends-with-p name "->STRING")
492 (ends-with-p name "->STRING*")))
493 (push s macros)))))))
494 sb!impl::*!removable-symbols*)