1 ;;; rfc2047.el --- functions for encoding and decoding rfc2047 messages
3 ;; Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004,
4 ;; 2005, 2006, 2007 Free Software Foundation, Inc.
6 ;; Author: Lars Magne Ingebrigtsen <larsi@gnus.org>
7 ;; MORIOKA Tomohiko <morioka@jaist.ac.jp>
8 ;; This file is part of GNU Emacs.
10 ;; GNU Emacs is free software; you can redistribute it and/or modify
11 ;; it under the terms of the GNU General Public License as published by
12 ;; the Free Software Foundation; either version 3, or (at your option)
15 ;; GNU Emacs is distributed in the hope that it will be useful,
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;; GNU General Public License for more details.
20 ;; You should have received a copy of the GNU General Public License
21 ;; along with GNU Emacs; see the file COPYING. If not, write to the
22 ;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
23 ;; Boston, MA 02110-1301, USA.
27 ;; RFC 2047 is "MIME (Multipurpose Internet Mail Extensions) Part
28 ;; Three: Message Header Extensions for Non-ASCII Text".
34 (defvar message-posting-charset
))
39 ;; Fixme: Avoid this (used for mail-parse-charset) mm dependence on gnus.
41 (require 'rfc2045
) ;; rfc2045-encode-string
42 (autoload 'mm-body-7-or-8
"mm-bodies")
44 (defvar rfc2047-header-encoding-alist
45 '(("Newsgroups" . nil
)
48 ("\\(Resent-\\)?\\(From\\|Cc\\|To\\|Bcc\\|\\(In-\\)?Reply-To\\|Sender\
49 \\|Mail-Followup-To\\|Mail-Copies-To\\|Approved\\)" . address-mime
)
51 "*Header/encoding method alist.
52 The list is traversed sequentially. The keys can either be
57 1) nil, in which case no encoding is done;
58 2) `mime', in which case the header will be encoded according to RFC2047;
59 3) `address-mime', like `mime', but takes account of the rules for address
60 fields (where quoted strings and comments must be treated separately);
61 4) a charset, in which case it will be encoded as that charset;
62 5) `default', in which case the field will be encoded as the rest
65 (defvar rfc2047-charset-encoding-alist
89 "Alist of MIME charsets to RFC2047 encodings.
90 Valid encodings are nil, `Q' and `B'. These indicate binary (no) encoding,
91 quoted-printable and base64 respectively.")
93 (defvar rfc2047-encode-function-alist
94 '((Q . rfc2047-q-encode-string
)
95 (B . rfc2047-b-encode-string
)
97 "Alist of RFC2047 encodings to encoding functions.")
(defvar rfc2047-encode-encoded-words
  t
  "Non-nil means that encoded words are themselves encoded again.
When this is non-nil, the encoder also treats the `=?' marker as
text that needs encoding.")
103 ;;; Functions for encoding RFC2047 messages
106 (defun rfc2047-qp-or-base64 ()
107 "Return the type with which to encode the buffer.
108 This is either `base64' or `quoted-printable'."
110 (let ((limit (min (point-max) (+ 2000 (point-min))))
112 (goto-char (point-min))
113 (skip-chars-forward "\x20-\x7f\r\n\t" limit
)
114 (while (< (point) limit
)
117 (skip-chars-forward "\x20-\x7f\r\n\t" limit
))
118 (if (or (< (* 6 n8bit
) (- limit
(point-min)))
119 ;; Don't base64, say, a short line with a single
120 ;; non-ASCII char when splitting parts by charset.
125 (defun rfc2047-narrow-to-field ()
126 "Narrow the buffer to the header on the current line."
132 (if (re-search-forward "^[^ \n\t]" nil t
)
135 (goto-char (point-min)))
137 (defun rfc2047-field-value ()
138 "Return the value of the field at point."
141 (rfc2047-narrow-to-field)
142 (re-search-forward ":[ \t\n]*" nil t
)
143 (buffer-substring-no-properties (point) (point-max)))))
145 (defun rfc2047-quote-special-characters-in-quoted-strings (&optional
147 "Quote special characters with `\\'s in quoted strings.
148 Quoting will not be done in a quoted string if it contains characters
149 matching ENCODABLE-REGEXP or it is within parentheses."
150 (goto-char (point-min))
151 (let ((tspecials (concat "[" ietf-drums-tspecials
"]"))
154 (with-syntax-table (standard-syntax-table)
158 (eq (char-before) ?\
)))
160 (goto-char (point-max)))
162 (narrow-to-region start
(point))
164 (while (search-forward "\"" nil t
)
165 (setq beg
(match-beginning 0))
166 (unless (eq (char-before beg
) ?
\\)
172 (setq end
(1- (point)))
174 (if (and encodable-regexp
175 (re-search-forward encodable-regexp end t
))
178 (narrow-to-region beg end
)
179 (while (re-search-forward tspecials nil
'move
)
180 (if (eq (char-before) ?
\\)
181 (if (looking-at tspecials
) ;; Already quoted.
184 (goto-char (match-beginning 0))
190 (goto-char (point-max)))
192 (setq start
(point))))))
(defvar rfc2047-encoding-type
  'address-mime
  "Encoding method applied by `rfc2047-encode-region'.
Bind this dynamically around calls to `rfc2047-encode-region'; the
value should be either `mime' or `address-mime'.  The meaning of each
method is described in `rfc2047-header-encoding-alist'.")
200 (defun rfc2047-encode-message-header ()
201 "Encode the message header according to `rfc2047-header-encoding-alist'.
202 Should be called narrowed to the head of the message."
205 (goto-char (point-min))
206 (let (alist elem method
)
209 (rfc2047-narrow-to-field)
211 alist rfc2047-header-encoding-alist
)
212 (while (setq elem
(pop alist
))
213 (when (or (and (stringp (car elem
))
214 (looking-at (car elem
)))
218 (if (not (rfc2047-encodable-p))
220 (when (eq method
'address-mime
)
221 (rfc2047-quote-special-characters-in-quoted-strings))
222 (if (and (eq (mm-body-7-or-8) '8bit
)
225 (car message-posting-charset
)))
226 ;; 8 bit must be decoded.
227 (mm-encode-coding-region
228 (point-min) (point-max)
229 (mm-charset-to-coding-system
230 (car message-posting-charset
))))
231 ;; No encoding necessary, but folding is nice
235 (goto-char (point-min))
236 (skip-chars-forward "^:")
237 (when (looking-at ": ")
241 ;; We found something that may perhaps be encoded.
242 (re-search-forward "^[^:]+: *" nil t
)
244 ((eq method
'address-mime
)
245 (rfc2047-encode-region (point) (point-max)))
247 (let ((rfc2047-encoding-type 'mime
))
248 (rfc2047-encode-region (point) (point-max))))
249 ((eq method
'default
)
250 (if (and (featurep 'mule
)
251 (if (boundp 'default-enable-multibyte-characters
)
252 default-enable-multibyte-characters
)
254 (mm-encode-coding-region (point) (point-max)
255 mail-parse-charset
)))
256 ;; We get this when CC'ing messages to newsgroups with
257 ;; 8-bit names. The group name mail copy just got
258 ;; unconditionally encoded. Previously, it would ask
259 ;; whether to encode, which was quite confusing for the
260 ;; user. If the new behaviour is wrong, tell me. I have
261 ;; left the old code commented out below.
262 ;; -- Per Abrahamsen <abraham@dina.kvl.dk> Date: 2001-10-07.
263 ;; Modified by Dave Love, with the commented-out code changed
264 ;; in accordance with changes elsewhere.
266 (rfc2047-encode-region (point) (point-max)))
268 ;;; (if (or (message-options-get
269 ;;; 'rfc2047-encode-message-header-encode-any)
270 ;;; (message-options-set
271 ;;; 'rfc2047-encode-message-header-encode-any
273 ;;; "Some texts are not encoded. Encode anyway?")))
274 ;;; (rfc2047-encode-region (point-min) (point-max))
275 ;;; (error "Cannot send unencoded text")))
276 ((mm-coding-system-p method
)
277 (if (or (and (featurep 'mule
)
278 (if (boundp 'default-enable-multibyte-characters
)
279 default-enable-multibyte-characters
))
280 (featurep 'file-coding
))
281 (mm-encode-coding-region (point) (point-max) method
)))
284 (goto-char (point-max)))))))
286 ;; Fixme: This, and the require below may not be the Right Thing, but
287 ;; should be safe just before release. -- fx 2001-02-08
(eval-when-compile
  ;; Value-less `defvar': only declares the variable as special so the
  ;; byte compiler does not warn about it; no value is assigned here.
  (defvar message-posting-charset))
290 (defun rfc2047-encodable-p ()
291 "Return non-nil if any characters in current buffer need encoding in headers.
292 The buffer may be narrowed."
293 (require 'message
) ; for message-posting-charset
295 (mm-find-mime-charset-region (point-min) (point-max))))
296 (goto-char (point-min))
297 (or (and rfc2047-encode-encoded-words
299 (search-forward "=?" nil t
)
300 (goto-char (point-min))))
302 (not (equal charsets
(list (car message-posting-charset
))))))))
304 ;; Use this syntax table when parsing into regions that may need
305 ;; encoding. Double quotes are string delimiters, backslash is
306 ;; character quoting, and all other RFC 2822 special characters are
307 ;; treated as punctuation so we can use forward-sexp/forward-word to
308 ;; skip to the end of regions appropriately. Nb. ietf-drums does
309 ;; things differently.
310 (defconst rfc2047-syntax-table
311 ;; (make-char-table 'syntax-table '(2)) only works in Emacs.
312 (let ((table (make-syntax-table)))
313 ;; The following is done to work for setting all elements of the table
314 ;; in Emacs 21 and 22 and XEmacs; it appears to be the cleanest way.
315 ;; Play safe and don't assume the form of the word syntax entry --
317 (if (fboundp 'set-char-table-range
) ; Emacs
318 (funcall (intern "set-char-table-range")
319 table t
(aref (standard-syntax-table) ?a
))
320 (if (fboundp 'put-char-table
)
321 (if (fboundp 'get-char-table
) ; warning avoidance
322 (put-char-table t
(get-char-table ?a
(standard-syntax-table))
324 (modify-syntax-entry ?
\\ "\\" table
)
325 (modify-syntax-entry ?
\" "\"" table
)
326 (modify-syntax-entry ?\
( "(" table
)
327 (modify-syntax-entry ?\
) ")" table
)
328 (modify-syntax-entry ?\
< "." table
)
329 (modify-syntax-entry ?\
> "." table
)
330 (modify-syntax-entry ?\
[ "." table
)
331 (modify-syntax-entry ?\
] "." table
)
332 (modify-syntax-entry ?
: "." table
)
333 (modify-syntax-entry ?\
; "." table)
334 (modify-syntax-entry ?
, "." table
)
335 (modify-syntax-entry ?
@ "." table
)
338 (defun rfc2047-encode-region (b e
)
339 "Encode words in region B to E that need encoding.
340 By default, the region is treated as containing RFC2822 addresses.
341 Dynamically bind `rfc2047-encoding-type' to change that."
343 (narrow-to-region b e
)
344 (let ((encodable-regexp (if rfc2047-encode-encoded-words
345 "[^\000-\177]+\\|=\\?"
347 start
; start of current token
349 ;; Whether there's an encoded word before the current token,
350 ;; either immediately or separated by space.
352 (orig-text (buffer-substring-no-properties b e
)))
353 (if (eq 'mime rfc2047-encoding-type
)
354 ;; Simple case.  Consecutive words that each contain
355 ;; non-ASCII characters are encoded together.  Encoding
356 ;; ASCII words, including `Re:' used in Subject headers, is
357 ;; avoided for interoperability with non-MIME clients and
358 ;; to make it easy to find keywords.
360 (goto-char (point-min))
361 (while (progn (skip-chars-forward " \t\n")
364 (while (and (looking-at "[ \t\n]*\\([^ \t\n]+\\)")
366 (setq end
(match-end 0))
367 (re-search-forward encodable-regexp end t
)))
369 (if (> (point) start
)
370 (rfc2047-encode start
(point))
372 ;; `address-mime' case -- take care of quoted words, comments.
373 (rfc2047-quote-special-characters-in-quoted-strings encodable-regexp
)
374 (with-syntax-table rfc2047-syntax-table
375 (goto-char (point-min))
376 (condition-case err
; in case of unbalanced quotes
377 ;; Look for rfc2822-style: sequences of atoms, quoted
378 ;; strings, specials, whitespace. (Specials mustn't be
382 (skip-chars-forward " \t\n")
385 ((not (char-after))) ; eob
387 ((eq ?
\" (setq csyntax
(char-syntax (char-after))))
391 ;; Does it need encoding?
393 (if (re-search-forward encodable-regexp end
'move
)
394 ;; It needs encoding. Strip the quotes first,
395 ;; since encoded words can't occur in quotes.
398 (delete-backward-char 1)
402 ;; There was a preceding quoted word. We need
403 ;; to include any separating whitespace in this
404 ;; word to avoid it getting lost.
405 (skip-chars-backward " \t")
406 ;; A space is needed between the encoded words.
410 ;; Adjust the end position for the deleted quotes.
411 (rfc2047-encode start
(- end
2))
412 (setq last-encoded t
)) ; record that it was encoded
413 (setq last-encoded nil
)))
415 ;; Skip other delimiters, but record that they've
416 ;; potentially separated quoted words.
418 (setq last-encoded nil
))
420 (error "Unbalanced parentheses"))
422 ;; Look for the end of parentheses.
424 ;; Encode text as an unstructured field.
425 (let ((rfc2047-encoding-type 'mime
))
426 (rfc2047-encode-region (1+ start
) (1- (point))))
427 (skip-chars-forward ")"))
428 (t ; normal token/whitespace sequence
430 ;; Skip one ASCII word, or encode consecutive words
431 ;; that each contain non-ASCII characters.
433 (while (not (or end
(eobp)))
434 (when (looking-at "[\000-\177]+")
438 (while (and (or (re-search-forward
439 "[ \t\n]\\|\\Sw" end
'move
)
441 (eq ?
\\ (char-syntax (char-before))))
442 ;; Skip backslash-quoted characters.
445 (setq end
(match-beginning 0))
446 (if rfc2047-encode-encoded-words
449 (when (search-forward "=?" end
'move
)
450 (goto-char (match-beginning 0))
453 ;; A nil value of `end' here means that text needing
454 ;; encoding may follow point.  Otherwise, point has
455 ;; reached the end of ASCII words that are delimited
456 ;; by whitespace or a special character.
458 (when (looking-at encodable-regexp
)
459 (goto-char (setq begin
(match-end 0)))
460 (while (and (looking-at "[ \t\n]+\\([^ \t\n]+\\)")
461 (setq end
(match-end 0))
463 (while (re-search-forward
464 encodable-regexp end t
))
467 (or (not (re-search-forward "\\Sw" end t
))
469 (goto-char (match-beginning 0))
472 (when (looking-at "[^ \t\n]+")
473 (setq end
(match-end 0))
474 (if (re-search-forward "\\Sw+" end t
)
475 ;; There are special characters better
476 ;; to be encoded so that MTAs may parse
478 (cond ((= end
(point)))
479 ((looking-at (concat "\\sw*\\("
484 (goto-char (1- (match-end 0)))
485 (unless (= (point) (match-beginning 0))
486 ;; Separate encodable text and
490 (skip-chars-forward " \t\n")
491 (if (and (looking-at "[^ \t\n]+")
492 (string-match encodable-regexp
495 (goto-char end
)))))))
496 (skip-chars-backward " \t\n")
499 (if (re-search-forward encodable-regexp end
'move
)
501 (unless (memq (char-before start
) '(nil ?
\t ?
))
504 (skip-chars-backward "^ \t\n")
505 (and (looking-at "\\Sw+")
506 (= (match-end 0) start
)))
507 ;; Also encode bogus delimiters.
509 ;; Separate encodable text and delimiter.
512 (setq start
(1+ start
)
514 (rfc2047-encode start end
)
515 (setq last-encoded t
))
516 (setq last-encoded nil
)))))
518 (if (or debug-on-quit debug-on-error
)
519 (signal (car err
) (cdr err
))
520 (error "Invalid data for rfc2047 encoding: %s"
521 (mm-replace-in-string orig-text
"[ \t\n]+" " "))))))))
522 (rfc2047-fold-region b
(point))
523 (goto-char (point-max))))
525 (defun rfc2047-encode-string (string)
526 "Encode words in STRING.
527 By default, the string is treated as containing addresses (see
528 `rfc2047-encoding-type')."
529 (mm-with-multibyte-buffer
531 (rfc2047-encode-region (point-min) (point-max))
(defvar rfc2047-encode-max-chars
  76
  "Maximum number of characters on a header line containing encoded-words.
A value of nil means that encoded-words are not folded.  A value that
is too small may cause an error.  Do not change this without a good
reason.")
539 (defun rfc2047-encode-1 (column string cs encoder start crest tail
541 "Subroutine used by `rfc2047-encode'."
542 (cond ((string-equal string
"")
544 ((not rfc2047-encode-max-chars
)
546 (funcall encoder
(if cs
547 (mm-encode-coding-string string cs
)
550 ((>= column rfc2047-encode-max-chars
)
552 (cond ((string-match "\n[ \t]+\\'" eword
)
553 ;; Remove a superfluous empty line.
554 (setq eword
(substring eword
0 (match-beginning 0))))
555 ((string-match "(+\\'" eword
)
556 ;; Break the line before the open parenthesis.
557 (setq crest
(concat crest
(match-string 0 eword
))
558 eword
(substring eword
0 (match-beginning 0))))))
559 (rfc2047-encode-1 (length crest
) string cs encoder start
" " tail
560 (concat eword
"\n" crest
)))
563 (limit (1- (length string
)))
568 (setq next
(concat start
571 (mm-encode-coding-string
572 (substring string
0 (1+ index
))
574 (substring string
0 (1+ index
))))
576 len
(+ column
(length next
)))
577 (if (> len rfc2047-encode-max-chars
)
580 (if (or (< index limit
)
581 (<= (+ len
(or (string-match "\n" tail
)
583 rfc2047-encode-max-chars
))
586 (if (string-match "\\`)+" tail
)
587 ;; Break the line after the close parenthesis.
588 (setq tail
(concat (substring tail
0 (match-end 0))
590 (substring tail
(match-end 0)))
596 (concat eword next tail
)
599 (string-match "(+\\'" eword
))
600 (setq crest
(concat crest
(match-string 0 eword
))
601 eword
(substring eword
0 (match-beginning 0)))
602 (setq eword
(concat eword next
)))
604 eword
(concat eword next
)))
605 (when (string-match "\n[ \t]+\\'" eword
)
606 ;; Remove a superfluous empty line.
607 (setq eword
(substring eword
0 (match-beginning 0))))
608 (rfc2047-encode-1 (length crest
) (substring string index
)
609 cs encoder start
" " tail
610 (concat eword
"\n" crest
)))))))
612 (defun rfc2047-encode (b e
)
613 "Encode the word(s) in the region B to E.
614 Point moves to the end of the region."
615 (let ((mime-charset (or (mm-find-mime-charset-region b e
) (list 'us-ascii
)))
616 cs encoding tail crest eword
)
617 (cond ((> (length mime-charset
) 1)
618 (error "Can't rfc2047-encode `%s'"
619 (buffer-substring-no-properties b e
)))
620 ((= (length mime-charset
) 1)
621 (setq mime-charset
(car mime-charset
)
622 cs
(mm-charset-to-coding-system mime-charset
))
623 (unless (and (mm-multibyte-p)
624 (mm-coding-system-p cs
))
627 (narrow-to-region b e
)
629 (or (cdr (assq mime-charset
630 rfc2047-charset-encoding-alist
))
631 ;; For the charsets that don't have a preferred
632 ;; encoding, choose the one that's shorter.
633 (if (eq (rfc2047-qp-or-base64) 'base64
)
638 (skip-chars-forward "^ \t\n")
639 ;; `tail' may contain a close parenthesis.
640 (setq tail
(buffer-substring-no-properties e
(point)))
642 (setq b
(point-marker)
643 e
(set-marker (make-marker) e
))
644 (rfc2047-fold-region (point-at-bol) b
)
646 (skip-chars-backward "^ \t\n")
647 (unless (= 0 (skip-chars-backward " \t"))
648 ;; `crest' may contain whitespace and an open parenthesis.
649 (setq crest
(buffer-substring-no-properties (point) b
)))
650 (setq eword
(rfc2047-encode-1
652 (mm-replace-in-string
653 (buffer-substring-no-properties b e
)
654 "\n\\([ \t]?\\)" "\\1")
656 (or (cdr (assq encoding
657 rfc2047-encode-function-alist
))
659 (concat "=?" (downcase (symbol-name mime-charset
))
660 "?" (upcase (symbol-name encoding
)) "?")
663 (delete-region (if (eq (aref eword
0) ?
\n)
665 ;; The line was folded before encoding.
670 ;; `eword' contains `crest' and `tail'.
674 (unless (or (/= 0 (length tail
))
676 (looking-at "[ \t\n)]"))
681 (defun rfc2047-fold-field ()
682 "Fold the current header field."
685 (rfc2047-narrow-to-field)
686 (rfc2047-fold-region (point-min) (point-max)))))
688 (defun rfc2047-fold-region (b e
)
689 "Fold long lines in region B to E."
691 (narrow-to-region b e
)
692 (goto-char (point-min))
696 (bol (save-restriction
700 (when (and (or break qword-break
)
701 (> (- (point) bol
) 76))
702 (goto-char (or break qword-break
))
705 (skip-chars-backward " \t")
706 (if (looking-at "[ \t]")
709 (setq bol
(1- (point)))
710 ;; Don't break before the first non-LWSP characters.
711 (skip-chars-forward " \t")
715 ((eq (char-after) ?
\n)
720 (skip-chars-forward " \t")
721 (unless (or (eobp) (eq (char-after) ?
\n))
723 ((eq (char-after) ?
\r)
725 ((memq (char-after) '(? ?
\t))
726 (skip-chars-forward " \t")
727 (unless first
;; Don't break just after the header name.
728 (setq break
(point))))
730 (if (not (looking-at "=\\?[^=]"))
731 (if (eq (char-after) ?
=)
733 (skip-chars-forward "^ \t\n\r="))
734 ;; Don't break at the start of the field.
735 (unless (= (point) b
)
736 (setq qword-break
(point)))
737 (skip-chars-forward "^ \t\n\r")))
739 (skip-chars-forward "^ \t\n\r")))
741 (when (and (or break qword-break
)
742 (> (- (point) bol
) 76))
743 (goto-char (or break qword-break
))
746 (if (or (> 0 (skip-chars-backward " \t"))
747 (looking-at "[ \t]"))
750 (setq bol
(1- (point)))
751 ;; Don't break before the first non-LWSP characters.
752 (skip-chars-forward " \t")
754 (forward-char 1))))))
756 (defun rfc2047-unfold-field ()
757 "Unfold the current header field."
760 (rfc2047-narrow-to-field)
761 (rfc2047-unfold-region (point-min) (point-max)))))
763 (defun rfc2047-unfold-region (b e
)
764 "Unfold lines in region B to E."
766 (narrow-to-region b e
)
767 (goto-char (point-min))
768 (let ((bol (save-restriction
771 (eol (point-at-eol)))
774 (if (and (looking-at "[ \t]")
775 (< (- (point-at-eol) bol
) 76))
776 (delete-region eol
(progn
778 (skip-chars-forward "\r\n")
780 (setq bol
(point-at-bol)))
781 (setq eol
(point-at-eol))
(defun rfc2047-b-encode-string (string)
  "Return the base64 encoding of the header text in STRING.
The result is produced by `base64-encode-string' with its
NO-LINE-BREAK argument set, so no newlines are inserted."
  ;; The trailing t is NO-LINE-BREAK: keep the encoded text on one line.
  (base64-encode-string string
                        t))
788 (defun rfc2047-q-encode-string (string)
789 "Quoted-printable-encode the header in STRING."
790 (mm-with-unibyte-buffer
792 (quoted-printable-encode-region
793 (point-min) (point-max) nil
794 ;; = (\075), _ (\137), ? (\077) are used in the encoded word.
795 ;; Avoid using 8bit characters.
796 ;; This list excludes `especials' (see the RFC2047 syntax),
797 ;; meaning that some characters in non-structured fields will
798 ;; get encoded when they don't need to be. The following is
799 ;; what it used to be.
800 ;;; ;; Equivalent to "^\000-\007\011\013\015-\037\200-\377=_?"
801 ;;; "\010\012\014\040-\074\076\100-\136\140-\177")
802 "-\b\n\f !#-'*+0-9A-Z\\^`-~\d")
803 (subst-char-in-region (point-min) (point-max) ? ?_
)
806 (defun rfc2047-encode-parameter (param value
)
807 "Return a PARAM=VALUE string encoded in the RFC2047-like style.
808 This is a replacement for the `rfc2231-encode-string' function.
810 When attaching files as MIME parts, we should use the RFC2231 encoding
811 to specify the file names containing non-ASCII characters. However,
812 many mail softwares don't support it in practice and recipients won't
813 be able to extract files with correct names. Instead, the RFC2047-like
814 encoding is acceptable generally. This function provides the very
815 RFC2047-like encoding, resigning to such a regrettable trend. To use
816 it, put the following line in your ~/.gnus.el file:
818 \(defalias 'mail-header-encode-parameter 'rfc2047-encode-parameter)
820 (let ((rfc2047-encoding-type 'mime
)
821 (rfc2047-encode-max-chars nil
))
822 (rfc2045-encode-string param
(rfc2047-encode-string value
))))
825 ;;; Functions for decoding RFC2047 messages
829 (defconst rfc2047-encoded-word-regexp
830 "=\\?\\([^][\000-\040()<>@,\;:*\\\"/?.=]+\\)\\(?:\\*[^?]+\\)?\
831 \\?\\(B\\|Q\\)\\?\\([!->@-~ ]*\\)\\?="))
(defvar rfc2047-quote-decoded-words-containing-tspecials
  nil
  "Non-nil means decoded words that contain special characters are quoted.")
836 (defvar rfc2047-allow-incomplete-encoded-text t
837 "*Non-nil means allow incomplete encoded-text in successive encoded-words.
838 Dividing of encoded-text in the place other than character boundaries
839 violates RFC2047 section 5, while we have a capability to decode it.
840 If it is non-nil, the decoder will decode B- or Q-encoding in each
841 encoded-word, concatenate them, and decode it by charset. Otherwise,
842 the decoder will fully decode each encoded-word before concatenating
845 (defun rfc2047-strip-backslashes-in-quoted-strings ()
846 "Strip backslashes in quoted strings. `\\\"' remains."
847 (goto-char (point-min))
849 (with-syntax-table (standard-syntax-table)
850 (while (search-forward "\"" nil t
)
851 (unless (eq (char-before) ?
\\)
852 (setq beg
(match-end 0))
853 (goto-char (match-beginning 0))
858 (narrow-to-region beg
(1- (point)))
860 (while (search-forward "\\" nil
'move
)
861 (unless (memq (char-after) '(?
\"))
862 (delete-backward-char 1))
866 (goto-char beg
))))))))
868 (defun rfc2047-charset-to-coding-system (charset)
869 "Return coding-system corresponding to MIME CHARSET.
870 If your Emacs implementation can't decode CHARSET, return nil."
871 (when (stringp charset
)
872 (setq charset
(intern (downcase charset
))))
873 (when (or (not charset
)
874 (eq 'gnus-all mail-parse-ignored-charsets
)
875 (memq 'gnus-all mail-parse-ignored-charsets
)
876 (memq charset mail-parse-ignored-charsets
))
877 (setq charset mail-parse-charset
))
878 (let ((cs (mm-charset-to-coding-system charset
)))
879 (cond ((eq cs
'ascii
)
880 (setq cs
(or (mm-charset-to-coding-system mail-parse-charset
)
882 ((mm-coding-system-p cs
))
884 (listp mail-parse-ignored-charsets
)
885 (memq 'gnus-unknown mail-parse-ignored-charsets
))
886 (setq cs
(mm-charset-to-coding-system mail-parse-charset
))))
891 (defun rfc2047-decode-encoded-words (words)
892 "Decode successive encoded-words in WORDS and return a decoded string.
893 Each element of WORDS looks like (CHARSET ENCODING ENCODED-TEXT
895 (let (word charset cs encoding text rest
)
897 (setq word
(pop words
))
898 (if (and (setq cs
(rfc2047-charset-to-coding-system
899 (setq charset
(car word
))))
901 (cond ((char-equal ?B
(nth 1 word
))
902 (setq text
(base64-decode-string
903 (rfc2047-pad-base64 (nth 2 word
)))))
904 ((char-equal ?Q
(nth 1 word
))
905 (setq text
(quoted-printable-decode-string
906 (mm-subst-char-in-string
907 ?_ ?
(nth 2 word
) t
)))))
909 (message "%s" (error-message-string code
))
911 (if (and rfc2047-allow-incomplete-encoded-text
913 ;; Concatenate text of which the charset is the same.
914 (setcdr (car rest
) (concat (cdar rest
) text
))
915 (push (cons cs text
) rest
))
916 ;; Don't decode encoded-word.
917 (push (cons nil
(nth 3 word
)) rest
)))
920 (or (and (setq cs
(caar rest
))
922 (mm-decode-coding-string (cdar rest
) cs
)
924 (message "%s" (error-message-string code
))
926 (concat (when (cdr rest
) " ")
929 (not (eq (string-to-char words
) ?
)))
935 ;; Fixme: This should decode in place, not cons intermediate strings.
936 ;; Also check whether it needs to worry about delimiting fields like
939 ;; In fact it's reported that (invalid) encoding of mailboxes in
940 ;; addr-specs is in use, so delimiting fields might help. Probably
941 ;; not decoding a word which isn't properly delimited is good enough
942 ;; and worthwhile (is it more correct or not?), e.g. something like
943 ;; `=?iso-8859-1?q?foo?=@'.
945 (defun rfc2047-decode-region (start end
&optional address-mime
)
946 "Decode MIME-encoded words in region between START and END.
947 If ADDRESS-MIME is non-nil, strip backslashes which precede characters
948 other than `\"' and `\\' in quoted strings."
950 (let ((case-fold-search t
)
951 (eword-regexp (eval-when-compile
952 ;; Ignore whitespace between encoded-words.
953 (concat "[\n\t ]*\\(" rfc2047-encoded-word-regexp
958 (narrow-to-region start end
)
960 (rfc2047-strip-backslashes-in-quoted-strings))
961 (goto-char (setq b start
))
962 ;; Look for the encoded-words.
963 (while (setq match
(re-search-forward eword-regexp nil t
))
964 (setq e
(match-beginning 1)
968 (push (list (match-string 2) ;; charset
969 (char-after (match-beginning 3)) ;; encoding
970 (match-string 4) ;; encoded-text
971 (match-string 1)) ;; encoded-word
973 ;; Look for the subsequent encoded-words.
974 (when (setq match
(looking-at eword-regexp
))
975 (goto-char (setq end
(match-end 0)))))
976 ;; Replace the encoded-words with the decoded one.
977 (delete-region e end
)
978 (insert (rfc2047-decode-encoded-words (nreverse words
)))
980 (narrow-to-region e
(point))
982 ;; Remove newlines between decoded words, though such
983 ;; things essentially must not be there.
984 (while (re-search-forward "[\n\r]+" nil t
)
986 ;; Quote decoded words if there are special characters
987 ;; which might violate RFC2822.
988 (when (and rfc2047-quote-decoded-words-containing-tspecials
989 (let ((regexp (car (rassq
991 rfc2047-header-encoding-alist
))))
996 (while (and (memq (char-after) '(? ?
\t))
997 (zerop (forward-line -
1))))
998 (looking-at regexp
)))))
1001 (skip-chars-forward " \t")
1002 (setq start
(point))
1003 (setq quoted
(eq (char-after) ?
\"))
1004 (goto-char (point-max))
1005 (skip-chars-backward " \t")
1006 (if (setq quoted
(and quoted
1007 (> (point) (1+ start
))
1008 (eq (char-before) ?
\")))
1011 (setq start
(1+ start
)
1012 end
(point-marker)))
1013 (setq end
(point-marker)))
1015 (while (search-forward "\"" end t
)
1018 (zerop (%
(skip-chars-backward "\\\\") 2))
1019 (goto-char (match-beginning 0)))
1022 (when (and (not quoted
)
1026 (concat "[" ietf-drums-tspecials
"]")
1032 (set-marker end nil
)))
1033 (goto-char (point-max)))
1034 (when (and (mm-multibyte-p)
1036 (not (eq mail-parse-charset
'us-ascii
))
1037 (not (eq mail-parse-charset
'gnus-decoded
)))
1038 (mm-decode-coding-region b e mail-parse-charset
))
1040 (when (and (mm-multibyte-p)
1042 (not (eq mail-parse-charset
'us-ascii
))
1043 (not (eq mail-parse-charset
'gnus-decoded
)))
1044 (mm-decode-coding-region b
(point-max) mail-parse-charset
))))))
(defun rfc2047-decode-address-region (start end)
  "Decode the MIME-encoded words found in the region from START to END.
This calls `rfc2047-decode-region' with a non-nil ADDRESS-MIME
argument, which asks it to strip backslashes preceding characters
other than `\"' and `\\' inside quoted strings."
  ;; The trailing t is the ADDRESS-MIME flag.
  (rfc2047-decode-region start
                         end
                         t))
1052 (defun rfc2047-decode-string (string &optional address-mime
)
1053 "Decode MIME-encoded STRING and return the result.
1054 If ADDRESS-MIME is non-nil, strip backslashes which precede characters
1055 other than `\"' and `\\' in quoted strings."
1056 (let ((m (mm-multibyte-p)))
1057 (if (string-match "=\\?" string
)
1059 ;; Fixme: This logic is wrong, but seems to be required by
1060 ;; Gnus summary buffer generation. The value of `m' depends
1061 ;; on the current buffer, not global multibyteness or that
1062 ;; of the string. Also the string returned should always be
1063 ;; multibyte in a multibyte session, i.e. the buffer should
1064 ;; be multibyte before `buffer-string' is called.
1066 (mm-enable-multibyte))
1069 (rfc2047-decode-region (point-min) (point-max) address-mime
))
1074 (when (mm-multibyte-string-p string
)
1075 (mm-enable-multibyte))
1077 (rfc2047-strip-backslashes-in-quoted-strings)
1079 ;; Fixme: As above, `m' here is inappropriate.
1082 (not (eq mail-parse-charset
'us-ascii
))
1083 (not (eq mail-parse-charset
'gnus-decoded
)))
1084 ;; `decode-coding-string' in Emacs offers a third optional
1085 ;; arg NOCOPY to avoid consing a new string if the decoding
1086 ;; is "trivial". Unfortunately it currently doesn't
1087 ;; consider anything else than a `nil' coding system
1089 ;; `rfc2047-decode-string' is called multiple times for each
1090 ;; article during summary buffer generation, and we really
1091 ;; want to avoid unnecessary consing. So we bypass
1092 ;; `decode-coding-string' if the string is purely ASCII.
1093 (if (and (fboundp 'detect-coding-string
)
1094 ;; string is purely ASCII
1095 (eq (detect-coding-string string t
) 'undecided
))
1097 (mm-decode-coding-string string mail-parse-charset
))
1098 (mm-string-as-multibyte string
)))))
(defun rfc2047-decode-address-string (string)
  "Return STRING with its MIME-encoded words decoded.
This calls `rfc2047-decode-string' with a non-nil ADDRESS-MIME
argument, which asks it to strip backslashes preceding characters
other than `\"' and `\\' inside quoted strings."
  ;; The trailing t is the ADDRESS-MIME flag.
  (rfc2047-decode-string string
                         t))
1106 (defun rfc2047-pad-base64 (string)
1107 "Pad STRING to quartets."
1108 ;; Be more liberal to accept buggy base64 strings. If
1109 ;; base64-decode-string accepts buggy strings, this function could
1110 ;; be aliased to identity.
1111 (if (= 0 (mod (length string
) 4))
1113 (when (string-match "=+$" string
)
1114 (setq string
(substring string
0 (match-beginning 0))))
1115 (case (mod (length string
) 4)
1117 (1 string
) ;; Error, don't pad it.
1118 (2 (concat string
"=="))
1119 (3 (concat string
"=")))))
1123 ;;; arch-tag: a07fe3d4-22b5-4c4a-bd89-b1f82d5d36f6
1124 ;;; rfc2047.el ends here