1 ;;; mail-extr.el --- extract full name and address from RFC 822 mail header.
3 ;; Copyright (C) 1992 Free Software Foundation, Inc.
5 ;; Author: Joe Wells <jbw@cs.bu.edu>
10 ;; This file is part of GNU Emacs.
12 ;; GNU Emacs is free software; you can redistribute it and/or modify
13 ;; it under the terms of the GNU General Public License as published by
14 ;; the Free Software Foundation; either version 1, or (at your option)
17 ;; GNU Emacs is distributed in the hope that it will be useful,
18 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
19 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 ;; GNU General Public License for more details.
22 ;; You should have received a copy of the GNU General Public License
23 ;; along with GNU Emacs; see the file COPYING. If not, write to
24 ;; the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
28 ;; Here is `mail-extr', a package for extracting full names and canonical
29 ;; addresses from RFC 822 mail headers. It is intended to be hooked into
30 ;; other Emacs Lisp packages that deal with RFC 822 format messages, such as
31 ;; Gnews, GNUS, RMAIL, MH-E, BBDB, VM, Supercite, etc. Thus, this release is
32 ;; mainly for Emacs Lisp developers.
34 ;; There are two main benefits:
36 ;; 1. Higher probability of getting the correct full name for a human than
37 ;; any other package I know of. (On the other hand, it will cheerfully
38 ;; mangle non-human names/comments.)
39 ;; 2. Address part is put in a canonical form.
41 ;; The interface is not yet carved in stone; please give me suggestions.
43 ;; I have an extensive test-case collection of funny addresses if you want to
44 ;; work with the code. Developing this code requires frequent testing to
45 ;; make sure you're not breaking functionality. I'm not posting the
46 ;; test-cases because they take over 100K.
48 ;; If you find an address that mail-extr fails on, please send it to me along
49 ;; with what you think the correct results should be. I do not consider it a
50 ;; bug if mail-extr mangles a comment that does not correspond to a real
51 ;; human full name, although I would prefer that mail-extr would return the
56 ;; * Full name handling:
58 ;; * knows where full names can be found in an address.
59 ;; * avoids using empty comments and quoted text.
60 ;; * extracts full names from mailbox names.
61 ;; * recognizes common formats for comments after a full name.
62 ;; * puts a period and a space after each initial.
63 ;; * understands & referring to the mailbox name capitalized.
64 ;; * strips name prefixes like "Prof.", etc..
65 ;; * understands what characters can occur in names (not just letters).
66 ;; * figures out middle initial from mailbox name.
67 ;; * removes funny nicknames.
68 ;; * keeps suffixes such as Jr., Sr., III, etc.
69 ;; * reorders "Last, First" type names.
71 ;; * Address handling:
73 ;; * parses rfc822 quoted text, comments, and domain literals.
74 ;; * parses rfc822 multi-line headers.
75 ;; * does something reasonable with rfc822 GROUP addresses.
76 ;; * handles many rfc822 noncompliant and garbage addresses.
77 ;; * canonicalizes addresses (after stripping comments/phrases outside <>).
78 ;; * converts ! addresses into .UUCP and %-style addresses.
79 ;; * converts rfc822 ROUTE addresses to %-style addresses.
80 ;; * truncates %-style addresses at leftmost fully qualified domain name.
81 ;; * handles local relative precedence of ! vs. % and @ (untested).
83 ;; It does almost no string creation. It primarily uses the built-in
84 ;; parsing routines with the appropriate syntax tables. This should
85 ;; result in greater speed.
89 ;; * handle all test cases. (This will take forever.)
90 ;; * software to pick the correct header to use (eg., "Senders-Name:").
91 ;; * multiple addresses in the "From:" header (almost all of the necessary
93 ;; * flag to not treat `,' as an address separator. (This is useful when
94 ;; there is a "From:" header but no "Sender:" header, because then there
95 ;; is only allowed to be one address.)
96 ;; * mailbox name does not necessarily contain full name.
97 ;; * fixing capitalization when it's all upper or lowercase. (Hard!)
98 ;; * some of the domain literal handling is missing. (But I've never even
99 ;; seen one of these in a mail address, so maybe no big deal.)
100 ;; * arrange to have syntax tables byte-compiled.
102 ;; * delete unused variables.
103 ;; * arrange for testing with different relative precedences of ! vs. @
105 ;; * put mail-variant-method back into mail-extract-address-components.
106 ;; * insert documentation strings!
107 ;; * handle X.400-gatewayed addresses according to RFC 1148.
111 ;; Mon Apr 6 23:59:09 1992 Joe Wells (jbw at bigbird.bu.edu)
113 ;; * Cleaned up some more. Release version 1.0 to world.
115 ;; Sun Apr 5 19:39:08 1992 Joe Wells (jbw at bigbird.bu.edu)
117 ;; * Cleaned up full name extraction extensively.
119 ;; Sun Feb 2 14:45:24 1992 Joe Wells (jbw at bigbird.bu.edu)
121 ;; * Total rewrite. Integrated mail-canonicalize-address into
122 ;; mail-extract-address-components. Now handles GROUP addresses more
123 ;; or less correctly. Better handling of lots of different cases.
125 ;; Fri Jun 14 19:39:50 1991
130 ;; Variable definitions.
132 (defvar mail-
@-binds-tighter-than-
! nil
)
134 ;;----------------------------------------------------------------------
135 ;; what orderings are meaningful?????
136 ;;(defvar mail-operator-precedence-list '(?! ?% ?@))
137 ;; Right operand of a % or a @ must be a domain name, period. No other
138 ;; operators allowed. Left operand of a @ is an address relative to that
141 ;; Left operand of a ! must be a domain name. Right operand is an
142 ;; arbitrary address.
143 ;;----------------------------------------------------------------------
145 (defconst mail-space-char
32)
147 (defconst mail-whitespace
" \t\n")
149 ;; Any character that can occur in a name in an RFC822 address.
150 ;; Yes, there are weird people with digits in their names.
151 (defconst mail-all-letters
"A-Za-z---{|}'~0-9`.")
153 ;; Any character that can occur in a name, not counting characters that
154 ;; separate parts of a multipart name.
155 (defconst mail-all-letters-but-separators
"A-Za-z{|}'~0-9`")
157 ;; Any character that can start a name
158 (defconst mail-first-letters
"A-Za-z")
160 ;; Any character that can end a name.
161 (defconst mail-last-letters
"A-Za-z`'.")
163 ;; Matches an initial not followed by both a period and a space.
164 (defconst mail-bad-initials-pattern
165 (format "\\(\\([^%s]\\|\\`\\)[%s]\\)\\(\\.\\([^ ]\\)\\| \\|\\([^%s .]\\)\\|\\'\\)"
166 mail-all-letters mail-first-letters mail-all-letters
))
168 (defconst mail-non-name-chars
(concat "^" mail-all-letters
"."))
170 (defconst mail-non-begin-name-chars
(concat "^" mail-first-letters
))
172 (defconst mail-non-end-name-chars
(concat "^" mail-last-letters
))
174 ;; Matches periods used instead of spaces. Must not match the period
175 ;; following an initial.
176 (defconst mail-bad-\.-pattern
177 (format "\\([%s][%s]\\)\\.+\\([%s]\\)"
178 mail-all-letters mail-last-letters mail-first-letters
))
180 ;; Matches an embedded or leading nickname that should be removed.
181 (defconst mail-nickname-pattern
182 (format "\\([ .]\\|\\`\\)[\"'`\[\(]\\([ .%s]+\\)[\]\"'\)] "
185 ;; Matches a leading title that is not part of the name (does not
186 ;; contribute to uniquely identifying the person).
187 (defconst mail-full-name-prefixes
188 '"\\` *\\(Prof\\|Dr\\|Mrs?\\|Rev\\|Rabbi\\|SysOp\\|LCDR\\)\\.? ")
190 ;; Matches the occurrence of a generational name suffix, and the last
191 ;; character of the preceding name.
192 (defconst mail-full-name-suffix-pattern
194 "\\(,? ?\\([JjSs]r\\.?\\|V?I+V?\\)\\)\\([^%s]\\([^%s]\\|\\'\\)\\|\\'\\)"
195 mail-all-letters mail-all-letters
))
197 (defconst mail-roman-numeral-pattern
200 ;; Matches a trailing uppercase (with other characters possible) acronym.
201 ;; Must not match a trailing uppercase last name or trailing initial
202 (defconst mail-weird-acronym-pattern
"\\([A-Z]+[-_/]\\|[A-Z][A-Z][A-Z]?\\b\\)")
204 ;; Matches a mixed-case or lowercase name (not an initial).
205 (defconst mail-mixed-case-name-pattern
207 "\\b\\([a-z][%s]*[%s]\\|[%s][%s]*[a-z][%s]*[%s]\\|[%s][%s]*[a-z]\\)"
208 mail-all-letters mail-last-letters
209 mail-first-letters mail-all-letters mail-all-letters mail-last-letters
210 mail-first-letters mail-all-letters
))
212 ;; Matches a trailing alternative address.
213 (defconst mail-alternative-address-pattern
"[a-zA-Z.]+[!@][a-zA-Z.]")
215 ;; Matches a variety of trailing comments not including comma-delimited
217 (defconst mail-trailing-comment-start-pattern
" [-{]\\|--\\|[+@#></\;]")
219 ;; Matches a name (not an initial).
220 ;; This doesn't force a word boundary at the end because sometimes a
221 ;; comment is separated by a `-' with no preceding space.
222 (defconst mail-name-pattern
225 mail-first-letters mail-all-letters mail-last-letters
))
227 (defconst mail-initial-pattern
228 (format "\\b[%s]\\([. ]\\|\\b\\)" mail-first-letters
))
230 ;; Matches a single name before a comma.
231 (defconst mail-last-name-first-pattern
232 (concat "\\`" mail-name-pattern
","))
234 ;; Matches telephone extensions.
235 (defconst mail-telephone-extension-pattern
236 "\\(\\([Ee]xt\\|[Tt]el\\|[Xx]\\).?\\)? *\\+?[0-9][- 0-9]+")
238 ;; Matches ham radio call signs.
239 (defconst mail-ham-call-sign-pattern
240 "\\b[A-Z]+[0-9][A-Z0-9]*")
242 ;; Matches normal single-part name
243 (defconst mail-normal-name-pattern
246 mail-first-letters mail-all-letters-but-separators mail-last-letters
))
248 ;; Matches normal two names with missing middle initial
249 (defconst mail-two-name-pattern
250 (concat "\\`\\(" mail-normal-name-pattern
251 "\\|" mail-initial-pattern
252 "\\) +\\(" mail-normal-name-pattern
"\\)\\(,\\|\\'\\)"))
254 (defvar address-syntax-table
(make-syntax-table))
255 (defvar address-comment-syntax-table
(make-syntax-table))
256 (defvar address-domain-literal-syntax-table
(make-syntax-table))
257 (defvar address-text-comment-syntax-table
(make-syntax-table))
258 (defvar address-text-syntax-table
(make-syntax-table))
262 (let ((syntax-table (symbol-value (car pair
))))
266 (if (eq 2 (length item
))
267 (modify-syntax-entry (car item
) (car (cdr item
)) syntax-table
)
268 (let ((char (car item
))
269 (bound (car (cdr item
)))
270 (syntax (car (cdr (cdr item
)))))
271 (while (<= char bound
)
272 (modify-syntax-entry char syntax syntax-table
)
273 (setq char
(1+ char
)))))))
275 '((address-syntax-table
276 (0 31 "w") ;control characters
278 (?
! ?~
"w") ;printable characters
280 (128 255 "w") ;high-bit-on characters
297 ;; % and ! aren't RFC822 characters, but it is convenient to pretend
301 (address-comment-syntax-table
306 (address-domain-literal-syntax-table
311 (address-text-comment-syntax-table
324 (address-text-syntax-table
338 ;; Utility functions and macros.
340 (defmacro mail-undo-backslash-quoting
(beg end
)
343 (narrow-to-region (, beg
) (, end
))
344 (goto-char (point-min))
346 (while (re-search-forward "\\\\\\(.\\)" nil t
)
347 (replace-match "\\1")
348 ;; CHECK: does this leave point after the replacement?
351 (defmacro mail-nuke-char-at
(pos)
355 (insert mail-space-char
))))
357 (defmacro mail-nuke-elements-outside-range
(list-symbol beg-symbol end-symbol
358 &optional no-replace
)
360 (setq temp
(, list-symbol
))
362 (cond ((or (> (car temp
) (, end-symbol
))
363 (< (car temp
) (, beg-symbol
)))
365 (` (mail-nuke-char-at (car temp
)))))
367 (setq temp
(cdr temp
)))
368 (setq (, list-symbol
) (delq nil
(, list-symbol
))))))
370 (defun mail-demarkerize (marker)
373 (let ((temp (marker-position marker
)))
374 (set-marker marker nil
)
378 (defun mail-markerize (pos)
384 (defmacro mail-last-element
(list)
385 "Return last element of LIST."
386 (` (let ((list (, list
)))
387 (while (not (null (cdr list
)))
388 (setq list
(cdr list
)))
391 (defmacro mail-safe-move-sexp
(arg)
392 "Safely skip over one balanced sexp, if there is one. Return t if success."
393 (` (condition-case error
395 (goto-char (scan-sexps (point) (, arg
)))
398 (if (string-equal (nth 1 error
) "Unbalanced parentheses")
401 (signal (car error
) (cdr error
))))))))
404 ;; The main function to grind addresses
406 (defun mail-extract-address-components (address)
407 "Given an rfc 822 ADDRESS, extract full name and canonical address.
408 Returns a list of the form (FULL-NAME CANONICAL-ADDRESS)."
409 (let ((canonicalization-buffer (get-buffer-create "*canonical address*"))
410 (extraction-buffer (get-buffer-create "*extract address components*"))
414 <-pos
>-pos
@-pos
:-pos
,-pos
!-pos %-pos \
;-pos
415 group-
:-pos group-\
;-pos route-addr-:-pos
417 first-real-pos last-real-pos
418 phrase-beg phrase-end
419 comment-beg comment-end
428 saved-%-pos saved-
!-pos saved-
@-pos
429 domain-pos \.-pos insert-point
)
432 (set-buffer extraction-buffer
)
433 (buffer-disable-undo extraction-buffer
)
434 (set-syntax-table address-syntax-table
)
437 (setq case-fold-search nil
)
439 ;; Insert extra space at beginning to allow later replacement with <
440 ;; without having to move markers.
441 (insert mail-space-char address
)
443 ;; stolen from rfc822.el
444 ;; Unfold multiple lines.
445 (goto-char (point-min))
446 (while (re-search-forward "\\([^\\]\\(\\\\\\\\\\)*\\)\n[ \t]" nil t
)
447 (replace-match "\\1 " t
))
449 ;; first pass grabs useful information about address
450 (goto-char (point-min))
452 (skip-chars-forward mail-whitespace
)
454 (setq char
(char-after (point)))
456 (if (not (eq char ?\
())
457 (setq first-real-pos
(point))))
461 (set-syntax-table address-comment-syntax-table
)
462 ;; only record the first non-empty comment's position
463 (if (and (not comment-beg
)
466 (skip-chars-forward mail-whitespace
)
467 (not (eq ?\
) (char-after (point))))))
468 (setq comment-beg
(point)))
469 ;; TODO: don't record if unbalanced
470 (or (mail-safe-move-sexp 1)
472 (set-syntax-table address-syntax-table
)
475 (setq comment-end
(point))))
478 ;; only record the first non-empty quote's position
479 (if (and (not quote-beg
)
482 (skip-chars-forward mail-whitespace
)
483 (not (eq ?
\" (char-after (point))))))
484 (setq quote-beg
(point)))
485 ;; TODO: don't record if unbalanced
486 (or (mail-safe-move-sexp 1)
490 (setq quote-end
(point))))
493 (set-syntax-table address-domain-literal-syntax-table
)
494 (or (mail-safe-move-sexp 1)
496 (set-syntax-table address-syntax-table
))
497 ;; commas delimit addresses when outside < > pairs.
501 ;; handle weird munged addresses
502 (> (mail-last-element <-pos
) (car >-pos
)))))
503 (setq multiple-addresses t
)
505 (narrow-to-region (point-min) (point)))
506 ;; record the position of various interesting chars, determine
508 ((setq record-pos-symbol
510 '((?
< .
<-pos
) (?
> .
>-pos
) (?
@ .
@-pos
)
511 (?
: .
:-pos
) (?
, .
,-pos
) (?
! .
!-pos
)
512 (?% . %-pos
) (?\
; . \;-pos)))))
513 (set record-pos-symbol
514 (cons (point) (symbol-value record-pos-symbol
)))
519 ;; comment terminator illegal
521 ;; domain literal terminator illegal
523 ;; \ allowed only within quoted strings,
524 ;; domain literals, and comments
527 (mail-nuke-char-at (point))
532 (setq last-real-pos
(point))))
534 ;; Use only the leftmost <, if any. Replace all others with spaces.
536 (mail-nuke-char-at (car <-pos
))
537 (setq <-pos
(cdr <-pos
)))
539 ;; Use only the rightmost >, if any. Replace all others with spaces.
541 (mail-nuke-char-at (nth 1 >-pos
))
542 (setcdr >-pos
(nthcdr 2 >-pos
)))
544 ;; If multiple @s and a :, but no < and >, insert around buffer.
545 ;; This commonly happens on the UUCP "From " line. Ugh.
546 (cond ((and (> (length @-pos
) 1)
547 :-pos
;TODO: check if between @s
549 (goto-char (point-min))
551 (setq <-pos
(list (point)))
554 ;; If < but no >, insert > in rightmost possible position
557 (goto-char (point-max))
558 (setq >-pos
(list (point)))
561 ;; If > but no <, replace > with space.
564 (mail-nuke-char-at (car >-pos
))
567 ;; Turn >-pos and <-pos into non-lists
568 (setq >-pos
(car >-pos
)
571 ;; Trim other punctuation lists of items outside < > pair to handle
573 (cond (<-pos
; don't need to check >-pos also
574 ;; handle bozo software that violates RFC 822 by sticking
575 ;; punctuation marks outside of a < > pair
576 (mail-nuke-elements-outside-range @-pos
<-pos
>-pos t
)
577 ;; RFC 822 says nothing about these two outside < >, but
578 ;; remove those positions from the lists to make things
580 (mail-nuke-elements-outside-range !-pos
<-pos
>-pos t
)
581 (mail-nuke-elements-outside-range %-pos
<-pos
>-pos t
)))
583 ;; Check for : that indicates GROUP list and for : part of
585 ;; Can't possibly be more than two :. Nuke any extra.
587 (setq temp
(car :-pos
)
589 (cond ((and <-pos
>-pos
592 (if (or route-addr-
:-pos
595 (< temp
(nth 1 @-pos
)))
596 (mail-nuke-char-at temp
)
597 (setq route-addr-
:-pos temp
)))
601 (setq group-
:-pos temp
))))
603 ;; Nuke any ; that is in or to the left of a < > pair or to the left
604 ;; of a GROUP starting :. Also, there may only be one ;.
606 (setq temp
(car \
;-pos)
608 (cond ((and <-pos
>-pos
611 (mail-nuke-char-at temp
))
612 ((and (or (not group-
:-pos
)
613 (> temp group-
:-pos
))
615 (setq group-\
;-pos temp))))
617 ;; Handle junk like ";@host.company.dom" that sendmail adds.
618 ;; **** should I remember comment positions?
620 ;; this is fine for now
621 (mail-nuke-elements-outside-range !-pos group-
:-pos group-\
;-pos t)
622 (mail-nuke-elements-outside-range @-pos group-
:-pos group-\
;-pos t)
623 (mail-nuke-elements-outside-range %-pos group-
:-pos group-\
;-pos t)
624 (mail-nuke-elements-outside-range ,-pos group-
:-pos group-\
;-pos t)
626 (> last-real-pos
(1+ group-\
;-pos))
627 (setq last-real-pos
(1+ group-\
;-pos)))
629 (> comment-end group-\
;-pos)
630 (setq comment-end nil
633 (> quote-end group-\
;-pos)
636 (narrow-to-region (point-min) group-\
;-pos))
638 ;; Any commas must be between < and : of ROUTE-ADDR. Nuke any
640 ;; Hell, go ahead an nuke all of the commas.
641 ;; **** This will cause problems when we start handling commas in
642 ;; the PHRASE part .... no it won't ... yes it will ... ?????
643 (mail-nuke-elements-outside-range ,-pos
1 1)
645 ;; can only have multiple @s inside < >. The fact that some MTAs
646 ;; put de-bracketed ROUTE-ADDRs in the UUCP-style "From " line is
649 ;; Locate PHRASE part of ROUTE-ADDR.
652 (skip-chars-backward mail-whitespace
)
653 (setq phrase-end
(point))
654 (goto-char (or ;;group-:-pos
656 (skip-chars-forward mail-whitespace
)
657 (if (< (point) phrase-end
)
658 (setq phrase-beg
(point))
659 (setq phrase-end nil
))))
661 ;; handle ROUTE-ADDRS with real ROUTEs.
662 ;; If there are multiple @s, then we assume ROUTE-ADDR syntax, and
663 ;; any % or ! must be semantically meaningless.
664 ;; TODO: do this processing into canonicalization buffer
665 (cond (route-addr-:-pos
668 >-pos
(copy-marker >-pos
)
669 route-addr-
:-pos
(copy-marker route-addr-
:-pos
))
671 (insert-before-markers ?X
)
672 (goto-char (car @-pos
))
673 (while (setq @-pos
(cdr @-pos
))
675 (setq %-pos
(cons (point-marker) %-pos
))
677 (goto-char (1- >-pos
))
679 (insert-buffer-substring extraction-buffer
680 (car @-pos
) route-addr-
:-pos
)
681 (delete-region (car @-pos
) route-addr-
:-pos
))
683 (setq saved-
@-pos
(list (point)))))
684 (setq @-pos saved-
@-pos
)
687 (mail-nuke-char-at route-addr-
:-pos
)
688 (mail-demarkerize route-addr-
:-pos
)
689 (setq route-addr-
:-pos nil
690 >-pos
(mail-demarkerize >-pos
)
691 %-pos
(mapcar 'mail-demarkerize %-pos
))))
694 (setq @-pos
(car @-pos
))
696 ;; TODO: remove comments in the middle of an address
698 (set-buffer canonicalization-buffer
)
700 (buffer-disable-undo canonicalization-buffer
)
701 (set-syntax-table address-syntax-table
)
702 (setq case-fold-search nil
)
706 (insert-buffer-substring extraction-buffer
)
709 (narrow-to-region (progn
710 (goto-char (1+ <-pos
))
711 (skip-chars-forward mail-whitespace
)
714 ;; ****** Oh no! What if the address is completely empty!
715 (narrow-to-region first-real-pos last-real-pos
))
718 (mail-nuke-elements-outside-range %-pos
(point-min) @-pos
))
720 (mail-nuke-elements-outside-range !-pos
(point-min) (car %-pos
)))
721 (and @-pos
!-pos
(not %-pos
)
722 (mail-nuke-elements-outside-range !-pos
(point-min) @-pos
))
724 ;; Error condition:?? (and %-pos (not @-pos))
727 ;; **** I don't understand this save-restriction and the
728 ;; narrow-to-region inside it. Why did I do that?
731 mail-
@-binds-tighter-than-
!)
733 (setq %-pos
(cons (point) %-pos
)
737 (setq insert-point
(point-max)))
738 (mail-@-binds-tighter-than-
!
739 (setq insert-point
(point-max)))
741 (setq insert-point
(mail-last-element %-pos
)
742 saved-%-pos
(mapcar 'mail-markerize %-pos
)
744 @-pos
(mail-markerize @-pos
)))
746 (setq insert-point
@-pos
)
747 (setq @-pos
(mail-markerize @-pos
)))
749 (setq insert-point
(point-max))))
750 (narrow-to-region (point-min) insert-point
)
751 (setq saved-
!-pos
(car !-pos
))
753 (goto-char (point-max))
754 (cond ((and (not @-pos
)
757 (insert-before-markers "@ "))
759 (setq %-pos
(cons (point) %-pos
))
760 (insert-before-markers "% ")))
762 (insert-buffer-substring
770 (mail-safe-move-sexp -
1)
771 (skip-chars-backward mail-whitespace
)
772 (eq ?.
(preceding-char)))
773 (insert-before-markers
775 (skip-chars-backward mail-whitespace
)
776 (eq ?.
(preceding-char)))
780 (setq !-pos
(cdr !-pos
))))
782 (setq %-pos
(append (mapcar 'mail-demarkerize saved-%-pos
)
784 (setq @-pos
(mail-demarkerize @-pos
))
785 (narrow-to-region (1+ saved-
!-pos
) (point-max))))
788 (goto-char (car %-pos
))
792 (setq %-pos
(cdr %-pos
))))
793 (setq %-pos
(nreverse %-pos
))
794 ;; RFC 1034 doesn't approve of this, oh well:
795 (downcase-region (or (car %-pos
) @-pos
(point-max)) (point-max))
796 (cond (%-pos
; implies @-pos valid
800 (goto-char (or (nth 1 temp
)
802 (skip-chars-backward mail-whitespace
)
804 (mail-safe-move-sexp -
1)
805 (setq domain-pos
(point))
806 (skip-chars-backward mail-whitespace
)
807 (setq \.-pos
(eq ?.
(preceding-char))))
811 (buffer-substring domain-pos
(point)))
813 (narrow-to-region (point-min) (point))
814 (goto-char (car temp
))
818 (setq %-pos
(delq @-pos %-pos
))
820 (throw 'truncated t
)))
821 (setq temp
(cdr temp
))))))
822 (setq mbox-beg
(point-min)
823 mbox-end
(if %-pos
(car %-pos
)
827 ;; Done canonicalizing address.
829 (set-buffer extraction-buffer
)
831 ;; Find the full name
833 (cond ((and phrase-beg
834 (eq quote-beg phrase-beg
)
835 (<= quote-end phrase-end
))
836 (narrow-to-region (1+ quote-beg
) (1- quote-end
))
837 (mail-undo-backslash-quoting (point-min) (point-max)))
839 (narrow-to-region phrase-beg phrase-end
))
841 (narrow-to-region (1+ comment-beg
) (1- comment-end
))
842 (mail-undo-backslash-quoting (point-min) (point-max)))
844 ;; *** Work in canon buffer instead? No, can't. Hmm.
845 (delete-region (point-min) (point-max))
846 (insert-buffer-substring canonicalization-buffer
848 (goto-char (point-min))
849 (setq \.-ends-name
(search-forward "_" nil t
))
850 (goto-char (point-min))
852 (skip-chars-forward mail-whitespace
)
854 (setq char
(char-after (point)))
857 (setq quote-beg
(point))
858 (or (mail-safe-move-sexp 1)
859 ;; TODO: handle this error condition!!!!!
861 ;; take into account deletions
862 (setq quote-end
(- (point) 2))
866 (goto-char quote-beg
)
868 (mail-undo-backslash-quoting quote-beg quote-end
)
869 (or (eq mail-space-char
(char-after (point)))
871 (setq \.-ends-name t
))
873 (if (eq (char-after (1+ (point))) ?_
)
877 (insert mail-space-char
))
879 (narrow-to-region (point-min) (point))
882 ((memq (char-syntax char
) '(?. ?
\\))
886 (setq atom-beg
(point))
888 (setq atom-end
(point))
890 (narrow-to-region atom-beg atom-end
)
891 (goto-char (point-min))
892 (while (re-search-forward "\\([^_]+\\)_" nil t
)
893 (replace-match "\\1 "))
894 (goto-char (point-max))))))))
896 (set-syntax-table address-text-syntax-table
)
898 (setq xxx
(mail-variant-method (buffer-string)))
899 (delete-region (point-min) (point-max))
901 (goto-char (point-min))
903 ;; ;; Compress whitespace
904 ;; (goto-char (point-min))
905 ;; (while (re-search-forward "[ \t\n]+" nil t)
906 ;; (replace-match " "))
908 ;; ;; Fix . used as space
909 ;; (goto-char (point-min))
910 ;; (while (re-search-forward mail-bad-\.-pattern nil t)
911 ;; (replace-match "\\1 \\2"))
913 ;; ;; Delete trailing parenthesized comment
914 ;; (goto-char (point-max))
915 ;; (skip-chars-backward mail-whitespace)
916 ;; (cond ((memq (char-after (1- (point))) '(?\) ?\} ?\]))
917 ;; (setq comment-end (point))
918 ;; (set-syntax-table address-text-comment-syntax-table)
919 ;; (or (mail-safe-move-sexp -1)
920 ;; (backward-char 1))
921 ;; (set-syntax-table address-text-syntax-table)
922 ;; (setq comment-beg (point))
923 ;; (skip-chars-backward mail-whitespace)
925 ;; (narrow-to-region (1+ comment-beg) (1- comment-end))
926 ;; (narrow-to-region (point-min) (point)))))
928 ;; ;; Find, save, and delete any name suffix
930 ;; (goto-char (point-min))
931 ;; (cond ((re-search-forward mail-full-name-suffix-pattern nil t)
932 ;; (setq name-suffix (buffer-substring (match-beginning 3)
934 ;; (replace-match "\\1 \\4")))
936 ;; ;; Delete ALL CAPS words and after, if preceded by mixed-case or
937 ;; ;; lowercase words. Eg. XT-DEM.
938 ;; (goto-char (point-min))
939 ;; ;; ## This will lose on something like "SMITH MAX".
940 ;; ;; ## maybe it should be
941 ;; ;; ## " \\([A-Z]+[-_/][A-Z]+\\|[A-Z][A-Z][A-Z]\\)\\b.*[^A-Z \t]"
942 ;; ;; ## that is, three-letter-upper-case-word with non-upper-case
943 ;; ;; ## characters following it.
944 ;; (if (re-search-forward mail-mixed-case-name-pattern nil t)
945 ;; (if (re-search-forward mail-weird-acronym-pattern nil t)
946 ;; (narrow-to-region (point-min) (match-beginning 0))))
948 ;; ;; Delete trailing alternative address
949 ;; (goto-char (point-min))
950 ;; (if (re-search-forward mail-alternative-address-pattern nil t)
951 ;; (narrow-to-region (point-min) (match-beginning 0)))
953 ;; ;; Delete trailing comment
954 ;; (goto-char (point-min))
955 ;; (if (re-search-forward mail-trailing-comment-start-pattern nil t)
957 ;; (goto-char (match-beginning 0))
958 ;; (skip-chars-backward mail-whitespace)
960 ;; (narrow-to-region (point-min) (match-beginning 0))))
962 ;; ;; Delete trailing comma-separated comment
963 ;; (goto-char (point-min))
964 ;; ;; ## doesn't this break "Smith, John"? Yes.
965 ;; (re-search-forward mail-last-name-first-pattern nil t)
966 ;; (while (search-forward "," nil t)
967 ;; (or (save-excursion
969 ;; (looking-at mail-full-name-suffix-pattern))
970 ;; (narrow-to-region (point-min) (1- (point)))))
972 ;; ;; Delete telephone numbers and ham radio call signs
973 ;; (goto-char (point-min))
974 ;; (if (re-search-forward mail-telephone-extension-pattern nil t)
975 ;; (narrow-to-region (point-min) (match-beginning 0)))
976 ;; (goto-char (point-min))
977 ;; (if (re-search-forward mail-ham-call-sign-pattern nil t)
978 ;; (if (eq (match-beginning 0) (point-min))
979 ;; (narrow-to-region (match-end 0) (point-max))
980 ;; (narrow-to-region (point-min) (match-beginning 0))))
982 ;; ;; Delete trailing word followed immediately by .
983 ;; (goto-char (point-min))
984 ;; ;; ## what's this for? doesn't it mess up "Public, Harry Q."? No.
985 ;; (if (re-search-forward "\\b[A-Za-z][A-Za-z]+\\. *\\'" nil t)
986 ;; (narrow-to-region (point-min) (match-beginning 0)))
988 ;; ;; Handle & substitution
989 ;; ;; TODO: remember to disable middle initial guessing
990 ;; (goto-char (point-min))
991 ;; (cond ((re-search-forward "\\( \\|\\`\\)&\\( \\|\\'\\)" nil t)
992 ;; (goto-char (match-end 1))
994 ;; (capitalize-region
997 ;; (insert-buffer-substring canonicalization-buffer
998 ;; mbox-beg mbox-end)
1001 ;; ;; Delete nickname
1002 ;; (goto-char (point-min))
1003 ;; (if (re-search-forward mail-nickname-pattern nil t)
1004 ;; (replace-match (if (eq (match-beginning 2) (1- (match-end 2)))
1008 ;; ;; Fixup initials
1010 ;; (goto-char (point-min))
1011 ;; (re-search-forward mail-bad-initials-pattern nil t))
1013 ;; (if (match-beginning 4)
1015 ;; (if (match-beginning 5)
1020 ;; (goto-char (point-min))
1021 ;; (if (re-search-forward mail-full-name-prefixes nil t)
1022 ;; (narrow-to-region (point) (point-max)))
1024 ;; ;; Delete trailing and preceding non-name characters
1025 ;; (goto-char (point-min))
1026 ;; (skip-chars-forward mail-non-begin-name-chars)
1027 ;; (narrow-to-region (point) (point-max))
1028 ;; (goto-char (point-max))
1029 ;; (skip-chars-backward mail-non-end-name-chars)
1030 ;; (narrow-to-region (point-min) (point))
1032 ;; If name is "First Last" and userid is "F?L", then assume
1033 ;; the middle initial is the second letter in the userid.
1034 ;; initially by Jamie Zawinski <jwz@lucid.com>
1035 (cond ((and (eq 3 (- mbox-end mbox-beg
))
1037 (goto-char (point-min))
1038 (looking-at mail-two-name-pattern
)))
1039 (setq fi
(char-after (match-beginning 0))
1040 li
(char-after (match-beginning 3)))
1042 (set-buffer canonicalization-buffer
)
1043 ;; char-equal is ignoring case here, so no need to upcase
1045 (let ((case-fold-search t
))
1046 (and (char-equal fi
(char-after mbox-beg
))
1047 (char-equal li
(char-after (1- mbox-end
)))
1048 (setq mi
(char-after (1+ mbox-beg
))))))
1050 ;; TODO: use better table than syntax table
1051 (eq ?w
(char-syntax mi
)))
1052 (goto-char (match-beginning 3))
1053 (insert (upcase mi
) ". ")))))
1055 ;; ;; Restore suffix
1056 ;; (cond (name-suffix
1057 ;; (goto-char (point-max))
1058 ;; (insert ", " name-suffix)
1059 ;; (backward-word 1)
1060 ;; (cond ((memq (following-char) '(?j ?J ?s ?S))
1061 ;; (capitalize-word 1)
1062 ;; (or (eq (following-char) ?.)
1065 ;; (upcase-word 1)))))
1068 (list (buffer-string)
1070 (set-buffer canonicalization-buffer
)
1074 ;; TODO: put this back in the above function now that it's proven:
1075 (defun mail-variant-method (string)
1076 (let ((variant-buffer (get-buffer-create "*variant method buffer*"))
1078 mixed-case-flag lower-case-flag upper-case-flag
1079 suffix-flag last-name-comma-flag
1080 comment-beg comment-end initial beg end
1083 (set-buffer variant-buffer
)
1084 (buffer-disable-undo variant-buffer
)
1085 (set-syntax-table address-text-syntax-table
)
1088 (setq case-fold-search nil
)
1092 ;; Fix . used as space
1093 (goto-char (point-min))
1094 (while (re-search-forward mail-bad-\.-pattern nil t
)
1095 (replace-match "\\1 \\2"))
1097 ;; Skip any initial garbage.
1098 (goto-char (point-min))
1099 (skip-chars-forward mail-non-begin-name-chars
)
1100 (skip-chars-backward "& \"")
1101 (narrow-to-region (point) (point-max))
1105 (skip-chars-forward mail-whitespace
)
1110 ((and (eq word-count
0)
1111 (looking-at mail-full-name-prefixes
))
1112 (goto-char (match-end 0))
1113 (narrow-to-region (point) (point-max)))
1115 ;; Stop after name suffix
1116 ((and (>= word-count
2)
1117 (looking-at mail-full-name-suffix-pattern
))
1118 (skip-chars-backward mail-whitespace
)
1119 (setq suffix-flag
(point))
1120 (if (eq ?
, (following-char))
1123 ;; Enforce at least one space after comma
1124 (or (eq mail-space-char
(following-char))
1125 (insert mail-space-char
))
1126 (skip-chars-forward mail-whitespace
)
1127 (cond ((memq (following-char) '(?j ?J ?s ?S
))
1129 (if (eq (following-char) ?.
)
1134 (setq word-count
(1+ word-count
))
1138 ((looking-at "MKA \\(.+\\)") ; "Mundanely Known As"
1140 (goto-char (match-beginning 1))
1141 (narrow-to-region (point) (point-max)))
1143 ;; Various stopping points
1145 ;; Stop before ALL CAPS acronyms, if preceded by mixed-case or
1146 ;; lowercase words. Eg. XT-DEM.
1147 (and (>= word-count
2)
1148 (or mixed-case-flag lower-case-flag
)
1149 (looking-at mail-weird-acronym-pattern
)
1150 (not (looking-at mail-roman-numeral-pattern
)))
1151 ;; Stop before 4-or-more letter lowercase words preceded by
1152 ;; mixed case or uppercase words.
1153 (and (>= word-count
2)
1154 (or upper-case-flag mixed-case-flag
)
1155 (looking-at "[a-z][a-z][a-z][a-z]+\\b"))
1156 ;; Stop before trailing alternative address
1157 (looking-at mail-alternative-address-pattern
)
1158 ;; Stop before trailing comment not introduced by comma
1159 (looking-at mail-trailing-comment-start-pattern
)
1160 ;; Stop before telephone numbers
1161 (looking-at mail-telephone-extension-pattern
))
1164 ;; Check for initial last name followed by comma
1165 ((and (eq ?
, (following-char))
1168 (setq last-name-comma-flag t
)
1169 (or (eq mail-space-char
(following-char))
1170 (insert mail-space-char
)))
1172 ;; Stop before trailing comma-separated comment
1173 ((eq ?
, (following-char))
1176 ;; Delete parenthesized/quoted comment/nickname
1177 ((memq (following-char) '(?\
( ?\
{ ?\
[ ?
\" ?
\' ?\
`))
1178 (setq comment-beg
(point))
1179 (set-syntax-table address-text-comment-syntax-table
)
1180 (cond ((memq (following-char) '(?
\' ?\
`))
1181 (if (eq ?
\' (following-char))
1183 (or (search-forward "'" nil t
)
1186 (or (mail-safe-move-sexp 1)
1187 (goto-char (point-max)))))
1188 (set-syntax-table address-text-syntax-table
)
1189 (setq comment-end
(point))
1191 ;; Handle case of entire name being quoted
1192 ((and (eq word-count
0)
1193 (looking-at " *\\'")
1194 (>= (- comment-end comment-beg
) 2))
1195 (narrow-to-region (1+ comment-beg
) (1- comment-end
))
1196 (goto-char (point-min)))
1198 ;; Handle case of quoted initial
1199 (if (and (or (= 3 (- comment-end comment-beg
))
1200 (and (= 4 (- comment-end comment-beg
))
1201 (eq ?.
(char-after (+ 2 comment-beg
)))))
1202 (not (looking-at " *\\'")))
1203 (setq initial
(char-after (1+ comment-beg
)))
1205 (delete-region comment-beg comment-end
)
1207 (insert initial
". ")))))
1209 ;; Delete ham radio call signs
1210 ((looking-at mail-ham-call-sign-pattern
)
1211 (delete-region (match-beginning 0) (match-end 0)))
1213 ;; Handle & substitution
1214 ;; TODO: remember to disable middle initial guessing
1216 (eq mail-space-char
(preceding-char)))
1217 (looking-at "&\\( \\|\\'\\)"))
1222 (insert-buffer-substring canonicalization-buffer
1227 ((looking-at mail-initial-pattern
)
1228 (or (eq (following-char) (upcase (following-char)))
1229 (setq lower-case-flag t
))
1231 (if (eq ?.
(following-char))
1234 (or (eq mail-space-char
(following-char))
1235 (insert mail-space-char
))
1236 (setq word-count
(1+ word-count
)))
1238 ;; Regular name words
1239 ((looking-at mail-name-pattern
)
1241 (setq end
(match-end 0))
1242 (set (if (re-search-forward "[a-z]" end t
)
1245 (re-search-forward "[A-Z]" end t
))
1248 'upper-case-flag
) t
)
1250 (setq word-count
(1+ word-count
)))
1255 (narrow-to-region (point-min) (point))
1257 ;; Delete trailing word followed immediately by .
1258 (cond ((not suffix-flag
)
1259 (goto-char (point-min))
1260 (if (re-search-forward "\\b[A-Za-z][A-Za-z]+\\. *\\'" nil t
)
1261 (narrow-to-region (point-min) (match-beginning 0)))))
1263 ;; If last name first put it at end (but before suffix)
1264 (cond (last-name-comma-flag
1265 (goto-char (point-min))
1266 (search-forward ",")
1267 (setq end
(1- (point)))
1268 (goto-char (or suffix-flag
(point-max)))
1269 (or (eq mail-space-char
(preceding-char))
1270 (insert mail-space-char
))
1271 (insert-buffer-substring (current-buffer) (point-min) end
)
1272 (narrow-to-region (1+ end
) (point-max))))
1274 (goto-char (point-max))
1275 (skip-chars-backward mail-non-end-name-chars
)
1276 (if (eq ?.
(following-char))
1278 (narrow-to-region (point)
1280 (goto-char (point-min))
1281 (skip-chars-forward mail-non-begin-name-chars
)
1284 ;; Compress whitespace
1285 (goto-char (point-min))
1286 (while (re-search-forward "[ \t\n]+" nil t
)
1287 (replace-match " "))
1289 (buffer-substring (point-min) (point-max))
1293 ;; The country names are just in there for show right now, and because
1294 ;; Jamie thought it would be neat. They aren't used yet.
1296 ;; Keep in mind that the country abbreviations follow ISO-3166. There is
1297 ;; a U.S. FIPS that specifies a different set of two-letter country
1300 ;; TODO: put this in its own obarray, instead of cluttering up the main
1301 ;; symbol table with junk.
1307 (put x
'domain-name t
)
1308 (put (car x
) 'domain-name
(nth 1 x
)))))
1310 (ar "Argentina") ; Argentine Republic
1311 arpa
; Advanced Projects Research Agency
1312 (at "Austria") ; The Republic of _
1315 (be "Belgium") ; The Kingdom of _
1317 bitnet
; Because It's Time NET
1318 (bo "Bolivia") ; Republic of _
1319 (br "Brazil") ; The Federative Republic of _
1323 (ch "Switzerland") ; The Swiss Confederation
1324 (cl "Chile") ; The Republic of _
1325 (cn "China") ; The People's Republic of _
1328 (cr "Costa Rica") ; The Republic of _
1329 (cs "Czechoslovakia")
1333 (do "Dominican Republic") ; The _
1334 (ec "Ecuador") ; The Republic of _
1336 (eg "Egypt") ; The Arab Republic of _
1337 (es "Spain") ; The Kingdom of _
1338 (fi "Finland") ; The Republic of _
1341 gov
; Government (U.S.A.)
1342 (gr "Greece") ; The Hellenic Republic
1344 (hu "Hungary") ; The Hungarian People's Republic (???)
1346 (il "Israel") ; The State of _
1347 (in "India") ; The Republic of _
1348 int
; something British, don't know what
1349 (is "Iceland") ; The Republic of _
1350 (it "Italy") ; The Italian Republic
1353 (kn "St. Kitts and Nevis")
1356 (lk "Sri Lanka") ; The Democratic Socialist Republic of _
1357 mil
; Military (U.S.A.)
1358 (mx "Mexico") ; The United Mexican States
1359 (my "Malaysia") ; changed to Myanmar????
1361 nato
; North Atlantic Treaty Organization
1363 (ni "Nicaragua") ; The Republic of _
1364 (nl "Netherlands") ; The Kingdom of the _
1365 (no "Norway") ; The Kingdom of _
1369 (pg "Papua New Guinea")
1370 (ph "Philippines") ; The Republic of the _
1373 (pt "Portugal") ; The Portugese Republic
1375 (se "Sweden") ; The Kingdom of _
1376 (sg "Singapore") ; The Republic of _
1379 (th "Thailand") ; The Kingdom of _
1381 (tr "Turkey") ; The Republic of _
1382 (tt "Trinidad and Tobago")
1384 (uk "United Kingdom") ; The _ of Great Britain
1385 unter-dom
; something German
1386 (us "U.S.A.") ; The United States of America
1387 uucp
; Unix to Unix CoPy
1388 (uy "Uruguay") ; The Eastern Republic of _
1389 (vc "St. Vincent and the Grenadines")
1390 (ve "Venezuela") ; The Republic of _
1391 (yu "Yugoslavia") ; The Socialist Federal Republic of _
1392 ;; Also said to be Zambia ...
1393 (za "South Africa") ; The Republic of _ (why not Zaire???)
1394 (zw "Zimbabwe") ; Republic of _
1399 ;; Code for testing.
1401 (defun time-extract ()
1403 (setq times
(cons (current-time-string) times
)
1404 list problem-address-alist
)
1406 (mail-extract-address-components (car (car list
)))
1407 (setq list
(cdr list
)))
1408 (setq times
(cons (current-time-string) times
))
1411 (defun test-extract (&optional starting-point
)
1413 (set-buffer (get-buffer-create "*Testing*"))
1416 (mapcar 'test-extract-internal
1418 (memq starting-point problem-address-alist
)
1419 problem-address-alist
)))
1421 (defvar failed-item
)
1422 (defun test-extract-internal (item)
1423 (setq failed-item item
)
1424 (let* ((address (car item
))
1425 (correct-name (nth 1 item
))
1426 (correct-canon (nth 2 item
))
1427 (result (mail-extract-address-components address
))
1429 (canon (nth 1 result
))
1430 (name-correct (or (null correct-name
)
1431 (string-equal (downcase correct-name
)
1433 (canon-correct (or (null correct-canon
)
1434 (string-equal correct-canon canon
))))
1435 (cond ((not (and name-correct canon-correct
))
1436 (pop-to-buffer "*Testing*")
1437 (select-window (get-buffer-window (current-buffer)))
1438 (goto-char (point-max))
1439 (insert "Address: " address
"\n")
1440 (if (not name-correct
)
1441 (insert " Correct Name: [" correct-name
1442 "]\; Result: [" name
"]\n"))
1443 (if (not canon-correct
)
1444 (insert " Correct Canon: [" correct-canon
1445 "]\; Result: [" canon
"]\n"))
1448 (setq failed-item nil
))
1450 (defun test-continue-extract ()
1452 (test-extract failed-item
))
1457 ;; warsaw@nlm.nih.gov (A Bad Dude -- Barry Warsaw)
1468 ;;; mail-extr.el ends here