lisp/emacs-lisp/rx.el

   1 ;;; rx.el --- sexp notation for regular expressions
   2
   3 ;; Copyright (C) 2001-2011 Free Software Foundation, Inc.
   4
   5 ;; Author: Gerd Moellmann <gerd@gnu.org>
   6 ;; Maintainer: FSF
   7 ;; Keywords: strings, regexps, extensions
   8
   9 ;; This file is part of GNU Emacs.
  10
  11 ;; GNU Emacs is free software: you can redistribute it and/or modify
  12 ;; it under the terms of the GNU General Public License as published by
  13 ;; the Free Software Foundation, either version 3 of the License, or
  14 ;; (at your option) any later version.
  15
  16 ;; GNU Emacs is distributed in the hope that it will be useful,
  17 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19 ;; GNU General Public License for more details.
  20
  21 ;; You should have received a copy of the GNU General Public License
  22 ;; along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.
  23
  24 ;;; Commentary:
  25
  26 ;; This is another implementation of sexp-form regular expressions.
  27 ;; It was unfortunately written without being aware of the Sregex
  28 ;; package coming with Emacs, but as things stand, Rx completely
  29 ;; covers all regexp features, which Sregex doesn't, doesn't suffer
  30 ;; from the bugs mentioned in the commentary section of Sregex, and
  31 ;; uses a nicer syntax (IMHO, of course :-).
  32
   33 ;; This significantly extended version of the original is almost
  34 ;; compatible with Sregex.  The only incompatibility I (fx) know of is
  35 ;; that the `repeat' form can't have multiple regexp args.
  36
  37 ;; Now alternative forms are provided for a degree of compatibility
  38 ;; with Shivers' attempted definitive SRE notation
  39 ;; <URL:http://www.ai.mit.edu/~/shivers/sre.txt>.  SRE forms not
  40 ;; catered for include: dsm, uncase, w/case, w/nocase, ,@<exp>,
  41 ;; ,<exp>, (word ...), word+, posix-string, and character class forms.
  42 ;; Some forms are inconsistent with SRE, either for historical reasons
  43 ;; or because of the implementation -- simple translation into Emacs
  44 ;; regexp strings.  These include: any, word.  Also, case-sensitivity
  45 ;; and greediness are controlled by variables external to the regexp,
  46 ;; and you need to feed the forms to the `posix-' functions to get
  47 ;; SRE's POSIX semantics.  There are probably more difficulties.
  48
  49 ;; Rx translates a sexp notation for regular expressions into the
  50 ;; usual string notation.  The translation can be done at compile-time
  51 ;; by using the `rx' macro.  It can be done at run-time by calling
  52 ;; function `rx-to-string'.  See the documentation of `rx' for a
  53 ;; complete description of the sexp notation.
  54 ;;
  55 ;; Some examples of string regexps and their sexp counterparts:
  56 ;;
  57 ;; "^[a-z]*"
  58 ;; (rx (and line-start (0+ (in "a-z"))))
  59 ;;
  60 ;; "\n[^ \t]"
  61 ;; (rx (and "\n" (not blank))), or
  62 ;; (rx (and "\n" (not (any " \t"))))
  63 ;;
  64 ;; "\\*\\*\\* EOOH \\*\\*\\*\n"
  65 ;; (rx "*** EOOH ***\n")
  66 ;;
  67 ;; "\\<\\(catch\\|finally\\)\\>[^_]"
  68 ;; (rx (and word-start (submatch (or "catch" "finally")) word-end
  69 ;;          (not (any ?_))))
  70 ;;
  71 ;; "[ \t\n]*:\\([^:]+\\|$\\)"
  72 ;; (rx (and (zero-or-more (in " \t\n")) ":"
  73 ;;          (submatch (or line-end (one-or-more (not (any ?:)))))))
  74 ;;
  75 ;; "^content-transfer-encoding:\\(\n?[\t ]\\)*quoted-printable\\(\n?[\t ]\\)*"
   76 ;; (rx (and line-start
   77 ;;          "content-transfer-encoding:"
   78 ;;          (* (submatch (? ?\n) (any "\t ")))
   79 ;;          "quoted-printable"
   80 ;;          (* (submatch (? ?\n) (any "\t ")))))
  81 ;;
  82 ;; (concat "^\\(?:" something-else "\\)")
  83 ;; (rx (and line-start (eval something-else))), statically or
   84 ;; (rx-to-string `(and line-start ,something-else)), dynamically.
  85 ;;
  86 ;; (regexp-opt '(STRING1 STRING2 ...))
  87 ;; (rx (or STRING1 STRING2 ...)), or in other words, `or' automatically
  88 ;; calls `regexp-opt' as needed.
  89 ;;
  90 ;; "^;;\\s-*\n\\|^\n"
  91 ;; (rx (or (and line-start ";;" (0+ space) ?\n)
  92 ;;         (and line-start ?\n)))
  93 ;;
  94 ;; "\\$[I]d: [^ ]+ \\([^ ]+\\) "
  95 ;; (rx (and "$Id: "
  96 ;;          (1+ (not (in " ")))
  97 ;;          " "
  98 ;;          (submatch (1+ (not (in " "))))
  99 ;;          " "))
 100 ;;
 101 ;; "\\\\\\\\\\[\\w+"
 102 ;; (rx (and ?\\ ?\\ ?\[ (1+ word)))
 103 ;;
 104 ;; etc.
 105
 106 ;;; History:
 107 ;;
 108
 109 ;;; Code:
 110
;; Dispatch table for the rx translator.  `rx-info' looks symbols up in
;; this list with `assq', so the FIRST entry for a symbol is the one
;; found.  A later entry for the same symbol is only consulted by
;; `rx-info' when the first one is of the wrong kind for the context
;; (a string where a function list is needed, or vice versa).
(defconst rx-constituents
  '((and                . (rx-and 1 nil))
    (seq                . and)          ; SRE
    (:                  . and)          ; SRE
    (sequence           . and)          ; sregex
    (or                 . (rx-or 1 nil))
    (|                  . or)           ; SRE
    (not-newline        . ".")
    (nonl               . not-newline)  ; SRE
    (anything           . (rx-anything 0 nil))
    ;; `any' is deliberately listed twice: the list definition serves
    ;; `(any ...)' forms, the string definition serves the bare symbol.
    ;; Keep both entries, in this order.
    (any                . (rx-any 1 nil rx-check-any)) ; inconsistent with SRE
    (any                . ".")          ; sregex
    (in                 . any)
    (char               . any)          ; sregex
    (not-char           . (rx-not-char 1 nil rx-check-any)) ; sregex
    (not                . (rx-not 1 1 rx-check-not))
    ;; Bounded repetition.
    (repeat             . (rx-repeat 2 nil))
    (=                  . (rx-= 2 nil))    ; SRE
    (>=                 . (rx->= 2 nil))   ; SRE
    (**                 . (rx-** 2 nil))   ; SRE
    (submatch           . (rx-submatch 1 nil)) ; SRE
    (group              . submatch)     ; sregex
    ;; Unbounded repetition; all variants are handled by `rx-kleene',
    ;; which inspects the head actually used to choose the operator.
    (zero-or-more       . (rx-kleene 1 nil))
    (one-or-more        . (rx-kleene 1 nil))
    (zero-or-one        . (rx-kleene 1 nil))
    (\?                 . zero-or-one)  ; SRE
    (\??                . zero-or-one)
    (*                  . zero-or-more) ; SRE
    (*?                 . zero-or-more)
    (0+                 . zero-or-more)
    (+                  . one-or-more)  ; SRE
    (+?                 . one-or-more)
    (1+                 . one-or-more)
    (optional           . zero-or-one)
    (opt                . zero-or-one)  ; sregex
    (minimal-match      . (rx-greedy 1 1))
    (maximal-match      . (rx-greedy 1 1))
    (backref            . (rx-backref 1 1 rx-check-backref))
    ;; Zero-width assertions.
    (line-start         . "^")
    (bol                . line-start)   ; SRE
    (line-end           . "$")
    (eol                . line-end)     ; SRE
    (string-start       . "\\`")
    (bos                . string-start) ; SRE
    (bot                . string-start) ; sregex
    (string-end         . "\\'")
    (eos                . string-end)   ; SRE
    (eot                . string-end)   ; sregex
    (buffer-start       . "\\`")
    (buffer-end         . "\\'")
    (point              . "\\=")
    (word-start         . "\\<")
    (bow                . word-start)   ; SRE
    (word-end           . "\\>")
    (eow                . word-end)     ; SRE
    (word-boundary      . "\\b")
    (not-word-boundary  . "\\B")        ; sregex
    (symbol-start       . "\\_<")
    (symbol-end         . "\\_>")
    (syntax             . (rx-syntax 1 1))
    (not-syntax         . (rx-not-syntax 1 1)) ; sregex
    (category           . (rx-category 1 1 rx-check-category))
    (eval               . (rx-eval 1 1))
    (regexp             . (rx-regexp 1 1 stringp))
    (regex              . regexp)       ; sregex
    ;; Character classes.  `rx-check-any' and `rx-check-not' recognize
    ;; these by the "[[:NAME:]]" shape of their translation.
    (digit              . "[[:digit:]]")
    (numeric            . digit)        ; SRE
    (num                . digit)        ; SRE
    (control            . "[[:cntrl:]]") ; SRE
    (cntrl              . control)       ; SRE
    (hex-digit          . "[[:xdigit:]]") ; SRE
    (hex                . hex-digit)      ; SRE
    (xdigit             . hex-digit)      ; SRE
    (blank              . "[[:blank:]]")  ; SRE
    (graphic            . "[[:graph:]]")  ; SRE
    (graph              . graphic)        ; SRE
    (printing           . "[[:print:]]")  ; SRE
    (print              . printing)       ; SRE
    (alphanumeric       . "[[:alnum:]]")  ; SRE
    (alnum              . alphanumeric)   ; SRE
    (letter             . "[[:alpha:]]")
    (alphabetic         . letter)       ; SRE
    (alpha              . letter)       ; SRE
    (ascii              . "[[:ascii:]]") ; SRE
    (nonascii           . "[[:nonascii:]]")
    (lower              . "[[:lower:]]") ; SRE
    (lower-case         . lower)         ; SRE
    (punctuation        . "[[:punct:]]") ; SRE
    (punct              . punctuation)   ; SRE
    (space              . "[[:space:]]") ; SRE
    (whitespace         . space)         ; SRE
    (white              . space)         ; SRE
    (upper              . "[[:upper:]]") ; SRE
    (upper-case         . upper)         ; SRE
    (word               . "[[:word:]]")  ; inconsistent with SRE
    (wordchar           . word)          ; sregex
    (not-wordchar       . "\\W"))
  "Alist of sexp form regexp constituents.
Each element of the alist has the form (SYMBOL . DEFN).
SYMBOL is a valid constituent of sexp regular expressions.
If DEFN is a string, SYMBOL is translated into DEFN.
If DEFN is a symbol, use the definition of DEFN, recursively.
Otherwise, DEFN must be a list (FUNCTION MIN-ARGS MAX-ARGS PREDICATE).
FUNCTION is used to produce code for SYMBOL.  MIN-ARGS and MAX-ARGS
are the minimum and maximum number of arguments the function-form
sexp constituent SYMBOL may have in sexp regular expressions.
MAX-ARGS nil means no limit.  PREDICATE, if specified, means that
all arguments must satisfy PREDICATE.")
 219
 220
;; Lookup table used by the function `rx-syntax': the symbol given in a
;; `(syntax SYMBOL)' form is mapped to the character that follows `\s'
;; in the generated regexp.
(defconst rx-syntax
  '((whitespace         . ?-)
    (punctuation        . ?.)
    (word               . ?w)
    (symbol             . ?_)
    (open-parenthesis   . ?\()
    (close-parenthesis  . ?\))
    (expression-prefix  . ?\')
    (string-quote       . ?\")
    (paired-delimiter   . ?$)
    (escape             . ?\\)
    (character-quote    . ?/)
    (comment-start      . ?<)
    (comment-end        . ?>)
    (string-delimiter   . ?|)
    (comment-delimiter  . ?!))
  "Alist mapping Rx syntax symbols to syntax characters.
Each entry has the form (SYMBOL . CHAR), where SYMBOL is a valid
symbol in `(syntax SYMBOL)', and CHAR is the syntax character
corresponding to SYMBOL, as it would be used with \\s or \\S in
regular expressions.")
 242
 243
 244 (defconst rx-categories
 245   '((consonant                  . ?0)
 246     (base-vowel                 . ?1)
 247     (upper-diacritical-mark     . ?2)
 248     (lower-diacritical-mark     . ?3)
 249     (tone-mark                  . ?4)
 250     (symbol                     . ?5)
 251     (digit                      . ?6)
 252     (vowel-modifying-diacritical-mark . ?7)
 253     (vowel-sign                 . ?8)
 254     (semivowel-lower            . ?9)
 255     (not-at-end-of-line         . ?<)
 256     (not-at-beginning-of-line   . ?>)
 257     (alpha-numeric-two-byte     . ?A)
 258     (chinse-two-byte            . ?C)
 259     (greek-two-byte             . ?G)
 260     (japanese-hiragana-two-byte . ?H)
 261     (indian-two-byte            . ?I)
 262     (japanese-katakana-two-byte . ?K)
 263     (korean-hangul-two-byte     . ?N)
 264     (cyrillic-two-byte          . ?Y)
 265     (combining-diacritic        . ?^)
 266     (ascii                      . ?a)
 267     (arabic                     . ?b)
 268     (chinese                    . ?c)
 269     (ethiopic                   . ?e)
 270     (greek                      . ?g)
 271     (korean                     . ?h)
 272     (indian                     . ?i)
 273     (japanese                   . ?j)
 274     (japanese-katakana          . ?k)
 275     (latin                      . ?l)
 276     (lao                        . ?o)
 277     (tibetan                    . ?q)
 278     (japanese-roman             . ?r)
 279     (thai                       . ?t)
 280     (vietnamese                 . ?v)
 281     (hebrew                     . ?w)
 282     (cyrillic                   . ?y)
 283     (can-break                  . ?|))
 284   "Alist mapping symbols to category characters.
 285 Each entry has the form (SYMBOL . CHAR), where SYMBOL is a valid
 286 symbol in `(category SYMBOL)', and CHAR is the category character
 287 corresponding to SYMBOL, as it would be used with `\\c' or `\\C' in
 288 regular expression strings.")
 289
 290
(defvar rx-greedy-flag t
  "Non-nil means produce greedy regular expressions.
This affects the `zero-or-one', `zero-or-more', and `one-or-more'
forms.  Dynamically bound.")
 294
 295
 296 (defun rx-info (op head)
 297   "Return parsing/code generation info for OP.
 298 If OP is the space character ASCII 32, return info for the symbol `?'.
 299 If OP is the character `?', return info for the symbol `??'.
 300 See also `rx-constituents'.
 301 If HEAD is non-nil, then OP is the head of a sexp, otherwise it's
 302 a standalone symbol."
 303   (cond ((eq op ? ) (setq op '\?))
 304         ((eq op ??) (setq op '\??)))
 305   (let (old-op)
 306     (while (and (not (null op)) (symbolp op))
 307       (setq old-op op)
 308       (setq op (cdr (assq op rx-constituents)))
 309       (when (if head (stringp op) (consp op))
 310         ;; We found something but of the wrong kind.  Let's look for an
 311         ;; alternate definition for the other case.
 312         (let ((new-op
 313                (cdr (assq old-op (cdr (memq (assq old-op rx-constituents)
 314                                             rx-constituents))))))
 315           (if (and new-op (not (if head (stringp new-op) (consp new-op))))
 316               (setq op new-op))))))
 317   op)
 318
 319
 320 (defun rx-check (form)
 321   "Check FORM according to its car's parsing info."
 322   (unless (listp form)
 323     (error "rx `%s' needs argument(s)" form))
 324   (let* ((rx (rx-info (car form) 'head))
 325          (nargs (1- (length form)))
 326          (min-args (nth 1 rx))
 327          (max-args (nth 2 rx))
 328          (type-pred (nth 3 rx)))
 329     (when (and (not (null min-args))
 330                (< nargs min-args))
 331       (error "rx form `%s' requires at least %d args"
 332              (car form) min-args))
 333     (when (and (not (null max-args))
 334                (> nargs max-args))
 335       (error "rx form `%s' accepts at most %d args"
 336              (car form) max-args))
 337     (when (not (null type-pred))
 338       (dolist (sub-form (cdr form))
 339         (unless (funcall type-pred sub-form)
 340           (error "rx form `%s' requires args satisfying `%s'"
 341                  (car form) type-pred))))))
 342
 343
 344 (defun rx-group-if (regexp group)
 345   "Put shy groups around REGEXP if seemingly necessary when GROUP
 346 is non-nil."
 347   (cond
 348    ;; for some repetition
 349    ((eq group '*) (if (rx-atomic-p regexp) (setq group nil)))
 350    ;; for concatenation
 351    ((eq group ':)
 352     (if (rx-atomic-p
 353          (if (string-match
 354               "\\(?:[?*+]\\??\\|\\\\{[0-9]*,?[0-9]*\\\\}\\)\\'" regexp)
 355              (substring regexp 0 (match-beginning 0))
 356            regexp))
 357         (setq group nil)))
 358    ;; for OR
 359    ((eq group '|) (setq group nil))
 360    ;; do anyway
 361    ((eq group t))
 362    ((rx-atomic-p regexp t) (setq group nil)))
 363   (if group
 364       (concat "\\(?:" regexp "\\)")
 365     regexp))
 366
 367
(defvar rx-parent)
;; Declared special without a default value; dynamically bound in some
;; functions (it is, for instance, the optional second argument of
;; `rx-form').
 370
 371
 372 (defun rx-and (form)
 373   "Parse and produce code from FORM.
 374 FORM is of the form `(and FORM1 ...)'."
 375   (rx-check form)
 376   (rx-group-if
 377    (mapconcat (lambda (x) (rx-form x ':)) (cdr form) nil)
 378    (and (memq rx-parent '(* t)) rx-parent)))
 379
 380
 381 (defun rx-or (form)
 382   "Parse and produce code from FORM, which is `(or FORM1 ...)'."
 383   (rx-check form)
 384   (rx-group-if
 385    (if (memq nil (mapcar 'stringp (cdr form)))
 386        (mapconcat (lambda (x) (rx-form x '|)) (cdr form) "\\|")
 387      (regexp-opt (cdr form)))
 388    (and (memq rx-parent '(: * t)) rx-parent)))
 389
 390
 391 (defun rx-anything (form)
 392   "Match any character."
 393   (if (consp form)
 394       (error "rx `anythng' syntax error: %s" form))
 395   (rx-or (list 'or 'not-newline ?\n)))
 396
 397
 398 (defun rx-any-delete-from-range (char ranges)
 399   "Delete by side effect character CHAR from RANGES.
 400 Only both edges of each range is checked."
 401   (let (m)
 402     (cond
 403      ((memq char ranges) (setq ranges (delq char ranges)))
 404      ((setq m (assq char ranges))
 405       (if (eq (1+ char) (cdr m))
 406           (setcar (memq m ranges) (1+ char))
 407         (setcar m (1+ char))))
 408      ((setq m (rassq char ranges))
 409       (if (eq (1- char) (car m))
 410           (setcar (memq m ranges) (1- char))
 411         (setcdr m (1- char)))))
 412     ranges))
 413
 414
 415 (defun rx-any-condense-range (args)
 416   "Condense by side effect ARGS as range for Rx `any'."
 417   (let (str
 418         l)
 419     ;; set STR list of all strings
 420     ;; set L list of all ranges
 421     (mapc (lambda (e) (cond ((stringp e) (push e str))
 422                             ((numberp e) (push (cons e e) l))
 423                             (t (push e l))))
 424           args)
 425     ;; condense overlapped ranges in L
 426     (let ((tail (setq l (sort l #'car-less-than-car)))
 427           d)
 428       (while (setq d (cdr tail))
 429         (if (>= (cdar tail) (1- (caar d)))
 430             (progn
 431               (setcdr (car tail) (max (cdar tail) (cdar d)))
 432               (setcdr tail (cdr d)))
 433           (setq tail d))))
 434     ;; Separate small ranges to single number, and delete dups.
 435     (nconc
 436      (apply #'nconc
 437             (mapcar (lambda (e)
 438                       (cond
 439                        ((= (car e) (cdr e)) (list (car e)))
 440                        ((= (1+ (car e)) (cdr e)) (list (car e) (cdr e)))
 441                        ((list e))))
 442                     l))
 443      (delete-dups str))))
 444
 445
 446 (defun rx-check-any-string (str)
 447   "Check string argument STR for Rx `any'."
 448   (let ((i 0)
 449         c1 c2 l)
 450     (if (= 0 (length str))
 451         (error "String arg for Rx `any' must not be empty"))
 452     (while (string-match ".-." str i)
 453       ;; string before range: convert it to characters
 454       (if (< i (match-beginning 0))
 455           (setq l (nconc
 456                    l
 457                    (append (substring str i (match-beginning 0)) nil))))
 458       ;; range
 459       (setq i (match-end 0)
 460             c1 (aref str (match-beginning 0))
 461             c2 (aref str (1- i)))
 462       (cond
 463        ((< c1 c2) (setq l (nconc l (list (cons c1 c2)))))
 464        ((= c1 c2) (setq l (nconc l (list c1))))))
 465     ;; rest?
 466     (if (< i (length str))
 467         (setq l (nconc l (append (substring str i) nil))))
 468     l))
 469
 470
 471 (defun rx-check-any (arg)
 472    "Check arg ARG for Rx `any'."
 473    (cond
 474     ((integerp arg) (list arg))
 475     ((symbolp arg)
 476      (let ((translation (condition-case nil
 477                             (rx-form arg)
 478                           (error nil))))
 479        (if (or (null translation)
 480                (null (string-match "\\`\\[\\[:[-a-z]+:\\]\\]\\'" translation)))
 481            (error "Invalid char class `%s' in Rx `any'" arg))
 482        (list (substring translation 1 -1)))) ; strip outer brackets
 483     ((and (integerp (car-safe arg)) (integerp (cdr-safe arg)))
 484      (list arg))
 485     ((stringp arg) (rx-check-any-string arg))
 486     ((error
 487       "rx `any' requires string, character, char pair or char class args"))))
 488
 489
(defun rx-any (form)
  "Parse and produce code from FORM, which is `(any ARG ...)'.
ARG is optional."
  ;; NOTE(review): the docstring calls ARG optional, but the `any' entry
  ;; in `rx-constituents' declares a minimum of one argument, which
  ;; `rx-check' enforces.
  (rx-check form)
  ;; ARGS is the flattened, condensed argument list: characters,
  ;; (FROM . TO) ranges and character-class strings.  M holds a range
  ;; found by `assq'/`rassq'; S holds the quoted single character.
  (let* ((args (rx-any-condense-range
                (apply
                 #'nconc
                 (mapcar #'rx-check-any (cdr form)))))
         m
         s)
    ;; Reorder ARGS so that `]' comes first, `-' comes first or last,
    ;; and `^' does not come first.  Only the first matching clause is
    ;; applied.
    (cond
     ;; single close bracket
     ;;  => "[]...-]" or "[]...--.]"
     ((memq ?\] args)
      ;; set ] at the beginning
      (setq args (cons ?\] (delq ?\] args)))
      ;; set - at the end
      (if (or (memq ?- args) (assq ?- args))
          (setq args (nconc (rx-any-delete-from-range ?- args)
                            (list ?-)))))
     ;; close bracket starts a range
     ;;  => "[]-....-]" or "[]-.--....]"
     ((setq m (assq ?\] args))
      ;; bring it to the beginning
      (setq args (cons m (delq m args)))
      (cond ((memq ?- args)
             ;; to the end
             (setq args (nconc (delq ?- args) (list ?-))))
            ((setq m (assq ?- args))
             ;; next to the bracket's range, make the second range
             (setcdr args (cons m (delq m args))))))
     ;; bracket in the end range
     ;;  => "[]...-]"
     ((setq m (rassq ?\] args))
      ;; set ] at the beginning
      (setq args (cons ?\] (rx-any-delete-from-range ?\] args)))
      ;; set - at the end
      (if (or (memq ?- args) (assq ?- args))
          (setq args (nconc (rx-any-delete-from-range ?- args)
                            (list ?-)))))
     ;; {no close bracket appears}
     ;;
     ;; bring single bar to the beginning
     ((memq ?- args)
      (setq args (cons ?- (delq ?- args))))
     ;; bar start a range, bring it to the beginning
     ((setq m (assq ?- args))
      (setq args (cons m (delq m args))))
     ;;
     ;; hat at the beginning?
     ;; If so, swap it with the second element, or, when it is the
     ;; only element, strip `^' from it and append `^' at the end.
     ((or (eq (car args) ?^) (eq (car-safe (car args)) ?^))
      (setq args (if (cdr args)
                     `(,(cadr args) ,(car args) ,@(cddr args))
                   (nconc (rx-any-delete-from-range ?^ args)
                          (list ?^))))))
    ;; some 1-char?
    ;; A lone character is returned quoted, without brackets, when
    ;; quoting leaves it one character long, or when it is `^' and the
    ;; parent form is not `not' (`rx-parent' is not `!').
    (if (and (null (cdr args)) (numberp (car args))
             (or (= 1 (length
                       (setq s (regexp-quote (string (car args))))))
                 (and (equal (car args) ?^) ;; unnecessary predicate?
                      (null (eq rx-parent '!)))))
        s
      ;; Otherwise build a bracket expression from ARGS in order.
      (concat "["
              (mapconcat
               (lambda (e) (cond
                            ((numberp e) (string e))
                            ((consp e)
                             (if (and (= (1+ (car e)) (cdr e))
                                      ;; rx-any-condense-range should
                                      ;; prevent this case from happening.
                                      (null (memq (car e) '(?\] ?-)))
                                      (null (memq (cdr e) '(?\] ?-))))
                                 (string (car e) (cdr e))
                               (string (car e) ?- (cdr e))))
                            ;; A character-class string is inserted as is.
                            (e)))
               args
               nil)
              "]"))))
 568
 569
 570 (defun rx-check-not (arg)
 571   "Check arg ARG for Rx `not'."
 572   (unless (or (and (symbolp arg)
 573                    (string-match "\\`\\[\\[:[-a-z]+:\\]\\]\\'"
 574                                  (condition-case nil
 575                                      (rx-form arg)
 576                                    (error ""))))
 577               (eq arg 'word-boundary)
 578               (and (consp arg)
 579                    (memq (car arg) '(not any in syntax category))))
 580     (error "rx `not' syntax error: %s" arg))
 581   t)
 582
 583
 584 (defun rx-not (form)
 585   "Parse and produce code from FORM.  FORM is `(not ...)'."
 586   (rx-check form)
 587   (let ((result (rx-form (cadr form) '!))
 588         case-fold-search)
 589     (cond ((string-match "\\`\\[^" result)
 590            (cond
 591             ((equal result "[^]") "[^^]")
 592             ((and (= (length result) 4) (null (eq rx-parent '!)))
 593              (regexp-quote (substring result 2 3)))
 594             ((concat "[" (substring result 2)))))
 595           ((eq ?\[ (aref result 0))
 596            (concat "[^" (substring result 1)))
 597           ((string-match "\\`\\\\[scbw]" result)
 598            (concat (upcase (substring result 0 2))
 599                    (substring result 2)))
 600           ((string-match "\\`\\\\[SCBW]" result)
 601            (concat (downcase (substring result 0 2))
 602                    (substring result 2)))
 603           (t
 604            (concat "[^" result "]")))))
 605
 606
 607 (defun rx-not-char (form)
 608   "Parse and produce code from FORM.  FORM is `(not-char ...)'."
 609   (rx-check form)
 610   (rx-not `(not (in ,@(cdr form)))))
 611
 612
 613 (defun rx-not-syntax (form)
 614   "Parse and produce code from FORM.  FORM is `(not-syntax SYNTAX)'."
 615   (rx-check form)
 616   (rx-not `(not (syntax ,@(cdr form)))))
 617
 618
 619 (defun rx-trans-forms (form &optional skip)
 620   "If FORM's length is greater than two, transform it to length two.
 621 A form (HEAD REST ...) becomes (HEAD (and REST ...)).
 622 If SKIP is non-nil, allow that number of items after the head, i.e.
 623 `(= N REST ...)' becomes `(= N (and REST ...))' if SKIP is 1."
 624   (unless skip (setq skip 0))
 625   (let ((tail (nthcdr (1+ skip) form)))
 626     (if (= (length tail) 1)
 627         form
 628       (let ((form (copy-sequence form)))
 629         (setcdr (nthcdr skip form) (list (cons 'and tail)))
 630         form))))
 631
 632
 633 (defun rx-= (form)
 634   "Parse and produce code from FORM `(= N ...)'."
 635   (rx-check form)
 636   (setq form (rx-trans-forms form 1))
 637   (unless (and (integerp (nth 1 form))
 638                (> (nth 1 form) 0))
 639     (error "rx `=' requires positive integer first arg"))
 640   (format "%s\\{%d\\}" (rx-form (nth 2 form) '*) (nth 1 form)))
 641
 642
 643 (defun rx->= (form)
 644   "Parse and produce code from FORM `(>= N ...)'."
 645   (rx-check form)
 646   (setq form (rx-trans-forms form 1))
 647   (unless (and (integerp (nth 1 form))
 648                (> (nth 1 form) 0))
 649     (error "rx `>=' requires positive integer first arg"))
 650   (format "%s\\{%d,\\}" (rx-form (nth 2 form) '*) (nth 1 form)))
 651
 652
 653 (defun rx-** (form)
 654   "Parse and produce code from FORM `(** N M ...)'."
 655   (rx-check form)
 656   (rx-form (cons 'repeat (cdr (rx-trans-forms form 2))) '*))
 657
 658
 659 (defun rx-repeat (form)
 660   "Parse and produce code from FORM.
 661 FORM is either `(repeat N FORM1)' or `(repeat N M FORMS...)'."
 662   (rx-check form)
 663   (if (> (length form) 4)
 664       (setq form (rx-trans-forms form 2)))
 665   (if (null (nth 2 form))
 666       (setq form (cons (nth 0 form) (cons (nth 1 form) (nthcdr 3 form)))))
 667   (cond ((= (length form) 3)
 668          (unless (and (integerp (nth 1 form))
 669                       (> (nth 1 form) 0))
 670            (error "rx `repeat' requires positive integer first arg"))
 671          (format "%s\\{%d\\}" (rx-form (nth 2 form) '*) (nth 1 form)))
 672         ((or (not (integerp (nth 2 form)))
 673              (< (nth 2 form) 0)
 674              (not (integerp (nth 1 form)))
 675              (< (nth 1 form) 0)
 676              (< (nth 2 form) (nth 1 form)))
 677          (error "rx `repeat' range error"))
 678         (t
 679          (format "%s\\{%d,%d\\}" (rx-form (nth 3 form) '*)
 680                  (nth 1 form) (nth 2 form)))))
 681
 682
 683 (defun rx-submatch (form)
 684   "Parse and produce code from FORM, which is `(submatch ...)'."
 685   (concat "\\("
 686           (if (= 2 (length form))
 687               ;; Only one sub-form.
 688               (rx-form (cadr form))
 689             ;; Several sub-forms implicitly concatenated.
 690             (mapconcat (lambda (re) (rx-form re ':)) (cdr form) nil))
 691           "\\)"))
 692
 693
 694 (defun rx-backref (form)
 695   "Parse and produce code from FORM, which is `(backref N)'."
 696   (rx-check form)
 697   (format "\\%d" (nth 1 form)))
 698
 699 (defun rx-check-backref (arg)
 700   "Check arg ARG for Rx `backref'."
 701   (or (and (integerp arg) (>= arg 1) (<= arg 9))
 702       (error "rx `backref' requires numeric 1<=arg<=9: %s" arg)))
 703
 704 (defun rx-kleene (form)
 705   "Parse and produce code from FORM.
 706 FORM is `(OP FORM1)', where OP is one of the `zero-or-one',
 707 `zero-or-more' etc.  operators.
 708 If OP is one of `*', `+', `?', produce a greedy regexp.
 709 If OP is one of `*?', `+?', `??', produce a non-greedy regexp.
 710 If OP is anything else, produce a greedy regexp if `rx-greedy-flag'
 711 is non-nil."
 712   (rx-check form)
 713   (setq form (rx-trans-forms form))
 714   (let ((suffix (cond ((memq (car form) '(* + ?\s)) "")
 715                       ((memq (car form) '(*? +? ??)) "?")
 716                       (rx-greedy-flag "")
 717                       (t "?")))
 718         (op (cond ((memq (car form) '(* *? 0+ zero-or-more)) "*")
 719                   ((memq (car form) '(+ +? 1+ one-or-more))  "+")
 720                   (t "?"))))
 721     (rx-group-if
 722      (concat (rx-form (cadr form) '*) op suffix)
 723      (and (memq rx-parent '(t *)) rx-parent))))
 724
 725
 726 (defun rx-atomic-p (r &optional lax)
 727   "Return non-nil if regexp string R is atomic.
 728 An atomic regexp R is one such that a suffix operator
 729 appended to R will apply to all of R.  For example, \"a\"
 730 \"[abc]\" and \"\\(ab\\|ab*c\\)\" are atomic and \"ab\",
 731 \"[ab]c\", and \"ab\\|ab*c\" are not atomic.
 732
 733 This function may return false negatives, but it will not
 734 return false positives.  It is nevertheless useful in
 735 situations where an efficiency shortcut can be taken only if a
 736 regexp is atomic.  The function can be improved to detect
 737 more cases of atomic regexps.  Presently, this function
 738 detects the following categories of atomic regexp;
 739
 740   a group or shy group:  \\(...\\)
 741   a character class:     [...]
 742   a single character:    a
 743
 744 On the other hand, false negatives will be returned for
 745 regexps that are atomic but end in operators, such as
 746 \"a+\".  I think these are rare.  Probably such cases could
 747 be detected without much effort.  A guarantee of no false
 748 negatives would require a theoretic specification of the set
 749 of all atomic regexps."
 750   (let ((l (length r)))
 751     (cond
 752      ((<= l 1))
 753      ((= l 2) (= (aref r 0) ?\\))
 754      ((= l 3) (string-match "\\`\\(?:\\\\[cCsS_]\\|\\[[^^]\\]\\)" r))
 755      ((null lax)
 756       (cond
 757        ((string-match "\\`\\[^?\]?\\(?:\\[:[a-z]+:]\\|[^\]]\\)*\\]\\'" r))
 758        ((string-match "\\`\\\\(\\(?:[^\\]\\|\\\\[^\)]\\)*\\\\)\\'" r)))))))
 759
 760
 761 (defun rx-syntax (form)
 762   "Parse and produce code from FORM, which is `(syntax SYMBOL)'."
 763   (rx-check form)
 764   (let* ((sym (cadr form))
 765          (syntax (cdr (assq sym rx-syntax))))
 766     (unless syntax
 767       ;; Try sregex compatibility.
 768       (cond
 769        ((characterp sym) (setq syntax sym))
 770        ((symbolp sym)
 771         (let ((name (symbol-name sym)))
 772           (if (= 1 (length name))
 773               (setq syntax (aref name 0))))))
 774       (unless syntax
 775         (error "Unknown rx syntax `%s'" sym)))
 776     (format "\\s%c" syntax)))
 777
 778
 779 (defun rx-check-category (form)
 780   "Check the argument FORM of a `(category FORM)'."
 781   (unless (or (integerp form)
 782               (cdr (assq form rx-categories)))
 783     (error "Unknown category `%s'" form))
 784   t)
 785
 786
 787 (defun rx-category (form)
 788   "Parse and produce code from FORM, which is `(category SYMBOL)'."
 789   (rx-check form)
 790   (let ((char (if (integerp (cadr form))
 791                   (cadr form)
 792                 (cdr (assq (cadr form) rx-categories)))))
 793     (format "\\c%c" char)))
 794
 795
(defun rx-eval (form)
  "Parse and produce code from FORM, which is `(eval FORM)'.
The inner form is evaluated with `eval', and its value is then
translated by `rx-form' using the current value of `rx-parent'."
  (rx-check form)
  ;; The value of the evaluated form is itself handed to `rx-form',
  ;; so it is interpreted as an rx form (string, character, symbol
  ;; or list) rather than inserted verbatim.
  (rx-form (eval (cadr form)) rx-parent))
 800
 801
(defun rx-greedy (form)
  "Parse and produce code from FORM.
If FORM is `(minimal-match FORM1)', non-greedy versions of `*',
`+', and `?' operators will be used in FORM1.  If FORM is
`(maximal-match FORM1)', greedy operators will be used."
  (rx-check form)
  ;; `rx-greedy-flag' is rebound only while FORM1 is translated:
  ;; non-nil when the head of FORM is `maximal-match', nil otherwise.
  (let ((rx-greedy-flag (eq (car form) 'maximal-match)))
    (rx-form (cadr form) rx-parent)))
 810
 811
 812 (defun rx-regexp (form)
 813   "Parse and produce code from FORM, which is `(regexp STRING)'."
 814   (rx-check form)
 815   (rx-group-if (cadr form) rx-parent))
 816
 817
 818 (defun rx-form (form &optional rx-parent)
 819   "Parse and produce code for regular expression FORM.
 820 FORM is a regular expression in sexp form.
 821 RX-PARENT shows which type of expression calls and controls putting of
 822 shy groups around the result and some more in other functions."
 823   (if (stringp form)
 824       (rx-group-if (regexp-quote form)
 825                    (if (and (eq rx-parent '*) (< 1 (length form)))
 826                        rx-parent))
 827     (cond ((integerp form)
 828            (regexp-quote (char-to-string form)))
 829           ((symbolp form)
 830            (let ((info (rx-info form nil)))
 831              (cond ((stringp info)
 832                     info)
 833                    ((null info)
 834                     (error "Unknown rx form `%s'" form))
 835                    (t
 836                     (funcall (nth 0 info) form)))))
 837           ((consp form)
 838            (let ((info (rx-info (car form) 'head)))
 839              (unless (consp info)
 840                (error "Unknown rx form `%s'" (car form)))
 841              (funcall (nth 0 info) form)))
 842           (t
 843            (error "rx syntax error at `%s'" form)))))
 844
 845
 846 ;;;###autoload
 847 (defun rx-to-string (form &optional no-group)
 848   "Parse and produce code for regular expression FORM.
 849 FORM is a regular expression in sexp form.
 850 NO-GROUP non-nil means don't put shy groups around the result."
 851   (rx-group-if (rx-form form) (null no-group)))
 852
 853
 854 ;;;###autoload
(defmacro rx (&rest regexps)
  "Translate regular expressions REGEXPS in sexp form to a regexp string.
REGEXPS is a non-empty sequence of forms of the sort listed below.

Note that `rx' is a Lisp macro; when used in a Lisp program being
compiled, the translation is performed by the compiler.
See `rx-to-string' for how to do such a translation at run-time.

The following are valid subforms of regular expressions in sexp
notation.

STRING
     matches string STRING literally.

CHAR
     matches character CHAR literally.

`not-newline', `nonl'
     matches any character except a newline.

`anything'
     matches any character.

`(any SET ...)'
`(in SET ...)'
`(char SET ...)'
     matches any character in SET ....  SET may be a character or string.
     Ranges of characters can be specified as `A-Z' in strings.
     Ranges may also be specified as conses like `(?A . ?Z)'.

     SET may also be the name of a character class: `digit',
     `control', `hex-digit', `blank', `graph', `print', `alnum',
     `alpha', `ascii', `nonascii', `lower', `punct', `space', `upper',
     `word', or one of their synonyms.

`(not (any SET ...))'
     matches any character not in SET ...

`line-start', `bol'
     matches the empty string, but only at the beginning of a line
     in the text being matched.

`line-end', `eol'
     is similar to `line-start' but matches only at the end of a line.

`string-start', `bos', `bot'
     matches the empty string, but only at the beginning of the
     string being matched against.

`string-end', `eos', `eot'
     matches the empty string, but only at the end of the
     string being matched against.

`buffer-start'
     matches the empty string, but only at the beginning of the
     buffer being matched against.  Actually equivalent to `string-start'.

`buffer-end'
     matches the empty string, but only at the end of the
     buffer being matched against.  Actually equivalent to `string-end'.

`point'
     matches the empty string, but only at point.

`word-start', `bow'
     matches the empty string, but only at the beginning of a word.

`word-end', `eow'
     matches the empty string, but only at the end of a word.

`word-boundary'
     matches the empty string, but only at the beginning or end of a
     word.

`(not word-boundary)'
`not-word-boundary'
     matches the empty string, but not at the beginning or end of a
     word.

`symbol-start'
     matches the empty string, but only at the beginning of a symbol.

`symbol-end'
     matches the empty string, but only at the end of a symbol.

`digit', `numeric', `num'
     matches 0 through 9.

`control', `cntrl'
     matches ASCII control characters.

`hex-digit', `hex', `xdigit'
     matches 0 through 9, a through f and A through F.

`blank'
     matches space and tab only.

`graphic', `graph'
     matches graphic characters--everything except ASCII control chars,
     space, and DEL.

`printing', `print'
     matches printing characters--everything except ASCII control chars
     and DEL.

`alphanumeric', `alnum'
     matches letters and digits.  (But at present, for multibyte characters,
     it matches anything that has word syntax.)

`letter', `alphabetic', `alpha'
     matches letters.  (But at present, for multibyte characters,
     it matches anything that has word syntax.)

`ascii'
     matches ASCII (unibyte) characters.

`nonascii'
     matches non-ASCII (multibyte) characters.

`lower', `lower-case'
     matches anything lower-case.

`upper', `upper-case'
     matches anything upper-case.

`punctuation', `punct'
     matches punctuation.  (But at present, for multibyte characters,
     it matches anything that has non-word syntax.)

`space', `whitespace', `white'
     matches anything that has whitespace syntax.

`word', `wordchar'
     matches anything that has word syntax.

`not-wordchar'
     matches anything that has non-word syntax.

`(syntax SYNTAX)'
     matches a character with syntax SYNTAX.  SYNTAX must be one
     of the following symbols, or a symbol corresponding to the syntax
     character, e.g. `\\.' for `\\s.'.

     `whitespace'               (\\s- in string notation)
     `punctuation'              (\\s.)
     `word'                     (\\sw)
     `symbol'                   (\\s_)
     `open-parenthesis'         (\\s()
     `close-parenthesis'        (\\s))
     `expression-prefix'        (\\s')
     `string-quote'             (\\s\")
     `paired-delimiter'         (\\s$)
     `escape'                   (\\s\\)
     `character-quote'          (\\s/)
     `comment-start'            (\\s<)
     `comment-end'              (\\s>)
     `string-delimiter'         (\\s|)
     `comment-delimiter'        (\\s!)

`(not (syntax SYNTAX))'
     matches a character that doesn't have syntax SYNTAX.

`(category CATEGORY)'
     matches a character with category CATEGORY.  CATEGORY must be
     either a character to use for C, or one of the following symbols.

     `consonant'                        (\\c0 in string notation)
     `base-vowel'                       (\\c1)
     `upper-diacritical-mark'           (\\c2)
     `lower-diacritical-mark'           (\\c3)
     `tone-mark'                        (\\c4)
     `symbol'                           (\\c5)
     `digit'                            (\\c6)
     `vowel-modifying-diacritical-mark' (\\c7)
     `vowel-sign'                       (\\c8)
     `semivowel-lower'                  (\\c9)
     `not-at-end-of-line'               (\\c<)
     `not-at-beginning-of-line'         (\\c>)
     `alpha-numeric-two-byte'           (\\cA)
     `chinse-two-byte'                  (\\cC)
     `greek-two-byte'                   (\\cG)
     `japanese-hiragana-two-byte'       (\\cH)
     `indian-tow-byte'                  (\\cI)
     `japanese-katakana-two-byte'       (\\cK)
     `korean-hangul-two-byte'           (\\cN)
     `cyrillic-two-byte'                (\\cY)
     `combining-diacritic'              (\\c^)
     `ascii'                            (\\ca)
     `arabic'                           (\\cb)
     `chinese'                          (\\cc)
     `ethiopic'                         (\\ce)
     `greek'                            (\\cg)
     `korean'                           (\\ch)
     `indian'                           (\\ci)
     `japanese'                         (\\cj)
     `japanese-katakana'                (\\ck)
     `latin'                            (\\cl)
     `lao'                              (\\co)
     `tibetan'                          (\\cq)
     `japanese-roman'                   (\\cr)
     `thai'                             (\\ct)
     `vietnamese'                       (\\cv)
     `hebrew'                           (\\cw)
     `cyrillic'                         (\\cy)
     `can-break'                        (\\c|)

`(not (category CATEGORY))'
     matches a character that doesn't have category CATEGORY.

`(and SEXP1 SEXP2 ...)'
`(: SEXP1 SEXP2 ...)'
`(seq SEXP1 SEXP2 ...)'
`(sequence SEXP1 SEXP2 ...)'
     matches what SEXP1 matches, followed by what SEXP2 matches, etc.

`(submatch SEXP1 SEXP2 ...)'
`(group SEXP1 SEXP2 ...)'
     like `and', but makes the match accessible with `match-end',
     `match-beginning', and `match-string'.

`(or SEXP1 SEXP2 ...)'
`(| SEXP1 SEXP2 ...)'
     matches anything that matches SEXP1 or SEXP2, etc.  If all
     args are strings, use `regexp-opt' to optimize the resulting
     regular expression.

`(minimal-match SEXP)'
     produce a non-greedy regexp for SEXP.  Normally, regexps matching
     zero or more occurrences of something are \"greedy\" in that they
     match as much as they can, as long as the overall regexp can
     still match.  A non-greedy regexp matches as little as possible.

`(maximal-match SEXP)'
     produce a greedy regexp for SEXP.  This is the default.

Below, `SEXP ...' represents a sequence of regexp forms, treated as if
enclosed in `(and ...)'.

`(zero-or-more SEXP ...)'
`(0+ SEXP ...)'
     matches zero or more occurrences of what SEXP ... matches.

`(* SEXP ...)'
     like `zero-or-more', but always produces a greedy regexp, independent
     of `rx-greedy-flag'.

`(*? SEXP ...)'
     like `zero-or-more', but always produces a non-greedy regexp,
     independent of `rx-greedy-flag'.

`(one-or-more SEXP ...)'
`(1+ SEXP ...)'
     matches one or more occurrences of SEXP ...

`(+ SEXP ...)'
     like `one-or-more', but always produces a greedy regexp.

`(+? SEXP ...)'
     like `one-or-more', but always produces a non-greedy regexp.

`(zero-or-one SEXP ...)'
`(optional SEXP ...)'
`(opt SEXP ...)'
     matches zero or one occurrences of SEXP ...

`(? SEXP ...)'
     like `zero-or-one', but always produces a greedy regexp.

`(?? SEXP ...)'
     like `zero-or-one', but always produces a non-greedy regexp.

`(repeat N SEXP)'
`(= N SEXP ...)'
     matches N occurrences.

`(>= N SEXP ...)'
     matches N or more occurrences.

`(repeat N M SEXP)'
`(** N M SEXP ...)'
     matches N to M occurrences.

`(backref N)'
     matches what was matched previously by submatch N.

`(eval FORM)'
     evaluate FORM and insert result.  If result is a string,
     `regexp-quote' it.

`(regexp REGEXP)'
     include REGEXP in string notation in the result."
  ;; The translation runs while the macro is expanded; the value
  ;; returned by `rx-to-string' becomes the expansion.
  (cond ((null regexps)
         (error "No regexp"))
        ;; Several forms: translate them as one implicit `and' sequence.
        ((cdr regexps)
         (rx-to-string `(and ,@regexps) t))
        ;; Exactly one form: translate it directly.
        (t
         (rx-to-string (car regexps) t))))
1152 \f
1153 ;; ;; sregex.el replacement
1154
1155 ;; ;;;###autoload (provide 'sregex)
1156 ;; ;;;###autoload (autoload 'sregex "rx")
1157 ;; (defalias 'sregex 'rx-to-string)
1158 ;; ;;;###autoload (autoload 'sregexq "rx" nil nil 'macro)
1159 ;; (defalias 'sregexq 'rx)
1160 \f
1161 (provide 'rx)
1162
1163 ;;; rx.el ends here