lisp/emacs-lisp/rx.el

   1 ;;; rx.el --- sexp notation for regular expressions
   2
   3 ;; Copyright (C) 2001, 2002, 2003, 2004, 2005,
   4 ;;   2006 Free Software Foundation, Inc.
   5
   6 ;; Author: Gerd Moellmann <gerd@gnu.org>
   7 ;; Maintainer: FSF
   8 ;; Keywords: strings, regexps, extensions
   9
  10 ;; This file is part of GNU Emacs.
  11
  12 ;; GNU Emacs is free software; you can redistribute it and/or modify
  13 ;; it under the terms of the GNU General Public License as published by
  14 ;; the Free Software Foundation; either version 2, or (at your option)
  15 ;; any later version.
  16
  17 ;; GNU Emacs is distributed in the hope that it will be useful,
  18 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 ;; GNU General Public License for more details.
  21
  22 ;; You should have received a copy of the GNU General Public License
  23 ;; along with GNU Emacs; see the file COPYING.  If not, write to the
  24 ;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  25 ;; Boston, MA 02110-1301, USA.
  26
  27 ;;; Commentary:
  28
  29 ;; This is another implementation of sexp-form regular expressions.
  30 ;; It was unfortunately written without being aware of the Sregex
  31 ;; package coming with Emacs, but as things stand, Rx completely
  32 ;; covers all regexp features, which Sregex doesn't, doesn't suffer
  33 ;; from the bugs mentioned in the commentary section of Sregex, and
  34 ;; uses a nicer syntax (IMHO, of course :-).
  35
  36 ;; This significantly extended version of the original is almost
  37 ;; compatible with Sregex.  The only incompatibility I (fx) know of is
  38 ;; that the `repeat' form can't have multiple regexp args.
  39
  40 ;; Now alternative forms are provided for a degree of compatibility
  41 ;; with Shivers' attempted definitive SRE notation
  42 ;; <URL:http://www.ai.mit.edu/~/shivers/sre.txt>.  SRE forms not
  43 ;; catered for include: dsm, uncase, w/case, w/nocase, ,@<exp>,
  44 ;; ,<exp>, (word ...), word+, posix-string, and character class forms.
  45 ;; Some forms are inconsistent with SRE, either for historical reasons
  46 ;; or because of the implementation -- simple translation into Emacs
  47 ;; regexp strings.  These include: any, word.  Also, case-sensitivity
  48 ;; and greediness are controlled by variables external to the regexp,
  49 ;; and you need to feed the forms to the `posix-' functions to get
  50 ;; SRE's POSIX semantics.  There are probably more difficulties.
  51
  52 ;; Rx translates a sexp notation for regular expressions into the
  53 ;; usual string notation.  The translation can be done at compile-time
  54 ;; by using the `rx' macro.  It can be done at run-time by calling
  55 ;; function `rx-to-string'.  See the documentation of `rx' for a
  56 ;; complete description of the sexp notation.
  57 ;;
  58 ;; Some examples of string regexps and their sexp counterparts:
  59 ;;
  60 ;; "^[a-z]*"
  61 ;; (rx (and line-start (0+ (in "a-z"))))
  62 ;;
  63 ;; "\n[^ \t]"
  64 ;; (rx (and "\n" (not blank))), or
  65 ;; (rx (and "\n" (not (any " \t"))))
  66 ;;
  67 ;; "\\*\\*\\* EOOH \\*\\*\\*\n"
  68 ;; (rx "*** EOOH ***\n")
  69 ;;
  70 ;; "\\<\\(catch\\|finally\\)\\>[^_]"
  71 ;; (rx (and word-start (submatch (or "catch" "finally")) word-end
  72 ;;          (not (any ?_))))
  73 ;;
  74 ;; "[ \t\n]*:\\([^:]+\\|$\\)"
  75 ;; (rx (and (zero-or-more (in " \t\n")) ":"
  76 ;;          (submatch (or line-end (one-or-more (not (any ?:)))))))
  77 ;;
  78 ;; "^content-transfer-encoding:\\(\n?[\t ]\\)*quoted-printable\\(\n?[\t ]\\)*"
  79 ;; (rx (and line-start
  80 ;;          "content-transfer-encoding:"
  81 ;;          (* (submatch (? ?\n) (any "\t ")))
  82 ;;          "quoted-printable"
  83 ;;          (* (submatch (? ?\n) (any "\t ")))))
  84 ;;
  85 ;; (concat "^\\(?:" something-else "\\)")
  86 ;; (rx (and line-start (eval something-else))), statically or
  87 ;; (rx-to-string `(and line-start ,something-else)), dynamically.
  88 ;;
  89 ;; (regexp-opt '(STRING1 STRING2 ...))
  90 ;; (rx (or STRING1 STRING2 ...)), or in other words, `or' automatically
  91 ;; calls `regexp-opt' as needed.
  92 ;;
  93 ;; "^;;\\s-*\n\\|^\n"
  94 ;; (rx (or (and line-start ";;" (0+ space) ?\n)
  95 ;;         (and line-start ?\n)))
  96 ;;
  97 ;; "\\$[I]d: [^ ]+ \\([^ ]+\\) "
  98 ;; (rx (and "$Id: "
  99 ;;          (1+ (not (in " ")))
 100 ;;          " "
 101 ;;          (submatch (1+ (not (in " "))))
 102 ;;          " "))
 103 ;;
 104 ;; "\\\\\\\\\\[\\w+"
 105 ;; (rx (and ?\\ ?\\ ?\[ (1+ word)))
 106 ;;
 107 ;; etc.
 108
 109 ;;; History:
 110 ;;
 111
 112 ;;; Code:
 113
 114 (defconst rx-constituents
 115   '((and                . (rx-and 1 nil))
 116     (seq                . and)          ; SRE
 117     (:                  . and)          ; SRE
 118     (sequence           . and)          ; sregex
 119     (or                 . (rx-or 1 nil))
 120     (|                  . or)           ; SRE
 121     (not-newline        . ".")
 122     (nonl               . not-newline)  ; SRE
 123     (anything           . ".\\|\n")
 124     (any                . (rx-any 1 nil rx-check-any)) ; inconsistent with SRE
 125     (in                 . any)
 126     (char               . any)          ; sregex
 127     (not-char           . (rx-not-char 1 nil rx-check-any)) ; sregex
 128     (not                . (rx-not 1 1 rx-check-not))
 129     ;; Partially consistent with sregex, whose `repeat' is like our
 130     ;; `**'.  (`repeat' with optional max arg and multiple sexp forms
 131     ;; is ambiguous.)
 132     (repeat             . (rx-repeat 2 3))
 133     (=                  . (rx-= 2 nil))    ; SRE
 134     (>=                 . (rx->= 2 nil))   ; SRE
 135     (**                 . (rx-** 2 nil))   ; SRE
 136     (submatch           . (rx-submatch 1 nil)) ; SRE
 137     (group              . submatch)
 138     (zero-or-more       . (rx-kleene 1 nil))
 139     (one-or-more        . (rx-kleene 1 nil))
 140     (zero-or-one        . (rx-kleene 1 nil))
 141     (\?                 . zero-or-one)  ; SRE
 142     (\??                . zero-or-one)
 143     (*                  . zero-or-more) ; SRE
 144     (*?                 . zero-or-more)
 145     (0+                 . zero-or-more)
 146     (+                  . one-or-more)  ; SRE
 147     (+?                 . one-or-more)
 148     (1+                 . one-or-more)
 149     (optional           . zero-or-one)
 150     (opt                . zero-or-one)  ; sregex
 151     (minimal-match      . (rx-greedy 1 1))
 152     (maximal-match      . (rx-greedy 1 1))
 153     (backref            . (rx-backref 1 1 rx-check-backref))
 154     (line-start         . "^")
 155     (bol                . line-start)   ; SRE
 156     (line-end           . "$")
 157     (eol                . line-end)     ; SRE
 158     (string-start       . "\\`")
 159     (bos                . string-start) ; SRE
 160     (bot                . string-start) ; sregex
 161     (string-end         . "\\'")
 162     (eos                . string-end)   ; SRE
 163     (eot                . string-end)   ; sregex
 164     (buffer-start       . "\\`")
 165     (buffer-end         . "\\'")
 166     (point              . "\\=")
 167     (word-start         . "\\<")
 168     (bow                . word-start)   ; SRE
 169     (word-end           . "\\>")
 170     (eow                . word-end)     ; SRE
 171     (word-boundary      . "\\b")
 172     (not-word-boundary  . "\\B")        ; sregex
 173     (symbol-start       . "\\_<")
 174     (symbol-end         . "\\_>")
 175     (syntax             . (rx-syntax 1 1))
 176     (not-syntax         . (rx-not-syntax 1 1)) ; sregex
 177     (category           . (rx-category 1 1 rx-check-category))
 178     (eval               . (rx-eval 1 1))
 179     (regexp             . (rx-regexp 1 1 stringp))
 180     (digit              . "[[:digit:]]")
 181     (numeric            . digit)        ; SRE
 182     (num                . digit)        ; SRE
 183     (control            . "[[:cntrl:]]") ; SRE
 184     (cntrl              . control)       ; SRE
 185     (hex-digit          . "[[:xdigit:]]") ; SRE
 186     (hex                . hex-digit)      ; SRE
 187     (xdigit             . hex-digit)      ; SRE
 188     (blank              . "[[:blank:]]")  ; SRE
 189     (graphic            . "[[:graph:]]")  ; SRE
 190     (graph              . graphic)        ; SRE
 191     (printing           . "[[:print:]]")  ; SRE
 192     (print              . printing)       ; SRE
 193     (alphanumeric       . "[[:alnum:]]")  ; SRE
 194     (alnum              . alphanumeric)   ; SRE
 195     (letter             . "[[:alpha:]]")
 196     (alphabetic         . letter)       ; SRE
 197     (alpha              . letter)       ; SRE
 198     (ascii              . "[[:ascii:]]") ; SRE
 199     (nonascii           . "[[:nonascii:]]")
 200     (lower              . "[[:lower:]]") ; SRE
 201     (lower-case         . lower)         ; SRE
 202     (punctuation        . "[[:punct:]]") ; SRE
 203     (punct              . punctuation)   ; SRE
 204     (space              . "[[:space:]]") ; SRE
 205     (whitespace         . space)         ; SRE
 206     (white              . space)         ; SRE
 207     (upper              . "[[:upper:]]") ; SRE
 208     (upper-case         . upper)         ; SRE
 209     (word               . "[[:word:]]")  ; inconsistent with SRE
 210     (wordchar           . word)          ; sregex
 211     (not-wordchar       . "[^[:word:]]") ; sregex (use \\W?)
 212     )
 213   "Alist of sexp form regexp constituents.
 214 Each element of the alist has the form (SYMBOL . DEFN).
 215 SYMBOL is a valid constituent of sexp regular expressions.
 216 If DEFN is a string, SYMBOL is translated into DEFN.
 217 If DEFN is a symbol, use the definition of DEFN, recursively.
 218 Otherwise, DEFN must be a list (FUNCTION MIN-ARGS MAX-ARGS PREDICATE).
 219 FUNCTION is used to produce code for SYMBOL.  MIN-ARGS and MAX-ARGS
 220 are the minimum and maximum number of arguments the function-form
 221 sexp constituent SYMBOL may have in sexp regular expressions.
 222 MAX-ARGS nil means no limit.  PREDICATE, if specified, means that
 223 all arguments must satisfy PREDICATE.")
 224
 225
 226 (defconst rx-syntax
 227   '((whitespace         . ?-)
 228     (punctuation        . ?.)
 229     (word               . ?w)
 230     (symbol             . ?_)
 231     (open-parenthesis   . ?\()
 232     (close-parenthesis  . ?\))
 233     (expression-prefix  . ?\')
 234     (string-quote       . ?\")
 235     (paired-delimiter   . ?$)
 236     (escape             . ?\\)
 237     (character-quote    . ?/)
 238     (comment-start      . ?<)
 239     (comment-end        . ?>)
 240     (string-delimiter   . ?|)
 241     (comment-delimiter  . ?!))
 242   "Alist mapping Rx syntax symbols to syntax characters.
 243 Each entry has the form (SYMBOL . CHAR), where SYMBOL is a valid
 244 symbol in `(syntax SYMBOL)', and CHAR is the syntax character
 245 corresponding to SYMBOL, as it would be used with \\s or \\S in
 246 regular expressions.")
 247
 248
 249 (defconst rx-categories
 250   '((consonant                  . ?0)
 251     (base-vowel                 . ?1)
 252     (upper-diacritical-mark     . ?2)
 253     (lower-diacritical-mark     . ?3)
 254     (tone-mark                  . ?4)
 255     (symbol                     . ?5)
 256     (digit                      . ?6)
 257     (vowel-modifying-diacritical-mark . ?7)
 258     (vowel-sign                 . ?8)
 259     (semivowel-lower            . ?9)
 260     (not-at-end-of-line         . ?<)
 261     (not-at-beginning-of-line   . ?>)
 262     (alpha-numeric-two-byte     . ?A)
 263     (chinse-two-byte            . ?C)
 264     (greek-two-byte             . ?G)
 265     (japanese-hiragana-two-byte . ?H)
 266     (indian-two-byte            . ?I)
 267     (japanese-katakana-two-byte . ?K)
 268     (korean-hangul-two-byte     . ?N)
 269     (cyrillic-two-byte          . ?Y)
 270     (combining-diacritic        . ?^)
 271     (ascii                      . ?a)
 272     (arabic                     . ?b)
 273     (chinese                    . ?c)
 274     (ethiopic                   . ?e)
 275     (greek                      . ?g)
 276     (korean                     . ?h)
 277     (indian                     . ?i)
 278     (japanese                   . ?j)
 279     (japanese-katakana          . ?k)
 280     (latin                      . ?l)
 281     (lao                        . ?o)
 282     (tibetan                    . ?q)
 283     (japanese-roman             . ?r)
 284     (thai                       . ?t)
 285     (vietnamese                 . ?v)
 286     (hebrew                     . ?w)
 287     (cyrillic                   . ?y)
 288     (can-break                  . ?|))
 289   "Alist mapping symbols to category characters.
 290 Each entry has the form (SYMBOL . CHAR), where SYMBOL is a valid
 291 symbol in `(category SYMBOL)', and CHAR is the category character
 292 corresponding to SYMBOL, as it would be used with `\\c' or `\\C' in
 293 regular expression strings.")
 294
 295
 296 (defvar rx-greedy-flag t
 297   "Non-nil means produce greedy regular expressions for `zero-or-one',
 298 `zero-or-more', and `one-or-more'.  Dynamically bound.")
 299
 300
 301 (defun rx-info (op)
 302   "Return parsing/code generation info for OP.
 303 If OP is the space character ASCII 32, return info for the symbol `?'.
 304 If OP is the character `?', return info for the symbol `??'.
 305 See also `rx-constituents'."
 306   (cond ((eq op ? ) (setq op '\?))
 307         ((eq op ??) (setq op '\??)))
 308   (while (and (not (null op)) (symbolp op))
 309     (setq op (cdr (assq op rx-constituents))))
 310   op)
 311
 312
 313 (defun rx-check (form)
 314   "Check FORM according to its car's parsing info."
 315   (unless (listp form)
 316     (error "rx `%s' needs argument(s)" form))
 317   (let* ((rx (rx-info (car form)))
 318          (nargs (1- (length form)))
 319          (min-args (nth 1 rx))
 320          (max-args (nth 2 rx))
 321          (type-pred (nth 3 rx)))
 322     (when (and (not (null min-args))
 323                (< nargs min-args))
 324       (error "rx form `%s' requires at least %d args"
 325              (car form) min-args))
 326     (when (and (not (null max-args))
 327                (> nargs max-args))
 328       (error "rx form `%s' accepts at most %d args"
 329              (car form) max-args))
 330     (when (not (null type-pred))
 331       (dolist (sub-form (cdr form))
 332         (unless (funcall type-pred sub-form)
 333           (error "rx form `%s' requires args satisfying `%s'"
 334                  (car form) type-pred))))))
 335
 336
 337 (defun rx-and (form)
 338   "Parse and produce code from FORM.
 339 FORM is of the form `(and FORM1 ...)'."
 340   (rx-check form)
 341   (concat "\\(?:"
 342           (mapconcat
 343            (function (lambda (x) (rx-to-string x 'no-group)))
 344            (cdr form) nil)
 345           "\\)"))
 346
 347
 348 (defun rx-or (form)
 349   "Parse and produce code from FORM, which is `(or FORM1 ...)'."
 350   (rx-check form)
 351   (let ((all-args-strings t))
 352     (dolist (arg (cdr form))
 353       (unless (stringp arg)
 354         (setq all-args-strings nil)))
 355     (concat "\\(?:"
 356             (if all-args-strings
 357                 (regexp-opt (cdr form))
 358               (mapconcat #'rx-to-string (cdr form) "\\|"))
 359             "\\)")))
 360
 361
 362 (defvar rx-bracket)                    ; dynamically bound in `rx-any'
 363
 364 (defun rx-check-any (arg)
 365    "Check arg ARG for Rx `any'."
 366    (if (integerp arg)
 367        (setq arg (string arg)))
 368    (when (stringp arg)
 369      (if (zerop (length arg))
 370          (error "String arg for Rx `any' must not be empty"))
 371      ;; Quote ^ at start; don't bother to check whether this is first arg.
 372      (if (eq ?^ (aref arg 0))
 373          (setq arg (concat "\\" arg)))
 374      ;; Remove ] and set flag for adding it to start of overall result.
 375      (when (string-match "\\]" arg)
 376        (setq arg (replace-regexp-in-string "\\]" "" arg)
 377              rx-bracket "]")))
 378    (when (symbolp arg)
 379      (let ((translation (condition-case nil
 380                             (rx-to-string arg 'no-group)
 381                           (error nil))))
 382        (unless translation (error "Invalid char class `%s' in Rx `any'" arg))
 383        (setq arg (substring translation 1 -1)))) ; strip outer brackets
 384    ;; sregex compatibility
 385    (when (and (integerp (car-safe arg))
 386               (integerp (cdr-safe arg)))
 387      (setq arg (string (car arg) ?- (cdr arg))))
 388    (unless (stringp arg)
 389      (error "rx `any' requires string, character, char pair or char class args"))
 390    arg)
 391
 392 (defun rx-any (form)
 393   "Parse and produce code from FORM, which is `(any ARG ...)'.
 394 ARG is optional."
 395   (rx-check form)
 396   (let* ((rx-bracket nil)
 397          (args (mapcar #'rx-check-any (cdr form)))) ; side-effects `rx-bracket'
 398     ;; If there was a ?- in the form, move it to the front to avoid
 399     ;; accidental range.
 400     (if (member "-" args)
 401         (setq args (cons "-" (delete "-" args))))
 402     (apply #'concat "[" rx-bracket (append args '("]")))))
 403
 404
 405 (defun rx-check-not (arg)
 406   "Check arg ARG for Rx `not'."
 407   (unless (or (and (symbolp arg)
 408                    (string-match "\\`\\[\\[:[-a-z]:\\]\\]\\'"
 409                                  (condition-case nil
 410                                      (rx-to-string arg 'no-group)
 411                                    (error ""))))
 412               (eq arg 'word-boundary)
 413               (and (consp arg)
 414                    (memq (car arg) '(not any in syntax category))))
 415     (error "rx `not' syntax error: %s" arg))
 416   t)
 417
 418
 419 (defun rx-not (form)
 420   "Parse and produce code from FORM.  FORM is `(not ...)'."
 421   (rx-check form)
 422   (let ((result (rx-to-string (cadr form) 'no-group))
 423         case-fold-search)
 424     (cond ((string-match "\\`\\[^" result)
 425            (if (= (length result) 4)
 426                (substring result 2 3)
 427              (concat "[" (substring result 2))))
 428           ((eq ?\[ (aref result 0))
 429            (concat "[^" (substring result 1)))
 430           ((string-match "\\`\\\\[scb]" result)
 431            (concat (capitalize (substring result 0 2)) (substring result 2)))
 432           (t
 433            (concat "[^" result "]")))))
 434
 435
 436 (defun rx-not-char (form)
 437   "Parse and produce code from FORM.  FORM is `(not-char ...)'."
 438   (rx-check form)
 439   (rx-not `(not (in ,@(cdr form)))))
 440
 441
 442 (defun rx-not-syntax (form)
 443   "Parse and produce code from FORM.  FORM is `(not-syntax SYNTAX)'."
 444   (rx-check form)
 445   (rx-not `(not (syntax ,@(cdr form)))))
 446
 447
 448 (defun rx-trans-forms (form &optional skip)
 449   "If FORM's length is greater than two, transform it to length two.
 450 A form (HEAD REST ...) becomes (HEAD (and REST ...)).
 451 If SKIP is non-nil, allow that number of items after the head, i.e.
 452 `(= N REST ...)' becomes `(= N (and REST ...))' if SKIP is 1."
 453   (unless skip (setq skip 0))
 454   (let ((tail (nthcdr (1+ skip) form)))
 455     (if (= (length tail) 1)
 456         form
 457       (let ((form (copy-sequence form)))
 458         (setcdr (nthcdr skip form) (list (cons 'and tail)))
 459         form))))
 460
 461
 462 (defun rx-= (form)
 463   "Parse and produce code from FORM `(= N ...)'."
 464   (rx-check form)
 465   (setq form (rx-trans-forms form 1))
 466   (unless (and (integerp (nth 1 form))
 467                (> (nth 1 form) 0))
 468     (error "rx `=' requires positive integer first arg"))
 469   (format "%s\\{%d\\}" (rx-to-string (nth 2 form)) (nth 1 form)))
 470
 471
 472 (defun rx->= (form)
 473   "Parse and produce code from FORM `(>= N ...)'."
 474   (rx-check form)
 475   (setq form (rx-trans-forms form 1))
 476   (unless (and (integerp (nth 1 form))
 477                (> (nth 1 form) 0))
 478     (error "rx `>=' requires positive integer first arg"))
 479   (format "%s\\{%d,\\}" (rx-to-string (nth 2 form)) (nth 1 form)))
 480
 481
 482 (defun rx-** (form)
 483   "Parse and produce code from FORM `(** N M ...)'."
 484   (rx-check form)
 485   (setq form (cons 'repeat (cdr (rx-trans-forms form 2))))
 486   (rx-to-string form))
 487
 488
 489 (defun rx-repeat (form)
 490   "Parse and produce code from FORM.
 491 FORM is either `(repeat N FORM1)' or `(repeat N M FORM1)'."
 492   (rx-check form)
 493   (cond ((= (length form) 3)
 494          (unless (and (integerp (nth 1 form))
 495                       (> (nth 1 form) 0))
 496            (error "rx `repeat' requires positive integer first arg"))
 497          (format "%s\\{%d\\}" (rx-to-string (nth 2 form)) (nth 1 form)))
 498         ((or (not (integerp (nth 2 form)))
 499              (< (nth 2 form) 0)
 500              (not (integerp (nth 1 form)))
 501              (< (nth 1 form) 0)
 502              (< (nth 2 form) (nth 1 form)))
 503          (error "rx `repeat' range error"))
 504         (t
 505          (format "%s\\{%d,%d\\}" (rx-to-string (nth 3 form))
 506                  (nth 1 form) (nth 2 form)))))
 507
 508
 509 (defun rx-submatch (form)
 510   "Parse and produce code from FORM, which is `(submatch ...)'."
 511   (concat "\\("
 512           (mapconcat (function (lambda (x) (rx-to-string x 'no-group)))
 513                      (cdr form) nil)
 514           "\\)"))
 515
 516 (defun rx-backref (form)
 517   "Parse and produce code from FORM, which is `(backref N)'."
 518   (rx-check form)
 519   (format "\\%d" (nth 1 form)))
 520
 521 (defun rx-check-backref (arg)
 522   "Check arg ARG for Rx `backref'."
 523   (or (and (integerp arg) (>= arg 1) (<= arg 9))
 524       (error "rx `backref' requires numeric 1<=arg<=9: %s" arg)))
 525
 526 (defun rx-kleene (form)
 527   "Parse and produce code from FORM.
 528 FORM is `(OP FORM1)', where OP is one of the `zero-or-one',
 529 `zero-or-more' etc.  operators.
 530 If OP is one of `*', `+', `?', produce a greedy regexp.
 531 If OP is one of `*?', `+?', `??', produce a non-greedy regexp.
 532 If OP is anything else, produce a greedy regexp if `rx-greedy-flag'
 533 is non-nil."
 534   (rx-check form)
 535   (setq form (rx-trans-forms form))
 536   (let ((suffix (cond ((memq (car form) '(* + ? )) "")
 537                       ((memq (car form) '(*? +? ??)) "?")
 538                       (rx-greedy-flag "")
 539                       (t "?")))
 540         (op (cond ((memq (car form) '(* *? 0+ zero-or-more)) "*")
 541                   ((memq (car form) '(+ +? 1+ one-or-more))  "+")
 542                   (t "?")))
 543         (result (rx-to-string (cadr form) 'no-group)))
 544     (if (not (rx-atomic-p result))
 545         (setq result (concat "\\(?:" result "\\)")))
 546     (concat result op suffix)))
 547
 548 (defun rx-atomic-p (r)
 549   "Return non-nil if regexp string R is atomic.
 550 An atomic regexp R is one such that a suffix operator
 551 appended to R will apply to all of R.  For example, \"a\"
 552 \"[abc]\" and \"\\(ab\\|ab*c\\)\" are atomic and \"ab\",
 553 \"[ab]c\", and \"ab\\|ab*c\" are not atomic.
 554
 555 This function may return false negatives, but it will not
 556 return false positives.  It is nevertheless useful in
 557 situations where an efficiency shortcut can be taken iff a
 558 regexp is atomic.  The function can be improved to detect
 559 more cases of atomic regexps.  Presently, this function
 560 detects the following categories of atomic regexp;
 561
 562   a group or shy group:  \\(...\\)
 563   a character class:     [...]
 564   a single character:    a
 565
 566 On the other hand, false negatives will be returned for
 567 regexps that are atomic but end in operators, such as
 568 \"a+\".  I think these are rare.  Probably such cases could
 569 be detected without much effort.  A guarantee of no false
 570 negatives would require a theoretic specification of the set
 571 of all atomic regexps."
 572   (let ((l (length r)))
 573     (or (equal l 1)
 574         (and (>= l 6)
 575              (equal (substring r 0 2) "\\(")
 576              (equal (substring r -2) "\\)"))
 577         (and (>= l 2)
 578              (equal (substring r 0 1) "[")
 579              (equal (substring r -1) "]")))))
 580
 581
 582 (defun rx-syntax (form)
 583   "Parse and produce code from FORM, which is `(syntax SYMBOL)'."
 584   (rx-check form)
 585   (let* ((sym (cadr form))
 586          (syntax (assq sym rx-syntax)))
 587     (unless syntax
 588       ;; Try sregex compatibility.
 589       (let ((name (symbol-name sym)))
 590         (if (= 1 (length name))
 591             (setq syntax (rassq (aref name 0) rx-syntax))))
 592       (unless syntax
 593         (error "Unknown rx syntax `%s'" (cadr form))))
 594     (format "\\s%c" (cdr syntax))))
 595
 596
 597 (defun rx-check-category (form)
 598   "Check the argument FORM of a `(category FORM)'."
 599   (unless (or (integerp form)
 600               (cdr (assq form rx-categories)))
 601     (error "Unknown category `%s'" form))
 602   t)
 603
 604
 605 (defun rx-category (form)
 606   "Parse and produce code from FORM, which is `(category SYMBOL)'."
 607   (rx-check form)
 608   (let ((char (if (integerp (cadr form))
 609                   (cadr form)
 610                 (cdr (assq (cadr form) rx-categories)))))
 611     (format "\\c%c" char)))
 612
 613
 614 (defun rx-eval (form)
 615   "Parse and produce code from FORM, which is `(eval FORM)'."
 616   (rx-check form)
 617   (rx-to-string (eval (cadr form))))
 618
 619
 620 (defun rx-greedy (form)
 621   "Parse and produce code from FORM.
 622 If FORM is '(minimal-match FORM1)', non-greedy versions of `*',
 623 `+', and `?' operators will be used in FORM1.  If FORM is
 624 '(maximal-match FORM1)', greedy operators will be used."
 625   (rx-check form)
 626   (let ((rx-greedy-flag (eq (car form) 'maximal-match)))
 627     (rx-to-string (cadr form))))
 628
 629
 630 (defun rx-regexp (form)
 631   "Parse and produce code from FORM, which is `(regexp STRING)'."
 632   (rx-check form)
 633   (concat "\\(?:" (cadr form) "\\)"))
 634
 635
 636 ;;;###autoload
 637 (defun rx-to-string (form &optional no-group)
 638   "Parse and produce code for regular expression FORM.
 639 FORM is a regular expression in sexp form.
 640 NO-GROUP non-nil means don't put shy groups around the result."
 641   (cond ((stringp form)
 642          (regexp-quote form))
 643         ((integerp form)
 644          (regexp-quote (char-to-string form)))
 645         ((symbolp form)
 646          (let ((info (rx-info form)))
 647            (cond ((stringp info)
 648                   info)
 649                  ((null info)
 650                   (error "Unknown rx form `%s'" form))
 651                  (t
 652                   (funcall (nth 0 info) form)))))
 653         ((consp form)
 654          (let ((info (rx-info (car form))))
 655            (unless (consp info)
 656              (error "Unknown rx form `%s'" (car form)))
 657            (let ((result (funcall (nth 0 info) form)))
 658              (if (or no-group (string-match "\\`\\\\[(]" result))
 659                  result
 660                (concat "\\(?:" result "\\)")))))
 661         (t
 662          (error "rx syntax error at `%s'" form))))
 663
 664
 665 ;;;###autoload
 666 (defmacro rx (&rest regexps)
 667   "Translate regular expressions REGEXPS in sexp form to a regexp string.
 668 REGEXPS is a non-empty sequence of forms of the sort listed below.
 669 See also `rx-to-string' for how to do such a translation at run-time.
 670
 671 The following are valid subforms of regular expressions in sexp
 672 notation.
 673
 674 STRING
 675      matches string STRING literally.
 676
 677 CHAR
 678      matches character CHAR literally.
 679
 680 `not-newline', `nonl'
 681      matches any character except a newline.
 682                         .
 683 `anything'
 684      matches any character
 685
 686 `(any SET ...)'
 687 `(in SET ...)'
 688 `(char SET ...)'
 689      matches any character in SET ....  SET may be a character or string.
 690      Ranges of characters can be specified as `A-Z' in strings.
 691      Ranges may also be specified as conses like `(?A . ?Z)'.
 692
 693      SET may also be the name of a character class: `digit',
 694      `control', `hex-digit', `blank', `graph', `print', `alnum',
 695      `alpha', `ascii', `nonascii', `lower', `punct', `space', `upper',
 696      `word', or one of their synonyms.
 697
 698 `(not (any SET ...))'
 699      matches any character not in SET ...
 700
 701 `line-start', `bol'
 702      matches the empty string, but only at the beginning of a line
 703      in the text being matched
 704
 705 `line-end', `eol'
 706      is similar to `line-start' but matches only at the end of a line
 707
 708 `string-start', `bos', `bot'
 709      matches the empty string, but only at the beginning of the
 710      string being matched against.
 711
 712 `string-end', `eos', `eot'
 713      matches the empty string, but only at the end of the
 714      string being matched against.
 715
 716 `buffer-start'
 717      matches the empty string, but only at the beginning of the
 718      buffer being matched against.  Actually equivalent to `string-start'.
 719
 720 `buffer-end'
 721      matches the empty string, but only at the end of the
 722      buffer being matched against.  Actually equivalent to `string-end'.
 723
 724 `point'
 725      matches the empty string, but only at point.
 726
 727 `word-start', `bow'
 728      matches the empty string, but only at the beginning or end of a
 729      word.
 730
 731 `word-end', `eow'
 732      matches the empty string, but only at the end of a word.
 733
 734 `word-boundary'
 735      matches the empty string, but only at the beginning or end of a
 736      word.
 737
 738 `(not word-boundary)'
 739 `not-word-boundary'
 740      matches the empty string, but not at the beginning or end of a
 741      word.
 742
 743 `digit', `numeric', `num'
 744      matches 0 through 9.
 745
 746 `control', `cntrl'
 747      matches ASCII control characters.
 748
 749 `hex-digit', `hex', `xdigit'
 750      matches 0 through 9, a through f and A through F.
 751
 752 `blank'
 753      matches space and tab only.
 754
 755 `graphic', `graph'
 756      matches graphic characters--everything except ASCII control chars,
 757      space, and DEL.
 758
 759 `printing', `print'
 760      matches printing characters--everything except ASCII control chars
 761      and DEL.
 762
 763 `alphanumeric', `alnum'
 764      matches letters and digits.  (But at present, for multibyte characters,
 765      it matches anything that has word syntax.)
 766
 767 `letter', `alphabetic', `alpha'
 768      matches letters.  (But at present, for multibyte characters,
 769      it matches anything that has word syntax.)
 770
 771 `ascii'
 772      matches ASCII (unibyte) characters.
 773
 774 `nonascii'
 775      matches non-ASCII (multibyte) characters.
 776
 777 `lower', `lower-case'
 778      matches anything lower-case.
 779
 780 `upper', `upper-case'
 781      matches anything upper-case.
 782
 783 `punctuation', `punct'
 784      matches punctuation.  (But at present, for multibyte characters,
 785      it matches anything that has non-word syntax.)
 786
 787 `space', `whitespace', `white'
 788      matches anything that has whitespace syntax.
 789
 790 `word', `wordchar'
 791      matches anything that has word syntax.
 792
 793 `not-wordchar'
 794      matches anything that has non-word syntax.
 795
 796 `(syntax SYNTAX)'
 797      matches a character with syntax SYNTAX.  SYNTAX must be one
 798      of the following symbols, or a symbol corresponding to the syntax
 799      character, e.g. `\\.' for `\\s.'.
 800
 801      `whitespace'               (\\s- in string notation)
 802      `punctuation'              (\\s.)
 803      `word'                     (\\sw)
 804      `symbol'                   (\\s_)
 805      `open-parenthesis'         (\\s()
 806      `close-parenthesis'        (\\s))
 807      `expression-prefix'        (\\s')
 808      `string-quote'             (\\s\")
 809      `paired-delimiter'         (\\s$)
 810      `escape'                   (\\s\\)
 811      `character-quote'          (\\s/)
 812      `comment-start'            (\\s<)
 813      `comment-end'              (\\s>)
 814      `string-delimiter'         (\\s|)
 815      `comment-delimiter'        (\\s!)
 816
 817 `(not (syntax SYNTAX))'
 818      matches a character that doesn't have syntax SYNTAX.
 819
 820 `(category CATEGORY)'
 821      matches a character with category CATEGORY.  CATEGORY must be
 822      either a character to use for C, or one of the following symbols.
 823
 824      `consonant'                        (\\c0 in string notation)
 825      `base-vowel'                       (\\c1)
 826      `upper-diacritical-mark'           (\\c2)
 827      `lower-diacritical-mark'           (\\c3)
 828      `tone-mark'                        (\\c4)
 829      `symbol'                           (\\c5)
 830      `digit'                            (\\c6)
 831      `vowel-modifying-diacritical-mark' (\\c7)
 832      `vowel-sign'                       (\\c8)
 833      `semivowel-lower'                  (\\c9)
 834      `not-at-end-of-line'               (\\c<)
 835      `not-at-beginning-of-line'         (\\c>)
 836      `alpha-numeric-two-byte'           (\\cA)
 837      `chinse-two-byte'                  (\\cC)
 838      `greek-two-byte'                   (\\cG)
 839      `japanese-hiragana-two-byte'       (\\cH)
 840      `indian-two-byte'                  (\\cI)
 841      `japanese-katakana-two-byte'       (\\cK)
 842      `korean-hangul-two-byte'           (\\cN)
 843      `cyrillic-two-byte'                (\\cY)
 844      `combining-diacritic'              (\\c^)
 845      `ascii'                            (\\ca)
 846      `arabic'                           (\\cb)
 847      `chinese'                          (\\cc)
 848      `ethiopic'                         (\\ce)
 849      `greek'                            (\\cg)
 850      `korean'                           (\\ch)
 851      `indian'                           (\\ci)
 852      `japanese'                         (\\cj)
 853      `japanese-katakana'                (\\ck)
 854      `latin'                            (\\cl)
 855      `lao'                              (\\co)
 856      `tibetan'                          (\\cq)
 857      `japanese-roman'                   (\\cr)
 858      `thai'                             (\\ct)
 859      `vietnamese'                       (\\cv)
 860      `hebrew'                           (\\cw)
 861      `cyrillic'                         (\\cy)
 862      `can-break'                        (\\c|)
 863
 864 `(not (category CATEGORY))'
 865      matches a character that doesn't have category CATEGORY.
 866
 867 `(and SEXP1 SEXP2 ...)'
 868 `(: SEXP1 SEXP2 ...)'
 869 `(seq SEXP1 SEXP2 ...)'
 870 `(sequence SEXP1 SEXP2 ...)'
 871      matches what SEXP1 matches, followed by what SEXP2 matches, etc.
 872
 873 `(submatch SEXP1 SEXP2 ...)'
 874 `(group SEXP1 SEXP2 ...)'
 875      like `and', but makes the match accessible with `match-end',
 876      `match-beginning', and `match-string'.
 880
 881 `(or SEXP1 SEXP2 ...)'
 882 `(| SEXP1 SEXP2 ...)'
 883      matches anything that matches SEXP1 or SEXP2, etc.  If all
 884      args are strings, use `regexp-opt' to optimize the resulting
 885      regular expression.
 886
 887 `(minimal-match SEXP)'
 888      produce a non-greedy regexp for SEXP.  Normally, regexps matching
 889      zero or more occurrences of something are \"greedy\" in that they
 890      match as much as they can, as long as the overall regexp can
 891      still match.  A non-greedy regexp matches as little as possible.
 892
 893 `(maximal-match SEXP)'
 894      produce a greedy regexp for SEXP.  This is the default.
 895
 896 Below, `SEXP ...' represents a sequence of regexp forms, treated as if
 897 enclosed in `(and ...)'.
 898
 899 `(zero-or-more SEXP ...)'
 900 `(0+ SEXP ...)'
 901      matches zero or more occurrences of what SEXP ... matches.
 902
 903 `(* SEXP ...)'
 904      like `zero-or-more', but always produces a greedy regexp, independent
 905      of `rx-greedy-flag'.
 906
 907 `(*? SEXP ...)'
 908      like `zero-or-more', but always produces a non-greedy regexp,
 909      independent of `rx-greedy-flag'.
 910
 911 `(one-or-more SEXP ...)'
 912 `(1+ SEXP ...)'
 913      matches one or more occurrences of SEXP ...
 914
 915 `(+ SEXP ...)'
 916      like `one-or-more', but always produces a greedy regexp.
 917
 918 `(+? SEXP ...)'
 919      like `one-or-more', but always produces a non-greedy regexp.
 920
 921 `(zero-or-one SEXP ...)'
 922 `(optional SEXP ...)'
 923 `(opt SEXP ...)'
 924      matches zero or one occurrences of what SEXP ... matches.
 925
 926 `(? SEXP ...)'
 927      like `zero-or-one', but always produces a greedy regexp.
 928
 929 `(?? SEXP ...)'
 930      like `zero-or-one', but always produces a non-greedy regexp.
 931
 932 `(repeat N SEXP)'
 933 `(= N SEXP ...)'
 934      matches N occurrences.
 935
 936 `(>= N SEXP ...)'
 937      matches N or more occurrences.
 938
 939 `(repeat N M SEXP)'
 940 `(** N M SEXP ...)'
 941      matches N to M occurrences.
 942
 943 `(backref N)'
 944      matches what was matched previously by submatch N.
 951
 952 `(eval FORM)'
 953      evaluate FORM and insert result.  If result is a string,
 954      `regexp-quote' it.
 955
 956 `(regexp REGEXP)'
 957      include REGEXP in string notation in the result."
 958   (cond ((null regexps)
 959          (error "No regexp"))
 960         ((cdr regexps)
 961          (rx-to-string `(and ,@regexps) t))
 962         (t
 963          (rx-to-string (car regexps) t))))
 964 \f
 965 ;; ;; sregex.el replacement
 966
 967 ;; ;;;###autoload (provide 'sregex)
 968 ;; ;;;###autoload (autoload 'sregex "rx")
 969 ;; (defalias 'sregex 'rx-to-string)
 970 ;; ;;;###autoload (autoload 'sregexq "rx" nil nil 'macro)
 971 ;; (defalias 'sregexq 'rx)
 972 \f
 973 (provide 'rx)                      ; announce the `rx' feature so `(require 'rx)' is satisfied
 974
 975 ;; arch-tag: 12d01a63-0008-42bb-ab8c-1c7d63be370b
 976 ;;; rx.el ends here