lisp/emacs-lisp/rx.el

   1 ;;; rx.el --- sexp notation for regular expressions
   2
   3 ;; Copyright (C) 2001, 2003, 2004  Free Software Foundation, Inc.
   4
   5 ;; Author: Gerd Moellmann <gerd@gnu.org>
   6 ;; Maintainer: FSF
   7 ;; Keywords: strings, regexps, extensions
   8
   9 ;; This file is part of GNU Emacs.
  10
  11 ;; GNU Emacs is free software; you can redistribute it and/or modify
  12 ;; it under the terms of the GNU General Public License as published by
  13 ;; the Free Software Foundation; either version 2, or (at your option)
  14 ;; any later version.
  15
  16 ;; GNU Emacs is distributed in the hope that it will be useful,
  17 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19 ;; GNU General Public License for more details.
  20
  21 ;; You should have received a copy of the GNU General Public License
  22 ;; along with GNU Emacs; see the file COPYING.  If not, write to the
  23 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  24 ;; Boston, MA 02111-1307, USA.
  25
  26 ;;; Commentary:
  27
  28 ;; This is another implementation of sexp-form regular expressions.
  29 ;; It was unfortunately written without being aware of the Sregex
  30 ;; package coming with Emacs, but as things stand, Rx completely
  31 ;; covers all regexp features, which Sregex doesn't, doesn't suffer
  32 ;; from the bugs mentioned in the commentary section of Sregex, and
  33 ;; uses a nicer syntax (IMHO, of course :-).
  34
  35 ;; This significantly extended version of the original is almost
  36 ;; compatible with Sregex.  The only incompatibility I (fx) know of is
  37 ;; that the `repeat' form can't have multiple regexp args.
  38
  39 ;; Now alternative forms are provided for a degree of compatibility
  40 ;; with Shivers' attempted definitive SRE notation
  41 ;; <URL:http://www.ai.mit.edu/~/shivers/sre.txt>.  SRE forms not
  42 ;; catered for include: dsm, uncase, w/case, w/nocase, ,@<exp>,
  43 ;; ,<exp>, (word ...), word+, posix-string, and character class forms.
  44 ;; Some forms are inconsistent with SRE, either for historical reasons
  45 ;; or because of the implementation -- simple translation into Emacs
  46 ;; regexp strings.  These include: any, word.  Also, case-sensitivity
  47 ;; and greediness are controlled by variables external to the regexp,
  48 ;; and you need to feed the forms to the `posix-' functions to get
  49 ;; SRE's POSIX semantics.  There are probably more difficulties.
  50
  51 ;; Rx translates a sexp notation for regular expressions into the
  52 ;; usual string notation.  The translation can be done at compile-time
  53 ;; by using the `rx' macro.  It can be done at run-time by calling
  54 ;; function `rx-to-string'.  See the documentation of `rx' for a
  55 ;; complete description of the sexp notation.
  56 ;;
  57 ;; Some examples of string regexps and their sexp counterparts:
  58 ;;
  59 ;; "^[a-z]*"
  60 ;; (rx (and line-start (0+ (in "a-z"))))
  61 ;;
  62 ;; "\n[^ \t]"
  63 ;; (rx (and "\n" (not blank))), or
  64 ;; (rx (and "\n" (not (any " \t"))))
  65 ;;
  66 ;; "\\*\\*\\* EOOH \\*\\*\\*\n"
  67 ;; (rx "*** EOOH ***\n")
  68 ;;
  69 ;; "\\<\\(catch\\|finally\\)\\>[^_]"
  70 ;; (rx (and word-start (submatch (or "catch" "finally")) word-end
  71 ;;          (not (any ?_))))
  72 ;;
  73 ;; "[ \t\n]*:\\([^:]+\\|$\\)"
  74 ;; (rx (and (zero-or-more (in " \t\n")) ":"
  75 ;;          (submatch (or line-end (one-or-more (not (any ?:)))))))
  76 ;;
  77 ;; "^content-transfer-encoding:\\(\n?[\t ]\\)*quoted-printable\\(\n?[\t ]\\)*"
  78 ;; (rx (and line-start
  79 ;;          "content-transfer-encoding:"
  80 ;;          (* (submatch (? ?\n) (any "\t ")))
  81 ;;          "quoted-printable"
  82 ;;          (* (submatch (? ?\n) (any "\t ")))))
  83 ;;
  84 ;; (concat "^\\(?:" something-else "\\)")
  85 ;; (rx (and line-start (eval something-else))), statically or
  86 ;; (rx-to-string `(and line-start ,something-else)), dynamically.
  87 ;;
  88 ;; (regexp-opt '(STRING1 STRING2 ...))
  89 ;; (rx (or STRING1 STRING2 ...)), or in other words, `or' automatically
  90 ;; calls `regexp-opt' as needed.
  91 ;;
  92 ;; "^;;\\s-*\n\\|^\n"
  93 ;; (rx (or (and line-start ";;" (0+ space) ?\n)
  94 ;;         (and line-start ?\n)))
  95 ;;
  96 ;; "\\$[I]d: [^ ]+ \\([^ ]+\\) "
  97 ;; (rx (and "$Id: "
  98 ;;          (1+ (not (in " ")))
  99 ;;          " "
 100 ;;          (submatch (1+ (not (in " "))))
 101 ;;          " "))
 102 ;;
 103 ;; "\\\\\\\\\\[\\w+"
 104 ;; (rx (and ?\\ ?\\ ?\[ (1+ word)))
 105 ;;
 106 ;; etc.
 107
 108 ;;; History:
 109 ;;
 110
 111 ;;; Code:
 112
 113 (defconst rx-constituents
 114   '((and                . (rx-and 1 nil))
 115     (seq                . and)          ; SRE
 116     (:                  . and)          ; SRE
 117     (sequence           . and)          ; sregex
 118     (or                 . (rx-or 1 nil))
 119     (|                  . or)           ; SRE
 120     (not-newline        . ".")
 121     (nonl               . not-newline)  ; SRE
 122     (anything           . ".\\|\n")
 123     (any                . (rx-any 1 nil rx-check-any)) ; inconsistent with SRE
 124     (in                 . any)
 125     (char               . any)          ; sregex
 126     (not-char           . (rx-not-char 1 nil rx-check-any)) ; sregex
 127     (not                . (rx-not 1 1 rx-check-not))
 128     ;; Partially consistent with sregex, whose `repeat' is like our
 129     ;; `**'.  (`repeat' with optional max arg and multiple sexp forms
 130     ;; is ambiguous.)
 131     (repeat             . (rx-repeat 2 3))
 132     (=                  . (rx-= 2 nil))    ; SRE
 133     (>=                 . (rx->= 2 nil))   ; SRE
 134     (**                 . (rx-** 2 nil))   ; SRE
 135     (submatch           . (rx-submatch 1 nil)) ; SRE
 136     (group              . submatch)
 137     (zero-or-more       . (rx-kleene 1 nil))
 138     (one-or-more        . (rx-kleene 1 nil))
 139     (zero-or-one        . (rx-kleene 1 nil))
 140     (\?                 . zero-or-one)  ; SRE
 141     (\??                . zero-or-one)
 142     (*                  . zero-or-more) ; SRE
 143     (*?                 . zero-or-more)
 144     (0+                 . zero-or-more)
 145     (+                  . one-or-more)  ; SRE
 146     (+?                 . one-or-more)
 147     (1+                 . one-or-more)
 148     (optional           . zero-or-one)
 149     (opt                . zero-or-one)  ; sregex
 150     (minimal-match      . (rx-greedy 1 1))
 151     (maximal-match      . (rx-greedy 1 1))
 152     (backref            . (rx-backref 1 1 rx-check-backref))
 153     (line-start         . "^")
 154     (bol                . line-start)   ; SRE
 155     (line-end           . "$")
 156     (eol                . line-end)     ; SRE
 157     (string-start       . "\\`")
 158     (bos                . string-start) ; SRE
 159     (bot                . string-start) ; sregex
 160     (string-end         . "\\'")
 161     (eos                . string-end)   ; SRE
 162     (eot                . string-end)   ; sregex
 163     (buffer-start       . "\\`")
 164     (buffer-end         . "\\'")
 165     (point              . "\\=")
 166     (word-start         . "\\<")
 167     (bow                . word-start)   ; SRE
 168     (word-end           . "\\>")
 169     (eow                . word-end)     ; SRE
 170     (word-boundary      . "\\b")
 171     (not-word-boundary  . "\\B")        ; sregex
 172     (syntax             . (rx-syntax 1 1))
 173     (not-syntax         . (rx-not-syntax 1 1)) ; sregex
 174     (category           . (rx-category 1 1 rx-check-category))
 175     (eval               . (rx-eval 1 1))
 176     (regexp             . (rx-regexp 1 1 stringp))
 177     (digit              . "[[:digit:]]")
 178     (numeric            . digit)        ; SRE
 179     (num                . digit)        ; SRE
 180     (control            . "[[:cntrl:]]") ; SRE
 181     (cntrl              . control)       ; SRE
 182     (hex-digit          . "[[:xdigit:]]") ; SRE
 183     (hex                . hex-digit)      ; SRE
 184     (xdigit             . hex-digit)      ; SRE
 185     (blank              . "[[:blank:]]")  ; SRE
 186     (graphic            . "[[:graph:]]")  ; SRE
 187     (graph              . graphic)        ; SRE
 188     (printing           . "[[:print:]]")  ; SRE
 189     (print              . printing)       ; SRE
 190     (alphanumeric       . "[[:alnum:]]")  ; SRE
 191     (alnum              . alphanumeric)   ; SRE
 192     (letter             . "[[:alpha:]]")
 193     (alphabetic         . letter)       ; SRE
 194     (alpha              . letter)       ; SRE
 195     (ascii              . "[[:ascii:]]") ; SRE
 196     (nonascii           . "[[:nonascii:]]")
 197     (lower              . "[[:lower:]]") ; SRE
 198     (lower-case         . lower)         ; SRE
 199     (punctuation        . "[[:punct:]]") ; SRE
 200     (punct              . punctuation)   ; SRE
 201     (space              . "[[:space:]]") ; SRE
 202     (whitespace         . space)         ; SRE
 203     (white              . space)         ; SRE
 204     (upper              . "[[:upper:]]") ; SRE
 205     (upper-case         . upper)         ; SRE
 206     (word               . "[[:word:]]")  ; inconsistent with SRE
 207     (wordchar           . word)          ; sregex
 208     (not-wordchar       . "[^[:word:]]") ; sregex (use \\W?)
 209     )
 210   "Alist of sexp form regexp constituents.
 211 Each element of the alist has the form (SYMBOL . DEFN).
 212 SYMBOL is a valid constituent of sexp regular expressions.
 213 If DEFN is a string, SYMBOL is translated into DEFN.
 214 If DEFN is a symbol, use the definition of DEFN, recursively.
 215 Otherwise, DEFN must be a list (FUNCTION MIN-ARGS MAX-ARGS PREDICATE).
 216 FUNCTION is used to produce code for SYMBOL.  MIN-ARGS and MAX-ARGS
 217 are the minimum and maximum number of arguments the function-form
 218 sexp constituent SYMBOL may have in sexp regular expressions.
 219 MAX-ARGS nil means no limit.  PREDICATE, if specified, means that
 220 all arguments must satisfy PREDICATE.")
 221
 222
 223 (defconst rx-syntax
 224   '((whitespace         . ?-)
 225     (punctuation        . ?.)
 226     (word               . ?w)
 227     (symbol             . ?_)
 228     (open-parenthesis   . ?\()
 229     (close-parenthesis  . ?\))
 230     (expression-prefix  . ?\')
 231     (string-quote       . ?\")
 232     (paired-delimiter   . ?$)
 233     (escape             . ?\\)
 234     (character-quote    . ?/)
 235     (comment-start      . ?<)
 236     (comment-end        . ?>)
 237     (string-delimiter   . ?|)
 238     (comment-delimiter  . ?!)
 239     ;; sregex compatibility
 240     (- . ?-)
 241     (\. . ?.)
 242     (w . ?w)
 243     (_ . ?_)
 244     (\( . ?\()
 245     (\) . ?\))
 246     (\' . ?\')
 247     (\" . ?\")
 248     (\$ . ?$)
 249     (\\ . ?\\)
 250     (/ . ?/)
 251     (< . ?<)
 252     (> . ?>)
 253     (| . ?|)
 254     (! . ?!))
 255   "Alist mapping Rx syntax symbols to syntax characters.
 256 Each entry has the form (SYMBOL . CHAR), where SYMBOL is a valid
 257 symbol in `(syntax SYMBOL)', and CHAR is the syntax character
 258 corresponding to SYMBOL, as it would be used with \\s or \\S in
 259 regular expressions.")
 260
 261
 262 (defconst rx-categories
 263   '((consonant                  . ?0)
 264     (base-vowel                 . ?1)
 265     (upper-diacritical-mark     . ?2)
 266     (lower-diacritical-mark     . ?3)
 267     (tone-mark                  . ?4)
 268     (symbol                     . ?5)
 269     (digit                      . ?6)
 270     (vowel-modifying-diacritical-mark . ?7)
 271     (vowel-sign                 . ?8)
 272     (semivowel-lower            . ?9)
 273     (not-at-end-of-line         . ?<)
 274     (not-at-beginning-of-line   . ?>)
 275     (alpha-numeric-two-byte     . ?A)
 276     (chinse-two-byte            . ?C)
 277     (greek-two-byte             . ?G)
 278     (japanese-hiragana-two-byte . ?H)
 279     (indian-two-byte            . ?I)
 280     (japanese-katakana-two-byte . ?K)
 281     (korean-hangul-two-byte     . ?N)
 282     (cyrillic-two-byte          . ?Y)
 283     (combining-diacritic        . ?^)
 284     (ascii                      . ?a)
 285     (arabic                     . ?b)
 286     (chinese                    . ?c)
 287     (ethiopic                   . ?e)
 288     (greek                      . ?g)
 289     (korean                     . ?h)
 290     (indian                     . ?i)
 291     (japanese                   . ?j)
 292     (japanese-katakana          . ?k)
 293     (latin                      . ?l)
 294     (lao                        . ?o)
 295     (tibetan                    . ?q)
 296     (japanese-roman             . ?r)
 297     (thai                       . ?t)
 298     (vietnamese                 . ?v)
 299     (hebrew                     . ?w)
 300     (cyrillic                   . ?y)
 301     (can-break                  . ?|))
 302   "Alist mapping symbols to category characters.
 303 Each entry has the form (SYMBOL . CHAR), where SYMBOL is a valid
 304 symbol in `(category SYMBOL)', and CHAR is the category character
 305 corresponding to SYMBOL, as it would be used with `\\c' or `\\C' in
 306 regular expression strings.")
 307
 308
 309 (defvar rx-greedy-flag t
 310   "Non-nil means produce greedy regular expressions for `zero-or-one',
 311 `zero-or-more', and `one-or-more'.  Dynamically bound.")
 312
 313
 314 (defun rx-info (op)
 315   "Return parsing/code generation info for OP.
 316 If OP is the space character ASCII 32, return info for the symbol `?'.
 317 If OP is the character `?', return info for the symbol `??'.
 318 See also `rx-constituents'."
 319   (cond ((eq op ? ) (setq op '\?))
 320         ((eq op ??) (setq op '\??)))
 321   (while (and (not (null op)) (symbolp op))
 322     (setq op (cdr (assq op rx-constituents))))
 323   op)
 324
 325
 326 (defun rx-check (form)
 327   "Check FORM according to its car's parsing info."
 328   (unless (listp form)
 329     (error "rx `%s' needs argument(s)" form))
 330   (let* ((rx (rx-info (car form)))
 331          (nargs (1- (length form)))
 332          (min-args (nth 1 rx))
 333          (max-args (nth 2 rx))
 334          (type-pred (nth 3 rx)))
 335     (when (and (not (null min-args))
 336                (< nargs min-args))
 337       (error "rx form `%s' requires at least %d args"
 338              (car form) min-args))
 339     (when (and (not (null max-args))
 340                (> nargs max-args))
 341       (error "rx form `%s' accepts at most %d args"
 342              (car form) max-args))
 343     (when (not (null type-pred))
 344       (dolist (sub-form (cdr form))
 345         (unless (funcall type-pred sub-form)
 346           (error "rx form `%s' requires args satisfying `%s'"
 347                  (car form) type-pred))))))
 348
 349
 350 (defun rx-and (form)
 351   "Parse and produce code from FORM.
 352 FORM is of the form `(and FORM1 ...)'."
 353   (rx-check form)
 354   (concat "\\(?:"
 355           (mapconcat
 356            (function (lambda (x) (rx-to-string x 'no-group)))
 357            (cdr form) nil)
 358           "\\)"))
 359
 360
 361 (defun rx-or (form)
 362   "Parse and produce code from FORM, which is `(or FORM1 ...)'."
 363   (rx-check form)
 364   (let ((all-args-strings t))
 365     (dolist (arg (cdr form))
 366       (unless (stringp arg)
 367         (setq all-args-strings nil)))
 368     (concat "\\(?:"
 369             (if all-args-strings
 370                 (regexp-opt (cdr form))
 371               (mapconcat #'rx-to-string (cdr form) "\\|"))
 372             "\\)")))
 373
 374
 375 (defvar bracket)                       ; dynamically bound in `rx-any'
 376
 377 (defun rx-check-any (arg)
 378    "Check arg ARG for Rx `any'."
 379    (if (integerp arg)
 380        (setq arg (string arg)))
 381    (when (stringp arg)
 382      (if (zerop (length arg))
 383          (error "String arg for Rx `any' must not be empty"))
 384      ;; Quote ^ at start; don't bother to check whether this is first arg.
 385      (if (eq ?^ (aref arg 0))
 386          (setq arg (concat "\\" arg)))
 387      ;; Remove ] and set flag for adding it to start of overall result.
 388      (when (string-match "]" arg)
 389        (setq arg (replace-regexp-in-string "]" "" arg)
 390              bracket "]")))
 391    (when (symbolp arg)
 392      (let ((translation (condition-case nil
 393                             (rx-to-string arg 'no-group)
 394                           (error nil))))
 395        (unless translation (error "Invalid char class `%s' in Rx `any'" arg))
 396        (setq arg (substring translation 1 -1)))) ; strip outer brackets
 397    ;; sregex compatibility
 398    (when (and (integerp (car-safe arg))
 399               (integerp (cdr-safe arg)))
 400      (setq arg (string (car arg) ?- (cdr arg))))
 401    (unless (stringp arg)
 402      (error "rx `any' requires string, character, char pair or char class args"))
 403    arg)
 404
 405 (defun rx-any (form)
 406   "Parse and produce code from FORM, which is `(any ARG ...)'.
 407 ARG is optional."
 408   (rx-check form)
 409   (let* (bracket
 410          (args (mapcar #'rx-check-any (cdr form)))) ; side-effects `bracket'
 411     ;; If there was a ?- in the form, move it to the front to avoid
 412     ;; accidental range.
 413     (if (member "-" args)
 414         (setq args (cons "-" (delete "-" args))))
 415     (apply #'concat "[" bracket (append args '("]")))))
 416
 417
 418 (defun rx-check-not (arg)
 419   "Check arg ARG for Rx `not'."
 420   (unless (or (and (symbolp arg)
 421                    (string-match "\\`\\[\\[:[-a-z]:]]\\'"
 422                                  (condition-case nil
 423                                      (rx-to-string arg 'no-group)
 424                                    (error ""))))
 425               (eq arg 'word-boundary)
 426               (and (consp arg)
 427                    (memq (car arg) '(not any in syntax category))))
 428     (error "rx `not' syntax error: %s" arg))
 429   t)
 430
 431
 432 (defun rx-not (form)
 433   "Parse and produce code from FORM.  FORM is `(not ...)'."
 434   (rx-check form)
 435   (let ((result (rx-to-string (cadr form) 'no-group))
 436         case-fold-search)
 437     (cond ((string-match "\\`\\[^" result)
 438            (if (= (length result) 4)
 439                (substring result 2 3)
 440              (concat "[" (substring result 2))))
 441           ((eq ?\[ (aref result 0))
 442            (concat "[^" (substring result 1)))
 443           ((string-match "\\`\\\\[scb]" result)
 444            (concat (capitalize (substring result 0 2)) (substring result 2)))
 445           (t
 446            (concat "[^" result "]")))))
 447
 448
 449 (defun rx-not-char (form)
 450   "Parse and produce code from FORM.  FORM is `(not-char ...)'."
 451   (rx-check form)
 452   (rx-not `(not (in ,@(cdr form)))))
 453
 454
 455 (defun rx-not-syntax (form)
 456   "Parse and produce code from FORM.  FORM is `(not-syntax SYNTAX)'."
 457   (rx-check form)
 458   (rx-not `(not (syntax ,@(cdr form)))))
 459
 460
 461 (defun rx-trans-forms (form &optional skip)
 462   "If FORM's length is greater than two, transform it to length two.
 463 A form (HEAD REST ...) becomes (HEAD (and REST ...)).
 464 If SKIP is non-nil, allow that number of items after the head, i.e.
 465 `(= N REST ...)' becomes `(= N (and REST ...))' if SKIP is 1."
 466   (unless skip (setq skip 0))
 467   (let ((tail (nthcdr (1+ skip) form)))
 468     (if (= (length tail) 1)
 469         form
 470       (let ((form (copy-sequence form)))
 471         (setcdr (nthcdr skip form) (list (cons 'and tail)))
 472         form))))
 473
 474
 475 (defun rx-= (form)
 476   "Parse and produce code from FORM `(= N ...)'."
 477   (rx-check form)
 478   (setq form (rx-trans-forms form 1))
 479   (unless (and (integerp (nth 1 form))
 480                (> (nth 1 form) 0))
 481     (error "rx `=' requires positive integer first arg"))
 482   (format "%s\\{%d\\}" (rx-to-string (nth 2 form)) (nth 1 form)))
 483
 484
 485 (defun rx->= (form)
 486   "Parse and produce code from FORM `(>= N ...)'."
 487   (rx-check form)
 488   (setq form (rx-trans-forms form 1))
 489   (unless (and (integerp (nth 1 form))
 490                (> (nth 1 form) 0))
 491     (error "rx `>=' requires positive integer first arg"))
 492   (format "%s\\{%d,\\}" (rx-to-string (nth 2 form)) (nth 1 form)))
 493
 494
 495 (defun rx-** (form)
 496   "Parse and produce code from FORM `(** N M ...)'."
 497   (rx-check form)
 498   (setq form (cons 'repeat (cdr (rx-trans-forms form 2))))
 499   (rx-to-string form))
 500
 501
 502 (defun rx-repeat (form)
 503   "Parse and produce code from FORM.
 504 FORM is either `(repeat N FORM1)' or `(repeat N M FORM1)'."
 505   (rx-check form)
 506   (cond ((= (length form) 3)
 507          (unless (and (integerp (nth 1 form))
 508                       (> (nth 1 form) 0))
 509            (error "rx `repeat' requires positive integer first arg"))
 510          (format "%s\\{%d\\}" (rx-to-string (nth 2 form)) (nth 1 form)))
 511         ((or (not (integerp (nth 2 form)))
 512              (< (nth 2 form) 0)
 513              (not (integerp (nth 1 form)))
 514              (< (nth 1 form) 0)
 515              (< (nth 2 form) (nth 1 form)))
 516          (error "rx `repeat' range error"))
 517         (t
 518          (format "%s\\{%d,%d\\}" (rx-to-string (nth 3 form))
 519                  (nth 1 form) (nth 2 form)))))
 520
 521
 522 (defun rx-submatch (form)
 523   "Parse and produce code from FORM, which is `(submatch ...)'."
 524   (concat "\\("
 525           (mapconcat (function (lambda (x) (rx-to-string x 'no-group)))
 526                      (cdr form) nil)
 527           "\\)"))
 528
 529 (defun rx-backref (form)
 530   "Parse and produce code from FORM, which is `(backref N)'."
 531   (rx-check form)
 532   (format "\\%d" (nth 1 form)))
 533
 534 (defun rx-check-backref (arg)
 535   "Check arg ARG for Rx `backref'."
 536   (or (and (integerp arg) (>= arg 1) (<= arg 9))
 537       (error "rx `backref' requires numeric 1<=arg<=9: %s" arg)))
 538
 539 (defun rx-kleene (form)
 540   "Parse and produce code from FORM.
 541 FORM is `(OP FORM1)', where OP is one of the `zero-or-one',
 542 `zero-or-more' etc.  operators.
 543 If OP is one of `*', `+', `?', produce a greedy regexp.
 544 If OP is one of `*?', `+?', `??', produce a non-greedy regexp.
 545 If OP is anything else, produce a greedy regexp if `rx-greedy-flag'
 546 is non-nil."
 547   (rx-check form)
 548   (setq form (rx-trans-forms form))
 549   (let ((suffix (cond ((memq (car form) '(* + ? )) "")
 550                       ((memq (car form) '(*? +? ??)) "?")
 551                       (rx-greedy-flag "")
 552                       (t "?")))
 553         (op (cond ((memq (car form) '(* *? 0+ zero-or-more)) "*")
 554                   ((memq (car form) '(+ +? 1+ one-or-more))  "+")
 555                   (t "?")))
 556         (result (rx-to-string (cadr form) 'no-group)))
 557     (if (not (rx-atomic-p result))
 558         (setq result (concat "\\(?:" result "\\)")))
 559     (concat result op suffix)))
 560
 561 (defun rx-atomic-p (r)
 562   "Return non-nil if regexp string R is atomic.
 563 An atomic regexp R is one such that a suffix operator
 564 appended to R will apply to all of R.  For example, \"a\"
 565 \"[abc]\" and \"\\(ab\\|ab*c\\)\" are atomic and \"ab\",
 566 \"[ab]c\", and \"ab\\|ab*c\" are not atomic.
 567
 568 This function may return false negatives, but it will not
 569 return false positives.  It is nevertheless useful in
 570 situations where an efficiency shortcut can be taken iff a
 571 regexp is atomic.  The function can be improved to detect
 572 more cases of atomic regexps.  Presently, this function
 573 detects the following categories of atomic regexp;
 574
 575   a group or shy group:  \\(...\\)
 576   a character class:     [...]
 577   a single character:    a
 578
 579 On the other hand, false negatives will be returned for
 580 regexps that are atomic but end in operators, such as
 581 \"a+\".  I think these are rare.  Probably such cases could
 582 be detected without much effort.  A guarantee of no false
 583 negatives would require a theoretic specification of the set
 584 of all atomic regexps."
 585   (let ((l (length r)))
 586     (or (equal l 1)
 587         (and (>= l 6)
 588              (equal (substring r 0 2) "\\(")
 589              (equal (substring r -2) "\\)"))
 590         (and (>= l 2)
 591              (equal (substring r 0 1) "[")
 592              (equal (substring r -1) "]")))))
 593
 594
 595 (defun rx-syntax (form)
 596   "Parse and produce code from FORM, which is `(syntax SYMBOL)'."
 597   (rx-check form)
 598   (let ((syntax (assq (cadr form) rx-syntax)))
 599     (unless syntax
 600       (error "Unknown rx syntax `%s'" (cadr form)))
 601     (format "\\s%c" (cdr syntax))))
 602
 603
 604 (defun rx-check-category (form)
 605   "Check the argument FORM of a `(category FORM)'."
 606   (unless (or (integerp form)
 607               (cdr (assq form rx-categories)))
 608     (error "Unknown category `%s'" form))
 609   t)
 610
 611
 612 (defun rx-category (form)
 613   "Parse and produce code from FORM, which is `(category SYMBOL)'."
 614   (rx-check form)
 615   (let ((char (if (integerp (cadr form))
 616                   (cadr form)
 617                 (cdr (assq (cadr form) rx-categories)))))
 618     (format "\\c%c" char)))
 619
 620
 621 (defun rx-eval (form)
 622   "Parse and produce code from FORM, which is `(eval FORM)'."
 623   (rx-check form)
 624   (rx-to-string (eval (cadr form))))
 625
 626
 627 (defun rx-greedy (form)
 628   "Parse and produce code from FORM.
 629 If FORM is '(minimal-match FORM1)', non-greedy versions of `*',
 630 `+', and `?' operators will be used in FORM1.  If FORM is
 631 '(maximal-match FORM1)', greedy operators will be used."
 632   (rx-check form)
 633   (let ((rx-greedy-flag (eq (car form) 'maximal-match)))
 634     (rx-to-string (cadr form))))
 635
 636
 637 (defun rx-regexp (form)
 638   "Parse and produce code from FORM, which is `(regexp STRING)'."
 639   (rx-check form)
 640   (concat "\\(?:" (cadr form) "\\)"))
 641
 642
 643 ;;;###autoload
 644 (defun rx-to-string (form &optional no-group)
 645   "Parse and produce code for regular expression FORM.
 646 FORM is a regular expression in sexp form.
 647 NO-GROUP non-nil means don't put shy groups around the result."
 648   (cond ((stringp form)
 649          (regexp-quote form))
 650         ((integerp form)
 651          (regexp-quote (char-to-string form)))
 652         ((symbolp form)
 653          (let ((info (rx-info form)))
 654            (cond ((stringp info)
 655                   info)
 656                  ((null info)
 657                   (error "Unknown rx form `%s'" form))
 658                  (t
 659                   (funcall (nth 0 info) form)))))
 660         ((consp form)
 661          (let ((info (rx-info (car form))))
 662            (unless (consp info)
 663              (error "Unknown rx form `%s'" (car form)))
 664            (let ((result (funcall (nth 0 info) form)))
 665              (if (or no-group (string-match "\\`\\\\[(]" result))
 666                  result
 667                (concat "\\(?:" result "\\)")))))
 668         (t
 669          (error "rx syntax error at `%s'" form))))
 670
 671
 672 ;;;###autoload
 673 (defmacro rx (&rest regexps)
 674   "Translate regular expressions REGEXPS in sexp form to a regexp string.
 675 REGEXPS is a non-empty sequence of forms of the sort listed below.
 676 See also `rx-to-string' for how to do such a translation at run-time.
 677
 678 The following are valid subforms of regular expressions in sexp
 679 notation.
 680
 681 STRING
 682      matches string STRING literally.
 683
 684 CHAR
 685      matches character CHAR literally.
 686
 687 `not-newline', `nonl'
 688      matches any character except a newline.
 689                         .
 690 `anything'
 691      matches any character
 692
 693 `(any SET ...)'
 694 `(in SET ...)'
 695 `(char SET ...)'
 696      matches any character in SET ....  SET may be a character or string.
 697      Ranges of characters can be specified as `A-Z' in strings.
 698      Ranges may also be specified as conses like `(?A . ?Z)'.
 699
 700      SET may also be the name of a character class: `digit',
 701      `control', `hex-digit', `blank', `graph', `print', `alnum',
 702      `alpha', `ascii', `nonascii', `lower', `punct', `space', `upper',
 703      `word', or one of their synonyms.
 704
 705 `(not (any SET ...))'
 706      matches any character not in SET ...
 707
 708 `line-start', `bol'
 709      matches the empty string, but only at the beginning of a line
 710      in the text being matched
 711
 712 `line-end', `eol'
 713      is similar to `line-start' but matches only at the end of a line
 714
 715 `string-start', `bos', `bot'
 716      matches the empty string, but only at the beginning of the
 717      string being matched against.
 718
 719 `string-end', `eos', `eot'
 720      matches the empty string, but only at the end of the
 721      string being matched against.
 722
 723 `buffer-start'
 724      matches the empty string, but only at the beginning of the
 725      buffer being matched against.  Actually equivalent to `string-start'.
 726
 727 `buffer-end'
 728      matches the empty string, but only at the end of the
 729      buffer being matched against.  Actually equivalent to `string-end'.
 730
 731 `point'
 732      matches the empty string, but only at point.
 733
 734 `word-start', `bow'
 735      matches the empty string, but only at the beginning or end of a
 736      word.
 737
 738 `word-end', `eow'
 739      matches the empty string, but only at the end of a word.
 740
 741 `word-boundary'
 742      matches the empty string, but only at the beginning or end of a
 743      word.
 744
 745 `(not word-boundary)'
 746 `not-word-boundary'
 747      matches the empty string, but not at the beginning or end of a
 748      word.
 749
 750 `digit', `numeric', `num'
 751      matches 0 through 9.
 752
 753 `control', `cntrl'
 754      matches ASCII control characters.
 755
 756 `hex-digit', `hex', `xdigit'
 757      matches 0 through 9, a through f and A through F.
 758
 759 `blank'
 760      matches space and tab only.
 761
 762 `graphic', `graph'
 763      matches graphic characters--everything except ASCII control chars,
 764      space, and DEL.
 765
 766 `printing', `print'
 767      matches printing characters--everything except ASCII control chars
 768      and DEL.
 769
 770 `alphanumeric', `alnum'
 771      matches letters and digits.  (But at present, for multibyte characters,
 772      it matches anything that has word syntax.)
 773
 774 `letter', `alphabetic', `alpha'
 775      matches letters.  (But at present, for multibyte characters,
 776      it matches anything that has word syntax.)
 777
 778 `ascii'
 779      matches ASCII (unibyte) characters.
 780
 781 `nonascii'
 782      matches non-ASCII (multibyte) characters.
 783
 784 `lower', `lower-case'
 785      matches anything lower-case.
 786
 787 `upper', `upper-case'
 788      matches anything upper-case.
 789
 790 `punctuation', `punct'
 791      matches punctuation.  (But at present, for multibyte characters,
 792      it matches anything that has non-word syntax.)
 793
 794 `space', `whitespace', `white'
 795      matches anything that has whitespace syntax.
 796
 797 `word', `wordchar'
 798      matches anything that has word syntax.
 799
 800 `not-wordchar'
 801      matches anything that has non-word syntax.
 802
 803 `(syntax SYNTAX)'
 804      matches a character with syntax SYNTAX.  SYNTAX must be one
 805      of the following symbols, or a symbol corresponding to the syntax
 806      character, e.g. `\\.' for `\\s.'.
 807
 808      `whitespace'               (\\s- in string notation)
 809      `punctuation'              (\\s.)
 810      `word'                     (\\sw)
 811      `symbol'                   (\\s_)
 812      `open-parenthesis'         (\\s()
 813      `close-parenthesis'        (\\s))
 814      `expression-prefix'        (\\s')
 815      `string-quote'             (\\s\")
 816      `paired-delimiter'         (\\s$)
 817      `escape'                   (\\s\\)
 818      `character-quote'          (\\s/)
 819      `comment-start'            (\\s<)
 820      `comment-end'              (\\s>)
 821      `string-delimiter'         (\\s|)
 822      `comment-delimiter'        (\\s!)
 823
 824 `(not (syntax SYNTAX))'
 825      matches a character that doesn't have syntax SYNTAX.
 826
 827 `(category CATEGORY)'
 828      matches a character with category CATEGORY.  CATEGORY must be either
 829      a category character (C in `\\cC') or one of the following symbols.
 830
 831      `consonant'                        (\\c0 in string notation)
 832      `base-vowel'                       (\\c1)
 833      `upper-diacritical-mark'           (\\c2)
 834      `lower-diacritical-mark'           (\\c3)
 835      `tone-mark'                        (\\c4)
 836      `symbol'                           (\\c5)
 837      `digit'                            (\\c6)
 838      `vowel-modifying-diacritical-mark' (\\c7)
 839      `vowel-sign'                       (\\c8)
 840      `semivowel-lower'                  (\\c9)
 841      `not-at-end-of-line'               (\\c<)
 842      `not-at-beginning-of-line'         (\\c>)
 843      `alpha-numeric-two-byte'           (\\cA)
 844      `chinse-two-byte'                  (\\cC)
 845      `greek-two-byte'                   (\\cG)
 846      `japanese-hiragana-two-byte'       (\\cH)
 847      `indian-tow-byte'                  (\\cI)
 848      `japanese-katakana-two-byte'       (\\cK)
 849      `korean-hangul-two-byte'           (\\cN)
 850      `cyrillic-two-byte'                (\\cY)
 851      `combining-diacritic'              (\\c^)
 852      `ascii'                            (\\ca)
 853      `arabic'                           (\\cb)
 854      `chinese'                          (\\cc)
 855      `ethiopic'                         (\\ce)
 856      `greek'                            (\\cg)
 857      `korean'                           (\\ch)
 858      `indian'                           (\\ci)
 859      `japanese'                         (\\cj)
 860      `japanese-katakana'                (\\ck)
 861      `latin'                            (\\cl)
 862      `lao'                              (\\co)
 863      `tibetan'                          (\\cq)
 864      `japanese-roman'                   (\\cr)
 865      `thai'                             (\\ct)
 866      `vietnamese'                       (\\cv)
 867      `hebrew'                           (\\cw)
 868      `cyrillic'                         (\\cy)
 869      `can-break'                        (\\c|)
 870
 871 `(not (category CATEGORY))'
 872      matches a character that doesn't have category CATEGORY.
 873
 874 `(and SEXP1 SEXP2 ...)'
 875 `(: SEXP1 SEXP2 ...)'
 876 `(seq SEXP1 SEXP2 ...)'
 877 `(sequence SEXP1 SEXP2 ...)'
 878      matches what SEXP1 matches, followed by what SEXP2 matches, etc.
 879
 880 `(submatch SEXP1 SEXP2 ...)'
 881 `(group SEXP1 SEXP2 ...)'
 882      like `and', but makes the match accessible with `match-end',
 883      `match-beginning', and `match-string'.  `group' is another
 884      name for `submatch'.
 887
 888 `(or SEXP1 SEXP2 ...)'
 889 `(| SEXP1 SEXP2 ...)'
 890      matches anything that matches SEXP1 or SEXP2, etc.  If all
 891      args are strings, use `regexp-opt' to optimize the resulting
 892      regular expression.
 893
 894 `(minimal-match SEXP)'
 895      produce a non-greedy regexp for SEXP.  Normally, regexps matching
 896      zero or more occurrences of something are \"greedy\" in that they
 897      match as much as they can, as long as the overall regexp can
 898      still match.  A non-greedy regexp matches as little as possible.
 899
 900 `(maximal-match SEXP)'
 901      produce a greedy regexp for SEXP.  This is the default.
 902
 903 Below, `SEXP ...' represents a sequence of regexp forms, treated as if
 904 enclosed in `(and ...)'.
 905
 906 `(zero-or-more SEXP ...)'
 907 `(0+ SEXP ...)'
 908      matches zero or more occurrences of what SEXP ... matches.
 909
 910 `(* SEXP ...)'
 911      like `zero-or-more', but always produces a greedy regexp, independent
 912      of `rx-greedy-flag'.
 913
 914 `(*? SEXP ...)'
 915      like `zero-or-more', but always produces a non-greedy regexp,
 916      independent of `rx-greedy-flag'.
 917
 918 `(one-or-more SEXP ...)'
 919 `(1+ SEXP ...)'
 920      matches one or more occurrences of SEXP ...
 921
 922 `(+ SEXP ...)'
 923      like `one-or-more', but always produces a greedy regexp.
 924
 925 `(+? SEXP ...)'
 926      like `one-or-more', but always produces a non-greedy regexp.
 927
 928 `(zero-or-one SEXP ...)'
 929 `(optional SEXP ...)'
 930 `(opt SEXP ...)'
 931      matches zero or one occurrences of SEXP ...
 932
 933 `(? SEXP ...)'
 934      like `zero-or-one', but always produces a greedy regexp.
 935
 936 `(?? SEXP ...)'
 937      like `zero-or-one', but always produces a non-greedy regexp.
 938
 939 `(repeat N SEXP)'
 940 `(= N SEXP ...)'
 941      matches N occurrences.
 942
 943 `(>= N SEXP ...)'
 944      matches N or more occurrences.
 945
 946 `(repeat N M SEXP)'
 947 `(** N M SEXP ...)'
 948      matches N to M occurrences.
 949
 950 `(backref N)'
 951      matches what was matched previously by submatch N.
 958
 959 `(eval FORM)'
 960      evaluate FORM and insert result.  If result is a string,
 961      `regexp-quote' it.
 962
 963 `(regexp REGEXP)'
 964      include REGEXP in string notation in the result."
 965   (cond ((null regexps)
 966          (error "No regexp"))
 967         ((cdr regexps)
 968          (rx-to-string `(and ,@regexps) t))
 969         (t
 970          (rx-to-string (car regexps) t))))
 971 \f
 972 ;; ;; sregex.el replacement
 973
 974 ;; ;;;###autoload (provide 'sregex)
 975 ;; ;;;###autoload (autoload 'sregex "rx")
 976 ;; (defalias 'sregex 'rx-to-string)
 977 ;; ;;;###autoload (autoload 'sregexq "rx" nil nil 'macro)
 978 ;; (defalias 'sregexq 'rx)
 979 \f
 980 (provide 'rx)
 981
 982 ;;; arch-tag: 12d01a63-0008-42bb-ab8c-1c7d63be370b
 983 ;;; rx.el ends here