lisp/emacs-lisp/rx.el

   1 ;;; rx.el --- sexp notation for regular expressions
   2
   3 ;; Copyright (C) 2001, 2002, 2003, 2004, 2005,
   4 ;;   2006, 2007, 2008, 2009, 2010 Free Software Foundation, Inc.
   5
   6 ;; Author: Gerd Moellmann <gerd@gnu.org>
   7 ;; Maintainer: FSF
   8 ;; Keywords: strings, regexps, extensions
   9
  10 ;; This file is part of GNU Emacs.
  11
  12 ;; GNU Emacs is free software: you can redistribute it and/or modify
  13 ;; it under the terms of the GNU General Public License as published by
  14 ;; the Free Software Foundation, either version 3 of the License, or
  15 ;; (at your option) any later version.
  16
  17 ;; GNU Emacs is distributed in the hope that it will be useful,
  18 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 ;; GNU General Public License for more details.
  21
  22 ;; You should have received a copy of the GNU General Public License
  23 ;; along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.
  24
  25 ;;; Commentary:
  26
  27 ;; This is another implementation of sexp-form regular expressions.
  28 ;; It was unfortunately written without being aware of the Sregex
  29 ;; package coming with Emacs, but as things stand, Rx completely
  30 ;; covers all regexp features, which Sregex doesn't, doesn't suffer
  31 ;; from the bugs mentioned in the commentary section of Sregex, and
  32 ;; uses a nicer syntax (IMHO, of course :-).
  33
  34 ;; This significantly extended version of the original, is almost
  35 ;; compatible with Sregex.  The only incompatibility I (fx) know of is
  36 ;; that the `repeat' form can't have multiple regexp args.
  37
  38 ;; Now alternative forms are provided for a degree of compatibility
  39 ;; with Shivers' attempted definitive SRE notation
  40 ;; <URL:http://www.ai.mit.edu/~/shivers/sre.txt>.  SRE forms not
  41 ;; catered for include: dsm, uncase, w/case, w/nocase, ,@<exp>,
  42 ;; ,<exp>, (word ...), word+, posix-string, and character class forms.
  43 ;; Some forms are inconsistent with SRE, either for historical reasons
  44 ;; or because of the implementation -- simple translation into Emacs
  45 ;; regexp strings.  These include: any, word.  Also, case-sensitivity
  46 ;; and greediness are controlled by variables external to the regexp,
  47 ;; and you need to feed the forms to the `posix-' functions to get
  48 ;; SRE's POSIX semantics.  There are probably more difficulties.
  49
  50 ;; Rx translates a sexp notation for regular expressions into the
  51 ;; usual string notation.  The translation can be done at compile-time
  52 ;; by using the `rx' macro.  It can be done at run-time by calling
  53 ;; function `rx-to-string'.  See the documentation of `rx' for a
  54 ;; complete description of the sexp notation.
  55 ;;
  56 ;; Some examples of string regexps and their sexp counterparts:
  57 ;;
  58 ;; "^[a-z]*"
  59 ;; (rx (and line-start (0+ (in "a-z"))))
  60 ;;
  61 ;; "\n[^ \t]"
  62 ;; (rx (and "\n" (not blank))), or
  63 ;; (rx (and "\n" (not (any " \t"))))
  64 ;;
  65 ;; "\\*\\*\\* EOOH \\*\\*\\*\n"
  66 ;; (rx "*** EOOH ***\n")
  67 ;;
  68 ;; "\\<\\(catch\\|finally\\)\\>[^_]"
  69 ;; (rx (and word-start (submatch (or "catch" "finally")) word-end
  70 ;;          (not (any ?_))))
  71 ;;
  72 ;; "[ \t\n]*:\\([^:]+\\|$\\)"
  73 ;; (rx (and (zero-or-more (in " \t\n")) ":"
  74 ;;          (submatch (or line-end (one-or-more (not (any ?:)))))))
  75 ;;
  76 ;; "^content-transfer-encoding:\\(\n?[\t ]\\)*quoted-printable\\(\n?[\t ]\\)*"
  77 ;; (rx (and line-start
  78 ;;          "content-transfer-encoding:"
  79 ;;          (+ (? ?\n)) blank
  80 ;;          "quoted-printable"
  81 ;;          (+ (? ?\n)) blank))
  82 ;;
  83 ;; (concat "^\\(?:" something-else "\\)")
  84 ;; (rx (and line-start (eval something-else))), statically or
  85 ;; (rx-to-string '(and line-start ,something-else)), dynamically.
  86 ;;
  87 ;; (regexp-opt '(STRING1 STRING2 ...))
  88 ;; (rx (or STRING1 STRING2 ...)), or in other words, `or' automatically
  89 ;; calls `regexp-opt' as needed.
  90 ;;
  91 ;; "^;;\\s-*\n\\|^\n"
  92 ;; (rx (or (and line-start ";;" (0+ space) ?\n)
  93 ;;         (and line-start ?\n)))
  94 ;;
  95 ;; "\\$[I]d: [^ ]+ \\([^ ]+\\) "
  96 ;; (rx (and "$Id: "
  97 ;;          (1+ (not (in " ")))
  98 ;;          " "
  99 ;;          (submatch (1+ (not (in " "))))
 100 ;;          " "))
 101 ;;
 102 ;; "\\\\\\\\\\[\\w+"
 103 ;; (rx (and ?\\ ?\\ ?\[ (1+ word)))
 104 ;;
 105 ;; etc.
 106
 107 ;;; History:
 108 ;;
 109
 110 ;;; Code:
 111
 112 (defconst rx-constituents
 113   '((and                . (rx-and 1 nil))
 114     (seq                . and)          ; SRE
 115     (:                  . and)          ; SRE
 116     (sequence           . and)          ; sregex
 117     (or                 . (rx-or 1 nil))
 118     (|                  . or)           ; SRE
 119     (not-newline        . ".")
 120     (nonl               . not-newline)  ; SRE
 121     (anything           . (rx-anything 0 nil))
 122     (any                . (rx-any 1 nil rx-check-any)) ; inconsistent with SRE
 123     (in                 . any)
 124     (char               . any)          ; sregex
 125     (not-char           . (rx-not-char 1 nil rx-check-any)) ; sregex
 126     (not                . (rx-not 1 1 rx-check-not))
 127     ;; Partially consistent with sregex, whose `repeat' is like our
 128     ;; `**'.  (`repeat' with optional max arg and multiple sexp forms
 129     ;; is ambiguous.)
 130     (repeat             . (rx-repeat 2 3))
 131     (=                  . (rx-= 2 nil))    ; SRE
 132     (>=                 . (rx->= 2 nil))   ; SRE
 133     (**                 . (rx-** 2 nil))   ; SRE
 134     (submatch           . (rx-submatch 1 nil)) ; SRE
 135     (group              . submatch)
 136     (zero-or-more       . (rx-kleene 1 nil))
 137     (one-or-more        . (rx-kleene 1 nil))
 138     (zero-or-one        . (rx-kleene 1 nil))
 139     (\?                 . zero-or-one)  ; SRE
 140     (\??                . zero-or-one)
 141     (*                  . zero-or-more) ; SRE
 142     (*?                 . zero-or-more)
 143     (0+                 . zero-or-more)
 144     (+                  . one-or-more)  ; SRE
 145     (+?                 . one-or-more)
 146     (1+                 . one-or-more)
 147     (optional           . zero-or-one)
 148     (opt                . zero-or-one)  ; sregex
 149     (minimal-match      . (rx-greedy 1 1))
 150     (maximal-match      . (rx-greedy 1 1))
 151     (backref            . (rx-backref 1 1 rx-check-backref))
 152     (line-start         . "^")
 153     (bol                . line-start)   ; SRE
 154     (line-end           . "$")
 155     (eol                . line-end)     ; SRE
 156     (string-start       . "\\`")
 157     (bos                . string-start) ; SRE
 158     (bot                . string-start) ; sregex
 159     (string-end         . "\\'")
 160     (eos                . string-end)   ; SRE
 161     (eot                . string-end)   ; sregex
 162     (buffer-start       . "\\`")
 163     (buffer-end         . "\\'")
 164     (point              . "\\=")
 165     (word-start         . "\\<")
 166     (bow                . word-start)   ; SRE
 167     (word-end           . "\\>")
 168     (eow                . word-end)     ; SRE
 169     (word-boundary      . "\\b")
 170     (not-word-boundary  . "\\B")        ; sregex
 171     (symbol-start       . "\\_<")
 172     (symbol-end         . "\\_>")
 173     (syntax             . (rx-syntax 1 1))
 174     (not-syntax         . (rx-not-syntax 1 1)) ; sregex
 175     (category           . (rx-category 1 1 rx-check-category))
 176     (eval               . (rx-eval 1 1))
 177     (regexp             . (rx-regexp 1 1 stringp))
 178     (digit              . "[[:digit:]]")
 179     (numeric            . digit)        ; SRE
 180     (num                . digit)        ; SRE
 181     (control            . "[[:cntrl:]]") ; SRE
 182     (cntrl              . control)       ; SRE
 183     (hex-digit          . "[[:xdigit:]]") ; SRE
 184     (hex                . hex-digit)      ; SRE
 185     (xdigit             . hex-digit)      ; SRE
 186     (blank              . "[[:blank:]]")  ; SRE
 187     (graphic            . "[[:graph:]]")  ; SRE
 188     (graph              . graphic)        ; SRE
 189     (printing           . "[[:print:]]")  ; SRE
 190     (print              . printing)       ; SRE
 191     (alphanumeric       . "[[:alnum:]]")  ; SRE
 192     (alnum              . alphanumeric)   ; SRE
 193     (letter             . "[[:alpha:]]")
 194     (alphabetic         . letter)       ; SRE
 195     (alpha              . letter)       ; SRE
 196     (ascii              . "[[:ascii:]]") ; SRE
 197     (nonascii           . "[[:nonascii:]]")
 198     (lower              . "[[:lower:]]") ; SRE
 199     (lower-case         . lower)         ; SRE
 200     (punctuation        . "[[:punct:]]") ; SRE
 201     (punct              . punctuation)   ; SRE
 202     (space              . "[[:space:]]") ; SRE
 203     (whitespace         . space)         ; SRE
 204     (white              . space)         ; SRE
 205     (upper              . "[[:upper:]]") ; SRE
 206     (upper-case         . upper)         ; SRE
 207     (word               . "[[:word:]]")  ; inconsistent with SRE
 208     (wordchar           . word)          ; sregex
 209     (not-wordchar       . "\\W"))
 210   "Alist of sexp form regexp constituents.
 211 Each element of the alist has the form (SYMBOL . DEFN).
 212 SYMBOL is a valid constituent of sexp regular expressions.
 213 If DEFN is a string, SYMBOL is translated into DEFN.
 214 If DEFN is a symbol, use the definition of DEFN, recursively.
 215 Otherwise, DEFN must be a list (FUNCTION MIN-ARGS MAX-ARGS PREDICATE).
 216 FUNCTION is used to produce code for SYMBOL.  MIN-ARGS and MAX-ARGS
 217 are the minimum and maximum number of arguments the function-form
 218 sexp constituent SYMBOL may have in sexp regular expressions.
 219 MAX-ARGS nil means no limit.  PREDICATE, if specified, means that
 220 all arguments must satisfy PREDICATE.")
 221
 222
 223 (defconst rx-syntax
 224   '((whitespace         . ?-)
 225     (punctuation        . ?.)
 226     (word               . ?w)
 227     (symbol             . ?_)
 228     (open-parenthesis   . ?\()
 229     (close-parenthesis  . ?\))
 230     (expression-prefix  . ?\')
 231     (string-quote       . ?\")
 232     (paired-delimiter   . ?$)
 233     (escape             . ?\\)
 234     (character-quote    . ?/)
 235     (comment-start      . ?<)
 236     (comment-end        . ?>)
 237     (string-delimiter   . ?|)
 238     (comment-delimiter  . ?!))
 239   "Alist mapping Rx syntax symbols to syntax characters.
 240 Each entry has the form (SYMBOL . CHAR), where SYMBOL is a valid
 241 symbol in `(syntax SYMBOL)', and CHAR is the syntax character
 242 corresponding to SYMBOL, as it would be used with \\s or \\S in
 243 regular expressions.")
 244
 245
 246 (defconst rx-categories
 247   '((consonant                  . ?0)
 248     (base-vowel                 . ?1)
 249     (upper-diacritical-mark     . ?2)
 250     (lower-diacritical-mark     . ?3)
 251     (tone-mark                  . ?4)
 252     (symbol                     . ?5)
 253     (digit                      . ?6)
 254     (vowel-modifying-diacritical-mark . ?7)
 255     (vowel-sign                 . ?8)
 256     (semivowel-lower            . ?9)
 257     (not-at-end-of-line         . ?<)
 258     (not-at-beginning-of-line   . ?>)
 259     (alpha-numeric-two-byte     . ?A)
 260     (chinse-two-byte            . ?C)
 261     (greek-two-byte             . ?G)
 262     (japanese-hiragana-two-byte . ?H)
 263     (indian-two-byte            . ?I)
 264     (japanese-katakana-two-byte . ?K)
 265     (korean-hangul-two-byte     . ?N)
 266     (cyrillic-two-byte          . ?Y)
 267     (combining-diacritic        . ?^)
 268     (ascii                      . ?a)
 269     (arabic                     . ?b)
 270     (chinese                    . ?c)
 271     (ethiopic                   . ?e)
 272     (greek                      . ?g)
 273     (korean                     . ?h)
 274     (indian                     . ?i)
 275     (japanese                   . ?j)
 276     (japanese-katakana          . ?k)
 277     (latin                      . ?l)
 278     (lao                        . ?o)
 279     (tibetan                    . ?q)
 280     (japanese-roman             . ?r)
 281     (thai                       . ?t)
 282     (vietnamese                 . ?v)
 283     (hebrew                     . ?w)
 284     (cyrillic                   . ?y)
 285     (can-break                  . ?|))
 286   "Alist mapping symbols to category characters.
 287 Each entry has the form (SYMBOL . CHAR), where SYMBOL is a valid
 288 symbol in `(category SYMBOL)', and CHAR is the category character
 289 corresponding to SYMBOL, as it would be used with `\\c' or `\\C' in
 290 regular expression strings.")
 291
 292
 293 (defvar rx-greedy-flag t
 294   "Non-nil means produce greedy regular expressions for `zero-or-one',
 295 `zero-or-more', and `one-or-more'.  Dynamically bound.")
 296
 297
 298 (defun rx-info (op)
 299   "Return parsing/code generation info for OP.
 300 If OP is the space character ASCII 32, return info for the symbol `?'.
 301 If OP is the character `?', return info for the symbol `??'.
 302 See also `rx-constituents'."
 303   (cond ((eq op ? ) (setq op '\?))
 304         ((eq op ??) (setq op '\??)))
 305   (while (and (not (null op)) (symbolp op))
 306     (setq op (cdr (assq op rx-constituents))))
 307   op)
 308
 309
 310 (defun rx-check (form)
 311   "Check FORM according to its car's parsing info."
 312   (unless (listp form)
 313     (error "rx `%s' needs argument(s)" form))
 314   (let* ((rx (rx-info (car form)))
 315          (nargs (1- (length form)))
 316          (min-args (nth 1 rx))
 317          (max-args (nth 2 rx))
 318          (type-pred (nth 3 rx)))
 319     (when (and (not (null min-args))
 320                (< nargs min-args))
 321       (error "rx form `%s' requires at least %d args"
 322              (car form) min-args))
 323     (when (and (not (null max-args))
 324                (> nargs max-args))
 325       (error "rx form `%s' accepts at most %d args"
 326              (car form) max-args))
 327     (when (not (null type-pred))
 328       (dolist (sub-form (cdr form))
 329         (unless (funcall type-pred sub-form)
 330           (error "rx form `%s' requires args satisfying `%s'"
 331                  (car form) type-pred))))))
 332
 333
 334 (defun rx-group-if (regexp group)
 335   "Put shy groups around REGEXP if seemingly necessary when GROUP
 336 is non-nil."
 337   (cond
 338    ;; for some repetition
 339    ((eq group '*) (if (rx-atomic-p regexp) (setq group nil)))
 340    ;; for concatenation
 341    ((eq group ':)
 342     (if (rx-atomic-p
 343          (if (string-match
 344               "\\(?:[?*+]\\??\\|\\\\{[0-9]*,?[0-9]*\\\\}\\)\\'" regexp)
 345              (substring regexp 0 (match-beginning 0))
 346            regexp))
 347         (setq group nil)))
 348    ;; for OR
 349    ((eq group '|) (setq group nil))
 350    ;; do anyway
 351    ((eq group t))
 352    ((rx-atomic-p regexp t) (setq group nil)))
 353   (if group
 354       (concat "\\(?:" regexp "\\)")
 355     regexp))
 356
 357
 358 (defvar rx-parent)
 359 ;; dynamically bound in some functions.
 360
 361
 362 (defun rx-and (form)
 363   "Parse and produce code from FORM.
 364 FORM is of the form `(and FORM1 ...)'."
 365   (rx-check form)
 366   (rx-group-if
 367    (mapconcat (lambda (x) (rx-form x ':)) (cdr form) nil)
 368    (and (memq rx-parent '(* t)) rx-parent)))
 369
 370
 371 (defun rx-or (form)
 372   "Parse and produce code from FORM, which is `(or FORM1 ...)'."
 373   (rx-check form)
 374   (rx-group-if
 375    (if (memq nil (mapcar 'stringp (cdr form)))
 376        (mapconcat (lambda (x) (rx-form x '|)) (cdr form) "\\|")
 377      (regexp-opt (cdr form)))
 378    (and (memq rx-parent '(: * t)) rx-parent)))
 379
 380
 381 (defun rx-anything (form)
 382   "Match any character."
 383   (if (consp form)
 384       (error "rx `anythng' syntax error: %s" form))
 385   (rx-or (list 'or 'not-newline ?\n)))
 386
 387
 388 (defun rx-any-delete-from-range (char ranges)
 389   "Delete by side effect character CHAR from RANGES.
 390 Only both edges of each range is checked."
 391   (let (m)
 392     (cond
 393      ((memq char ranges) (setq ranges (delq char ranges)))
 394      ((setq m (assq char ranges))
 395       (if (eq (1+ char) (cdr m))
 396           (setcar (memq m ranges) (1+ char))
 397         (setcar m (1+ char))))
 398      ((setq m (rassq char ranges))
 399       (if (eq (1- char) (car m))
 400           (setcar (memq m ranges) (1- char))
 401         (setcdr m (1- char)))))
 402     ranges))
 403
 404
 405 (defun rx-any-condense-range (args)
 406   "Condense by side effect ARGS as range for Rx `any'."
 407   (let (str
 408         l)
 409     ;; set STR list of all strings
 410     ;; set L list of all ranges
 411     (mapc (lambda (e) (cond ((stringp e) (push e str))
 412                             ((numberp e) (push (cons e e) l))
 413                             (t (push e l))))
 414           args)
 415     ;; condense overlapped ranges in L
 416     (let ((tail (setq l (sort l #'car-less-than-car)))
 417           d)
 418       (while (setq d (cdr tail))
 419         (if (>= (cdar tail) (1- (caar d)))
 420             (progn
 421               (setcdr (car tail) (max (cdar tail) (cdar d)))
 422               (setcdr tail (cdr d)))
 423           (setq tail d))))
 424     ;; Separate small ranges to single number, and delete dups.
 425     (nconc
 426      (apply #'nconc
 427             (mapcar (lambda (e)
 428                       (cond
 429                        ((= (car e) (cdr e)) (list (car e)))
 430                        ((= (1+ (car e)) (cdr e)) (list (car e) (cdr e)))
 431                        ((list e))))
 432                     l))
 433      (delete-dups str))))
 434
 435
 436 (defun rx-check-any-string (str)
 437   "Check string argument STR for Rx `any'."
 438   (let ((i 0)
 439         c1 c2 l)
 440     (if (= 0 (length str))
 441         (error "String arg for Rx `any' must not be empty"))
 442     (while (string-match ".-." str i)
 443       ;; string before range: convert it to characters
 444       (if (< i (match-beginning 0))
 445           (setq l (nconc
 446                    l
 447                    (append (substring str i (match-beginning 0)) nil))))
 448       ;; range
 449       (setq i (match-end 0)
 450             c1 (aref str (match-beginning 0))
 451             c2 (aref str (1- i)))
 452       (cond
 453        ((< c1 c2) (setq l (nconc l (list (cons c1 c2)))))
 454        ((= c1 c2) (setq l (nconc l (list c1))))))
 455     ;; rest?
 456     (if (< i (length str))
 457         (setq l (nconc l (append (substring str i) nil))))
 458     l))
 459
 460
 461 (defun rx-check-any (arg)
 462    "Check arg ARG for Rx `any'."
 463    (cond
 464     ((integerp arg) (list arg))
 465     ((symbolp arg)
 466      (let ((translation (condition-case nil
 467                             (rx-form arg)
 468                           (error nil))))
 469        (if (or (null translation)
 470                (null (string-match "\\`\\[\\[:[-a-z]+:\\]\\]\\'" translation)))
 471            (error "Invalid char class `%s' in Rx `any'" arg))
 472        (list (substring translation 1 -1)))) ; strip outer brackets
 473     ((and (integerp (car-safe arg)) (integerp (cdr-safe arg)))
 474      (list arg))
 475     ((stringp arg) (rx-check-any-string arg))
 476     ((error
 477       "rx `any' requires string, character, char pair or char class args"))))
 478
 479
 480 (defun rx-any (form)
 481   "Parse and produce code from FORM, which is `(any ARG ...)'.
 482 ARG is optional."
 483   (rx-check form)
 484   (let* ((args (rx-any-condense-range
 485                 (apply
 486                  #'nconc
 487                  (mapcar #'rx-check-any (cdr form)))))
 488          m
 489          s)
 490     (cond
 491      ;; single close bracket
 492      ;;  => "[]...-]" or "[]...--.]"
 493      ((memq ?\] args)
 494       ;; set ] at the beginning
 495       (setq args (cons ?\] (delq ?\] args)))
 496       ;; set - at the end
 497       (if (or (memq ?- args) (assq ?- args))
 498           (setq args (nconc (rx-any-delete-from-range ?- args)
 499                             (list ?-)))))
 500      ;; close bracket starts a range
 501      ;;  => "[]-....-]" or "[]-.--....]"
 502      ((setq m (assq ?\] args))
 503       ;; bring it to the beginning
 504       (setq args (cons m (delq m args)))
 505       (cond ((memq ?- args)
 506              ;; to the end
 507              (setq args (nconc (delq ?- args) (list ?-))))
 508             ((setq m (assq ?- args))
 509              ;; next to the bracket's range, make the second range
 510              (setcdr args (cons m (delq m args))))))
 511      ;; bracket in the end range
 512      ;;  => "[]...-]"
 513      ((setq m (rassq ?\] args))
 514       ;; set ] at the beginning
 515       (setq args (cons ?\] (rx-any-delete-from-range ?\] args)))
 516       ;; set - at the end
 517       (if (or (memq ?- args) (assq ?- args))
 518           (setq args (nconc (rx-any-delete-from-range ?- args)
 519                             (list ?-)))))
 520      ;; {no close bracket appears}
 521      ;;
 522      ;; bring single bar to the beginning
 523      ((memq ?- args)
 524       (setq args (cons ?- (delq ?- args))))
 525      ;; bar start a range, bring it to the beginning
 526      ((setq m (assq ?- args))
 527       (setq args (cons m (delq m args))))
 528      ;;
 529      ;; hat at the beginning?
 530      ((or (eq (car args) ?^) (eq (car-safe (car args)) ?^))
 531       (setq args (if (cdr args)
 532                      `(,(cadr args) ,(car args) ,@(cddr args))
 533                    (nconc (rx-any-delete-from-range ?^ args)
 534                           (list ?^))))))
 535     ;; some 1-char?
 536     (if (and (null (cdr args)) (numberp (car args))
 537              (or (= 1 (length
 538                        (setq s (regexp-quote (string (car args))))))
 539                  (and (equal (car args) ?^) ;; unnecessary predicate?
 540                       (null (eq rx-parent '!)))))
 541         s
 542       (concat "["
 543               (mapconcat
 544                (lambda (e) (cond
 545                             ((numberp e) (string e))
 546                             ((consp e)
 547                              (if (and (= (1+ (car e)) (cdr e))
 548                                       ;; rx-any-condense-range should
 549                                       ;; prevent this case from happening.
 550                                       (null (memq (car e) '(?\] ?-)))
 551                                       (null (memq (cdr e) '(?\] ?-))))
 552                                  (string (car e) (cdr e))
 553                                (string (car e) ?- (cdr e))))
 554                             (e)))
 555                args
 556                nil)
 557               "]"))))
 558
 559
 560 (defun rx-check-not (arg)
 561   "Check arg ARG for Rx `not'."
 562   (unless (or (and (symbolp arg)
 563                    (string-match "\\`\\[\\[:[-a-z]+:\\]\\]\\'"
 564                                  (condition-case nil
 565                                      (rx-form arg)
 566                                    (error ""))))
 567               (eq arg 'word-boundary)
 568               (and (consp arg)
 569                    (memq (car arg) '(not any in syntax category))))
 570     (error "rx `not' syntax error: %s" arg))
 571   t)
 572
 573
 574 (defun rx-not (form)
 575   "Parse and produce code from FORM.  FORM is `(not ...)'."
 576   (rx-check form)
 577   (let ((result (rx-form (cadr form) '!))
 578         case-fold-search)
 579     (cond ((string-match "\\`\\[^" result)
 580            (cond
 581             ((equal result "[^]") "[^^]")
 582             ((and (= (length result) 4) (null (eq rx-parent '!)))
 583              (regexp-quote (substring result 2 3)))
 584             ((concat "[" (substring result 2)))))
 585           ((eq ?\[ (aref result 0))
 586            (concat "[^" (substring result 1)))
 587           ((string-match "\\`\\\\[scbw]" result)
 588            (concat (upcase (substring result 0 2))
 589                    (substring result 2)))
 590           ((string-match "\\`\\\\[SCBW]" result)
 591            (concat (downcase (substring result 0 2))
 592                    (substring result 2)))
 593           (t
 594            (concat "[^" result "]")))))
 595
 596
 597 (defun rx-not-char (form)
 598   "Parse and produce code from FORM.  FORM is `(not-char ...)'."
 599   (rx-check form)
 600   (rx-not `(not (in ,@(cdr form)))))
 601
 602
 603 (defun rx-not-syntax (form)
 604   "Parse and produce code from FORM.  FORM is `(not-syntax SYNTAX)'."
 605   (rx-check form)
 606   (rx-not `(not (syntax ,@(cdr form)))))
 607
 608
 609 (defun rx-trans-forms (form &optional skip)
 610   "If FORM's length is greater than two, transform it to length two.
 611 A form (HEAD REST ...) becomes (HEAD (and REST ...)).
 612 If SKIP is non-nil, allow that number of items after the head, i.e.
 613 `(= N REST ...)' becomes `(= N (and REST ...))' if SKIP is 1."
 614   (unless skip (setq skip 0))
 615   (let ((tail (nthcdr (1+ skip) form)))
 616     (if (= (length tail) 1)
 617         form
 618       (let ((form (copy-sequence form)))
 619         (setcdr (nthcdr skip form) (list (cons 'and tail)))
 620         form))))
 621
 622
 623 (defun rx-= (form)
 624   "Parse and produce code from FORM `(= N ...)'."
 625   (rx-check form)
 626   (setq form (rx-trans-forms form 1))
 627   (unless (and (integerp (nth 1 form))
 628                (> (nth 1 form) 0))
 629     (error "rx `=' requires positive integer first arg"))
 630   (format "%s\\{%d\\}" (rx-form (nth 2 form) '*) (nth 1 form)))
 631
 632
 633 (defun rx->= (form)
 634   "Parse and produce code from FORM `(>= N ...)'."
 635   (rx-check form)
 636   (setq form (rx-trans-forms form 1))
 637   (unless (and (integerp (nth 1 form))
 638                (> (nth 1 form) 0))
 639     (error "rx `>=' requires positive integer first arg"))
 640   (format "%s\\{%d,\\}" (rx-form (nth 2 form) '*) (nth 1 form)))
 641
 642
 643 (defun rx-** (form)
 644   "Parse and produce code from FORM `(** N M ...)'."
 645   (rx-check form)
 646   (setq form (cons 'repeat (cdr (rx-trans-forms form 2))))
 647   (rx-form form '*))
 648
 649
 650 (defun rx-repeat (form)
 651   "Parse and produce code from FORM.
 652 FORM is either `(repeat N FORM1)' or `(repeat N M FORM1)'."
 653   (rx-check form)
 654   (cond ((= (length form) 3)
 655          (unless (and (integerp (nth 1 form))
 656                       (> (nth 1 form) 0))
 657            (error "rx `repeat' requires positive integer first arg"))
 658          (format "%s\\{%d\\}" (rx-form (nth 2 form) '*) (nth 1 form)))
 659         ((or (not (integerp (nth 2 form)))
 660              (< (nth 2 form) 0)
 661              (not (integerp (nth 1 form)))
 662              (< (nth 1 form) 0)
 663              (< (nth 2 form) (nth 1 form)))
 664          (error "rx `repeat' range error"))
 665         (t
 666          (format "%s\\{%d,%d\\}" (rx-form (nth 3 form) '*)
 667                  (nth 1 form) (nth 2 form)))))
 668
 669
 670 (defun rx-submatch (form)
 671   "Parse and produce code from FORM, which is `(submatch ...)'."
 672   (concat "\\("
 673           (if (= 2 (length form))
 674               ;; Only one sub-form.
 675               (rx-form (cadr form))
 676             ;; Several sub-forms implicitly concatenated.
 677             (mapconcat (lambda (re) (rx-form re ':)) (cdr form) nil))
 678           "\\)"))
 679
 680
 681 (defun rx-backref (form)
 682   "Parse and produce code from FORM, which is `(backref N)'."
 683   (rx-check form)
 684   (format "\\%d" (nth 1 form)))
 685
 686 (defun rx-check-backref (arg)
 687   "Check arg ARG for Rx `backref'."
 688   (or (and (integerp arg) (>= arg 1) (<= arg 9))
 689       (error "rx `backref' requires numeric 1<=arg<=9: %s" arg)))
 690
 691 (defun rx-kleene (form)
 692   "Parse and produce code from FORM.
 693 FORM is `(OP FORM1)', where OP is one of the `zero-or-one',
 694 `zero-or-more' etc.  operators.
 695 If OP is one of `*', `+', `?', produce a greedy regexp.
 696 If OP is one of `*?', `+?', `??', produce a non-greedy regexp.
 697 If OP is anything else, produce a greedy regexp if `rx-greedy-flag'
 698 is non-nil."
 699   (rx-check form)
 700   (setq form (rx-trans-forms form))
 701   (let ((suffix (cond ((memq (car form) '(* + ?\s)) "")
 702                       ((memq (car form) '(*? +? ??)) "?")
 703                       (rx-greedy-flag "")
 704                       (t "?")))
 705         (op (cond ((memq (car form) '(* *? 0+ zero-or-more)) "*")
 706                   ((memq (car form) '(+ +? 1+ one-or-more))  "+")
 707                   (t "?"))))
 708     (rx-group-if
 709      (concat (rx-form (cadr form) '*) op suffix)
 710      (and (memq rx-parent '(t *)) rx-parent))))
 711
 712
 713 (defun rx-atomic-p (r &optional lax)
 714   "Return non-nil if regexp string R is atomic.
 715 An atomic regexp R is one such that a suffix operator
 716 appended to R will apply to all of R.  For example, \"a\"
 717 \"[abc]\" and \"\\(ab\\|ab*c\\)\" are atomic and \"ab\",
 718 \"[ab]c\", and \"ab\\|ab*c\" are not atomic.
 719
 720 This function may return false negatives, but it will not
 721 return false positives.  It is nevertheless useful in
 722 situations where an efficiency shortcut can be taken only if a
 723 regexp is atomic.  The function can be improved to detect
 724 more cases of atomic regexps.  Presently, this function
 725 detects the following categories of atomic regexp;
 726
 727   a group or shy group:  \\(...\\)
 728   a character class:     [...]
 729   a single character:    a
 730
 731 On the other hand, false negatives will be returned for
 732 regexps that are atomic but end in operators, such as
 733 \"a+\".  I think these are rare.  Probably such cases could
 734 be detected without much effort.  A guarantee of no false
 735 negatives would require a theoretic specification of the set
 736 of all atomic regexps."
 737   (let ((l (length r)))
 738     (cond
 739      ((<= l 1))
 740      ((= l 2) (= (aref r 0) ?\\))
 741      ((= l 3) (string-match "\\`\\(?:\\\\[cCsS_]\\|\\[[^^]\\]\\)" r))
 742      ((null lax)
 743       (cond
 744        ((string-match "\\`\\[^?\]?\\(?:\\[:[a-z]+:]\\|[^\]]\\)*\\]\\'" r))
 745        ((string-match "\\`\\\\(\\(?:[^\\]\\|\\\\[^\)]\\)*\\\\)\\'" r)))))))
 746
 747
 748 (defun rx-syntax (form)
 749   "Parse and produce code from FORM, which is `(syntax SYMBOL)'."
 750   (rx-check form)
 751   (let* ((sym (cadr form))
 752          (syntax (assq sym rx-syntax)))
 753     (unless syntax
 754       ;; Try sregex compatibility.
 755       (let ((name (symbol-name sym)))
 756         (if (= 1 (length name))
 757             (setq syntax (rassq (aref name 0) rx-syntax))))
 758       (unless syntax
 759         (error "Unknown rx syntax `%s'" (cadr form))))
 760     (format "\\s%c" (cdr syntax))))
 761
 762
 763 (defun rx-check-category (form)
 764   "Check the argument FORM of a `(category FORM)'."
 765   (unless (or (integerp form)
 766               (cdr (assq form rx-categories)))
 767     (error "Unknown category `%s'" form))
 768   t)
 769
 770
 771 (defun rx-category (form)
 772   "Parse and produce code from FORM, which is `(category SYMBOL)'."
 773   (rx-check form)
 774   (let ((char (if (integerp (cadr form))
 775                   (cadr form)
 776                 (cdr (assq (cadr form) rx-categories)))))
 777     (format "\\c%c" char)))
 778
 779
 780 (defun rx-eval (form)
 781   "Parse and produce code from FORM, which is `(eval FORM)'."
 782   (rx-check form)
 783   (rx-form (eval (cadr form)) rx-parent))
 784
 785
 786 (defun rx-greedy (form)
 787   "Parse and produce code from FORM.
 788 If FORM is '(minimal-match FORM1)', non-greedy versions of `*',
 789 `+', and `?' operators will be used in FORM1.  If FORM is
 790 '(maximal-match FORM1)', greedy operators will be used."
 791   (rx-check form)
 792   (let ((rx-greedy-flag (eq (car form) 'maximal-match)))
 793     (rx-form (cadr form) rx-parent)))
 794
 795
 796 (defun rx-regexp (form)
 797   "Parse and produce code from FORM, which is `(regexp STRING)'."
 798   (rx-check form)
 799   (rx-group-if (cadr form) rx-parent))
 800
 801
 802 (defun rx-form (form &optional rx-parent)
 803   "Parse and produce code for regular expression FORM.
 804 FORM is a regular expression in sexp form.
 805 RX-PARENT shows which type of expression calls and controls putting of
 806 shy groups around the result and some more in other functions."
 807   (if (stringp form)
 808       (rx-group-if (regexp-quote form)
 809                    (if (and (eq rx-parent '*) (< 1 (length form)))
 810                        rx-parent))
 811     (cond ((integerp form)
 812            (regexp-quote (char-to-string form)))
 813           ((symbolp form)
 814            (let ((info (rx-info form)))
 815              (cond ((stringp info)
 816                     info)
 817                    ((null info)
 818                     (error "Unknown rx form `%s'" form))
 819                    (t
 820                     (funcall (nth 0 info) form)))))
 821           ((consp form)
 822            (let ((info (rx-info (car form))))
 823              (unless (consp info)
 824                (error "Unknown rx form `%s'" (car form)))
 825              (funcall (nth 0 info) form)))
 826           (t
 827            (error "rx syntax error at `%s'" form)))))
 828
 829
 830 ;;;###autoload
 831 (defun rx-to-string (form &optional no-group)
 832   "Parse and produce code for regular expression FORM.
 833 FORM is a regular expression in sexp form.
 834 NO-GROUP non-nil means don't put shy groups around the result."
 835   (rx-group-if (rx-form form) (null no-group)))
 836
 837
 838 ;;;###autoload
 839 (defmacro rx (&rest regexps)
 840   "Translate regular expressions REGEXPS in sexp form to a regexp string.
 841 REGEXPS is a non-empty sequence of forms of the sort listed below.
 842
 843 Note that `rx' is a Lisp macro; when used in a Lisp program being
 844  compiled, the translation is performed by the compiler.
 845 See `rx-to-string' for how to do such a translation at run-time.
 846
 847 The following are valid subforms of regular expressions in sexp
 848 notation.
 849
 850 STRING
 851      matches string STRING literally.
 852
 853 CHAR
 854      matches character CHAR literally.
 855
 856 `not-newline', `nonl'
 857      matches any character except a newline.
 858
 859 `anything'
 860      matches any character
 861
 862 `(any SET ...)'
 863 `(in SET ...)'
 864 `(char SET ...)'
 865      matches any character in SET ....  SET may be a character or string.
 866      Ranges of characters can be specified as `A-Z' in strings.
 867      Ranges may also be specified as conses like `(?A . ?Z)'.
 868
 869      SET may also be the name of a character class: `digit',
 870      `control', `hex-digit', `blank', `graph', `print', `alnum',
 871      `alpha', `ascii', `nonascii', `lower', `punct', `space', `upper',
 872      `word', or one of their synonyms.
 873
 874 `(not (any SET ...))'
 875      matches any character not in SET ...
 876
 877 `line-start', `bol'
 878      matches the empty string, but only at the beginning of a line
 879      in the text being matched
 880
 881 `line-end', `eol'
 882      is similar to `line-start' but matches only at the end of a line
 883
 884 `string-start', `bos', `bot'
 885      matches the empty string, but only at the beginning of the
 886      string being matched against.
 887
 888 `string-end', `eos', `eot'
 889      matches the empty string, but only at the end of the
 890      string being matched against.
 891
 892 `buffer-start'
 893      matches the empty string, but only at the beginning of the
 894      buffer being matched against.  Actually equivalent to `string-start'.
 895
 896 `buffer-end'
 897      matches the empty string, but only at the end of the
 898      buffer being matched against.  Actually equivalent to `string-end'.
 899
 900 `point'
 901      matches the empty string, but only at point.
 902
 903 `word-start', `bow'
 904      matches the empty string, but only at the beginning of a word.
 905
 906 `word-end', `eow'
 907      matches the empty string, but only at the end of a word.
 908
 909 `word-boundary'
 910      matches the empty string, but only at the beginning or end of a
 911      word.
 912
 913 `(not word-boundary)'
 914 `not-word-boundary'
 915      matches the empty string, but not at the beginning or end of a
 916      word.
 917
 918 `symbol-start'
 919      matches the empty string, but only at the beginning of a symbol.
 920
 921 `symbol-end'
 922      matches the empty string, but only at the end of a symbol.
 923
 924 `digit', `numeric', `num'
 925      matches 0 through 9.
 926
 927 `control', `cntrl'
 928      matches ASCII control characters.
 929
 930 `hex-digit', `hex', `xdigit'
 931      matches 0 through 9, a through f and A through F.
 932
 933 `blank'
 934      matches space and tab only.
 935
 936 `graphic', `graph'
 937      matches graphic characters--everything except ASCII control chars,
 938      space, and DEL.
 939
 940 `printing', `print'
 941      matches printing characters--everything except ASCII control chars
 942      and DEL.
 943
 944 `alphanumeric', `alnum'
 945      matches letters and digits.  (But at present, for multibyte characters,
 946      it matches anything that has word syntax.)
 947
 948 `letter', `alphabetic', `alpha'
 949      matches letters.  (But at present, for multibyte characters,
 950      it matches anything that has word syntax.)
 951
 952 `ascii'
 953      matches ASCII (unibyte) characters.
 954
 955 `nonascii'
 956      matches non-ASCII (multibyte) characters.
 957
 958 `lower', `lower-case'
 959      matches anything lower-case.
 960
 961 `upper', `upper-case'
 962      matches anything upper-case.
 963
 964 `punctuation', `punct'
 965      matches punctuation.  (But at present, for multibyte characters,
 966      it matches anything that has non-word syntax.)
 967
 968 `space', `whitespace', `white'
 969      matches anything that has whitespace syntax.
 970
 971 `word', `wordchar'
 972      matches anything that has word syntax.
 973
 974 `not-wordchar'
 975      matches anything that has non-word syntax.
 976
 977 `(syntax SYNTAX)'
 978      matches a character with syntax SYNTAX.  SYNTAX must be one
 979      of the following symbols, or a symbol corresponding to the syntax
 980      character, e.g. `\\.' for `\\s.'.
 981
 982      `whitespace'               (\\s- in string notation)
 983      `punctuation'              (\\s.)
 984      `word'                     (\\sw)
 985      `symbol'                   (\\s_)
 986      `open-parenthesis'         (\\s()
 987      `close-parenthesis'        (\\s))
 988      `expression-prefix'        (\\s')
 989      `string-quote'             (\\s\")
 990      `paired-delimiter'         (\\s$)
 991      `escape'                   (\\s\\)
 992      `character-quote'          (\\s/)
 993      `comment-start'            (\\s<)
 994      `comment-end'              (\\s>)
 995      `string-delimiter'         (\\s|)
 996      `comment-delimiter'        (\\s!)
 997
 998 `(not (syntax SYNTAX))'
 999      matches a character that doesn't have syntax SYNTAX.
1000
1001 `(category CATEGORY)'
1002      matches a character with category CATEGORY.  CATEGORY must be
1003      either a character to use for C, or one of the following symbols.
1004
1005      `consonant'                        (\\c0 in string notation)
1006      `base-vowel'                       (\\c1)
1007      `upper-diacritical-mark'           (\\c2)
1008      `lower-diacritical-mark'           (\\c3)
1009      `tone-mark'                        (\\c4)
1010      `symbol'                           (\\c5)
1011      `digit'                            (\\c6)
1012      `vowel-modifying-diacritical-mark' (\\c7)
1013      `vowel-sign'                       (\\c8)
1014      `semivowel-lower'                  (\\c9)
1015      `not-at-end-of-line'               (\\c<)
1016      `not-at-beginning-of-line'         (\\c>)
1017      `alpha-numeric-two-byte'           (\\cA)
1018      `chinse-two-byte'                  (\\cC)
1019      `greek-two-byte'                   (\\cG)
1020      `japanese-hiragana-two-byte'       (\\cH)
1021      `indian-tow-byte'                  (\\cI)
1022      `japanese-katakana-two-byte'       (\\cK)
1023      `korean-hangul-two-byte'           (\\cN)
1024      `cyrillic-two-byte'                (\\cY)
1025      `combining-diacritic'              (\\c^)
1026      `ascii'                            (\\ca)
1027      `arabic'                           (\\cb)
1028      `chinese'                          (\\cc)
1029      `ethiopic'                         (\\ce)
1030      `greek'                            (\\cg)
1031      `korean'                           (\\ch)
1032      `indian'                           (\\ci)
1033      `japanese'                         (\\cj)
1034      `japanese-katakana'                (\\ck)
1035      `latin'                            (\\cl)
1036      `lao'                              (\\co)
1037      `tibetan'                          (\\cq)
1038      `japanese-roman'                   (\\cr)
1039      `thai'                             (\\ct)
1040      `vietnamese'                       (\\cv)
1041      `hebrew'                           (\\cw)
1042      `cyrillic'                         (\\cy)
1043      `can-break'                        (\\c|)
1044
1045 `(not (category CATEGORY))'
1046      matches a character that doesn't have category CATEGORY.
1047
1048 `(and SEXP1 SEXP2 ...)'
1049 `(: SEXP1 SEXP2 ...)'
1050 `(seq SEXP1 SEXP2 ...)'
1051 `(sequence SEXP1 SEXP2 ...)'
1052      matches what SEXP1 matches, followed by what SEXP2 matches, etc.
1053
1054 `(submatch SEXP1 SEXP2 ...)'
1055 `(group SEXP1 SEXP2 ...)'
1056      like `and', but makes the match accessible with `match-end',
1057      `match-beginning', and `match-string'.
1058
1059 `(or SEXP1 SEXP2 ...)'
1060 `(| SEXP1 SEXP2 ...)'
1061      matches anything that matches SEXP1 or SEXP2, etc.  If all
1062      args are strings, use `regexp-opt' to optimize the resulting
1063      regular expression.
1064
1065 `(minimal-match SEXP)'
1066      produce a non-greedy regexp for SEXP.  Normally, regexps matching
1067      zero or more occurrences of something are \"greedy\" in that they
1068      match as much as they can, as long as the overall regexp can
1069      still match.  A non-greedy regexp matches as little as possible.
1070
1071 `(maximal-match SEXP)'
1072      produce a greedy regexp for SEXP.  This is the default.
1073
1074 Below, `SEXP ...' represents a sequence of regexp forms, treated as if
1075 enclosed in `(and ...)'.
1076
1077 `(zero-or-more SEXP ...)'
1078 `(0+ SEXP ...)'
1079      matches zero or more occurrences of what SEXP ... matches.
1080
1081 `(* SEXP ...)'
1082      like `zero-or-more', but always produces a greedy regexp, independent
1083      of `rx-greedy-flag'.
1084
1085 `(*? SEXP ...)'
1086      like `zero-or-more', but always produces a non-greedy regexp,
1087      independent of `rx-greedy-flag'.
1088
1089 `(one-or-more SEXP ...)'
1090 `(1+ SEXP ...)'
1091      matches one or more occurrences of SEXP ...
1092
1093 `(+ SEXP ...)'
1094      like `one-or-more', but always produces a greedy regexp.
1095
1096 `(+? SEXP ...)'
1097      like `one-or-more', but always produces a non-greedy regexp.
1098
1099 `(zero-or-one SEXP ...)'
1100 `(optional SEXP ...)'
1101 `(opt SEXP ...)'
1102      matches zero or one occurrences of A.
1103
1104 `(? SEXP ...)'
1105      like `zero-or-one', but always produces a greedy regexp.
1106
1107 `(?? SEXP ...)'
1108      like `zero-or-one', but always produces a non-greedy regexp.
1109
1110 `(repeat N SEXP)'
1111 `(= N SEXP ...)'
1112      matches N occurrences.
1113
1114 `(>= N SEXP ...)'
1115      matches N or more occurrences.
1116
1117 `(repeat N M SEXP)'
1118 `(** N M SEXP ...)'
1119      matches N to M occurrences.
1120
1121 `(backref N)'
1122      matches what was matched previously by submatch N.
1123
1124 `(eval FORM)'
1125      evaluate FORM and insert result.  If result is a string,
1126      `regexp-quote' it.
1127
1128 `(regexp REGEXP)'
1129      include REGEXP in string notation in the result."
1130   (cond ((null regexps)
1131          (error "No regexp"))
1132         ((cdr regexps)
1133          (rx-to-string `(and ,@regexps) t))
1134         (t
1135          (rx-to-string (car regexps) t))))
1136 \f
1137 ;; ;; sregex.el replacement
1138
1139 ;; ;;;###autoload (provide 'sregex)
1140 ;; ;;;###autoload (autoload 'sregex "rx")
1141 ;; (defalias 'sregex 'rx-to-string)
1142 ;; ;;;###autoload (autoload 'sregexq "rx" nil nil 'macro)
1143 ;; (defalias 'sregexq 'rx)
1144 \f
1145 (provide 'rx)
1146
1147 ;; arch-tag: 12d01a63-0008-42bb-ab8c-1c7d63be370b
1148 ;;; rx.el ends here