(Scan Line Formats): Replace @samp with @kbd.
[emacs.git] / lisp / progmodes / ebnf-ebx.el
blob7e65611cedfb867309b47107553eb3cd235c3881
1 ;;; ebnf-ebx.el --- parser for EBNF used to specify XML (EBNFX)
3 ;; Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006
4 ;; Free Software Foundation, Inc.
6 ;; Author: Vinicius Jose Latorre <viniciusjl@ig.com.br>
7 ;; Maintainer: Vinicius Jose Latorre <viniciusjl@ig.com.br>
8 ;; Time-stamp: <2004/04/03 16:45:34 vinicius>
9 ;; Keywords: wp, ebnf, PostScript
10 ;; Version: 1.1
12 ;; This file is part of GNU Emacs.
14 ;; GNU Emacs is free software; you can redistribute it and/or modify
15 ;; it under the terms of the GNU General Public License as published by
16 ;; the Free Software Foundation; either version 2, or (at your option)
17 ;; any later version.
19 ;; GNU Emacs is distributed in the hope that it will be useful,
20 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
21 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 ;; GNU General Public License for more details.
24 ;; You should have received a copy of the GNU General Public License
25 ;; along with GNU Emacs; see the file COPYING. If not, write to the
26 ;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
27 ;; Boston, MA 02110-1301, USA.
29 ;;; Commentary:
31 ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
34 ;; This is part of ebnf2ps package.
36 ;; This package defines a parser for EBNF used to specify XML (EBNFX).
38 ;; See ebnf2ps.el for documentation.
41 ;; EBNFX Syntax
42 ;; ------------
44 ;; See the URL:
45 ;; `http://www.w3.org/TR/2004/REC-xml-20040204/#sec-notation'
46 ;; (Extensible Markup Language (XML) 1.0 (Third Edition))
49 ;; rule ::= symbol '::=' expression
50 ;; /* rules are separated by at least one blank line. */
52 ;; expression ::= concatenation ('|' concatenation)*
54 ;; concatenation ::= exception*
56 ;; exception ::= term ('-' term)?
58 ;; term ::= factor ('*' | '+' | '?')?
60 ;; factor ::= hex-char+
61 ;; | '[' '^'? ( char ( '-' char )? )+ ']'
62 ;; | '"' 'string' '"'
63 ;; | "'" "string" "'"
64 ;; | '(' expression ')'
65 ;; | symbol
67 ;; symbol ::= 'upper or lower case letter'
68 ;; ('upper or lower case letter' | '-' | '_')*
69 ;; /* upper and lower 8-bit accentuated characters are included */
71 ;; hex-char ::= '#x' [0-9A-Fa-f]+
73 ;; char ::= hex-char | 'any character except control characters'
74 ;; /* 8-bit accentuated characters are included */
76 ;; any-char ::= char | 'newline' | 'tab'
78 ;; ignore ::= '[' ('wfc' | 'WFC' | 'vc' | 'VC') ':' ( any-char - ']' )* ']'
80 ;; comment ::= '/*' ( any-char - '*/' )* '*/'
83 ;; Below is the Notation section extracted from the URL cited above.
85 ;; 6 Notation
87 ;; The formal grammar of XML is given in this specification using a simple
88 ;; Extended Backus-Naur Form (EBNF) notation. Each rule in the grammar defines
89 ;; one symbol, in the form
91 ;; symbol ::= expression
93 ;; Symbols are written with an initial capital letter if they are the start
94 ;; symbol of a regular language, otherwise with an initial lowercase letter.
95 ;; Literal strings are quoted.
97 ;; Within the expression on the right-hand side of a rule, the following
98 ;; expressions are used to match strings of one or more characters:
100 ;; #xN
102 ;; where N is a hexadecimal integer, the expression matches the character
103 ;; whose number (code point) in ISO/IEC 10646 is N. The number of leading
104 ;; zeros in the #xN form is insignificant.
106 ;; [a-zA-Z], [#xN-#xN]
108 ;; matches any Char with a value in the range(s) indicated (inclusive).
110 ;; [abc], [#xN#xN#xN]
112 ;; matches any Char with a value among the characters enumerated.
113 ;; Enumerations and ranges can be mixed in one set of brackets.
115 ;; [^a-z], [^#xN-#xN]
117 ;; matches any Char with a value outside the range indicated.
119 ;; [^abc], [^#xN#xN#xN]
121 ;; matches any Char with a value not among the characters given.
122 ;; Enumerations and ranges of forbidden values can be mixed in one set of
123 ;; brackets.
125 ;; "string"
127 ;; matches a literal string matching that given inside the double quotes.
129 ;; 'string'
131 ;; matches a literal string matching that given inside the single quotes.
133 ;; These symbols may be combined to match more complex patterns as follows,
134 ;; where A and B represent simple expressions:
136 ;; (expression)
138 ;; expression is treated as a unit and may be combined as described in this
139 ;; list.
141 ;; A?
143 ;; matches A or nothing; optional A.
145 ;; A B
147 ;; matches A followed by B. This operator has higher precedence than
148 ;; alternation; thus A B | C D is identical to (A B) | (C D).
150 ;; A | B
152 ;; matches A or B.
154 ;; A - B
156 ;; matches any string that matches A but does not match B.
158 ;; A+
160 ;; matches one or more occurrences of A. Concatenation has higher
161 ;; precedence than alternation; thus A+ | B+ is identical to (A+) | (B+).
163 ;; A*
165 ;; matches zero or more occurrences of A. Concatenation has higher
166 ;; precedence than alternation; thus A* | B* is identical to (A*) | (B*).
168 ;; Other notations used in the productions are:
170 ;; /* ... */
172 ;; comment.
174 ;; [ wfc: ... ]
176 ;; well-formedness constraint; this identifies by name a constraint on
177 ;; well-formed documents associated with a production.
179 ;; [ vc: ... ]
181 ;; validity constraint; this identifies by name a constraint on valid
182 ;; documents associated with a production.
185 ;; Differences Between EBNFX And ebnf2ps EBNFX
186 ;; -------------------------------------------
188 ;; Besides the characters that EBNFX accepts, ebnf2ps EBNFX also accepts the
189 ;; underscore (_) and minus (-) in rule names, and European 8-bit accented
190 ;; characters (from \240 to \377) in rule names, strings and comments. Also,
191 ;; a rule name can start with an upper case letter.
194 ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
196 ;;; Code:
199 (require 'ebnf-otz)
(defvar ebnf-ebx-lex
  nil
  "Text of the last terminal or non-terminal read by function `ebnf-ebx-lex'.
The lexer stores the scanned buffer text here and returns only the
token symbol; the parser functions read this variable to build the
corresponding element.")
206 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
207 ;; Syntactic analyzer
210 ;;; rulelist ::= rule+
(defun ebnf-ebx-parser (start)
  "Parse the EBNFX rules found between START and `ebnf-limit'.

Return the list of parsed rules, most recently parsed rule first.
A rule for which `ebnf-add-empty-rule-list' returns non-nil is not
put on the returned list.

Signal an error if no token at all is found.  Point is moved back to
where it was on entry once parsing completes."
  (let ((span (1+ (- ebnf-limit start)))  ; size of the region, for progress
        (offset (1- start))
        (saved-point (point))
        productions lookahead parsed)
    (goto-char start)
    (setq lookahead (ebnf-ebx-lex))
    (when (eq lookahead 'end-of-input)
      (error "Invalid EBNFX file format"))
    ;; A leading rule separator carries no information; step over it.
    (when (eq lookahead 'end-of-rule)
      (setq lookahead (ebnf-ebx-lex)))
    (while (not (eq lookahead 'end-of-input))
      (ebnf-message-float
       "Parsing...%s%%"
       (/ (* (- (point) offset) 100.0) span))
      ;; `ebnf-ebx-rule' answers (NEXT-TOKEN . PRODUCTION).
      (setq parsed (ebnf-ebx-rule lookahead)
            lookahead (car parsed)
            parsed (cdr parsed))
      (unless (ebnf-add-empty-rule-list parsed)
        (setq productions (cons parsed productions))))
    (goto-char saved-point)
    productions))
237 ;;; rule ::= symbol '::=' expression
(defun ebnf-ebx-rule (token)
  "Parse one rule:  rule ::= symbol '::=' expression

TOKEN is the token just read; it must be `non-terminal', and the rule
name is the current value of variable `ebnf-ebx-lex'.

Return (NEXT-TOKEN . PRODUCTION), where NEXT-TOKEN is the token read
after the end of the rule.  Signal an error when the rule name, the
`::=' or the end of the rule is missing."
  (let ((rule-name ebnf-ebx-lex)
        ;; Action collected before this rule; it is consumed here.
        (pending-action ebnf-action))
    (setq ebnf-action nil)
    (unless (eq token 'non-terminal)
      (error "Invalid rule name"))
    (unless (eq (ebnf-ebx-lex) 'production)
      (error "Invalid rule: missing `::='"))
    (let ((parsed (ebnf-ebx-expression)))
      (unless (memq (car parsed) '(end-of-rule end-of-input))
        (error "Invalid rule: there is no end of rule"))
      (ebnf-eps-add-production rule-name)
      ;; Read the look-ahead token first, then build the production.
      (let ((lookahead (ebnf-ebx-lex)))
        (cons lookahead
              (ebnf-make-production rule-name (cdr parsed) pending-action))))))
259 ;; expression ::= concatenation ('|' concatenation)*
(defun ebnf-ebx-expression ()
  "Parse:  expression ::= concatenation ('|' concatenation)*

Read tokens with `ebnf-ebx-lex' and hand the collected branches to
`ebnf-token-alternative', whose value is returned.  Every branch that
was followed by `|' is accumulated (latest first) in the first
argument; the final (TOKEN . ELEMENT) pair is the second argument."
  (let ((branch (ebnf-ebx-concatenation (ebnf-ebx-lex)))
        alternatives)
    (while (eq (car branch) 'alternative)
      (setq alternatives (cons (cdr branch) alternatives)
            branch (ebnf-ebx-concatenation (ebnf-ebx-lex))))
    (ebnf-token-alternative alternatives branch)))
271 ;; concatenation ::= exception*
(defun ebnf-ebx-concatenation (token)
  "Parse a sequence of exceptions starting at TOKEN.

At least one element is required: an error is signaled when TOKEN does
not start one.  Elements are collected (latest first) until
`ebnf-ebx-exception' yields no element.

Return (NEXT-TOKEN . SEQUENCE), where SEQUENCE is the value of
`ebnf-token-sequence' applied to the collected elements."
  (let* ((parsed (ebnf-ebx-exception token))
         (lookahead (car parsed))
         (element (cdr parsed))
         elements)
    (unless element
      (error "Empty element"))
    (while element
      (setq elements (cons element elements)
            parsed (ebnf-ebx-exception lookahead)
            lookahead (car parsed)
            element (cdr parsed)))
    (cons lookahead
          (ebnf-token-sequence elements))))
289 ;;; exception ::= term ('-' term)?
(defun ebnf-ebx-exception (token)
  "Parse:  exception ::= term ('-' term)?

TOKEN is the token that starts the first term.  Return
\(NEXT-TOKEN . ELEMENT).  When the first term is followed by `-', a
second term is parsed and ELEMENT is built with `ebnf-make-except'."
  (let ((base (ebnf-ebx-term token)))
    (if (not (eq (car base) 'exception))
        base
      (let ((excluded (ebnf-ebx-term (ebnf-ebx-lex))))
        (cons (car excluded)
              (ebnf-make-except (cdr base) (cdr excluded)))))))
302 ;;; term ::= factor ('*' | '+' | '?')?
(defun ebnf-ebx-term (token)
  "Parse:  term ::= factor ('*' | '+' | '?')?

TOKEN is the token that starts the factor.  Return
\(NEXT-TOKEN . ELEMENT).  When TOKEN starts no factor, nothing is read
and the result is (TOKEN)."
  (let ((element (ebnf-ebx-factor token))
        (lookahead token))
    (when element
      (setq lookahead (ebnf-ebx-lex))
      (when (memq lookahead '(zero-or-more one-or-more optional))
        ;; Wrap the factor first, then read the token after the suffix.
        (setq element
              (cond ((eq lookahead 'zero-or-more)
                     (ebnf-make-zero-or-more element))
                    ((eq lookahead 'one-or-more)
                     (ebnf-make-one-or-more element))
                    (t
                     (ebnf-token-optional element))))
        (setq lookahead (ebnf-ebx-lex))))
    (cons lookahead element)))
321 ;;; factor ::= hex-char+
322 ;;; | '[' '^'? ( char ( '-' char )? )+ ']'
323 ;;; | '"' 'string' '"'
324 ;;; | "'" "string" "'"
325 ;;; | '(' expression ')'
326 ;;; | symbol
328 ;;; symbol ::= 'upper or lower case letter'
329 ;;; ('upper or lower case letter' | '-' | '_')*
330 ;;; /* upper and lower 8-bit accentuated characters are included */
332 ;;; hex-char ::= '#x' [0-9A-Fa-f]+
334 ;;; char ::= hex-char | 'any character except control characters'
335 ;;; /* 8-bit accentuated characters are included */
337 ;;; any-char ::= char | 'newline' | 'tab'
(defun ebnf-ebx-factor (token)
  "Parse a factor that starts at TOKEN.

Return the element built for a terminal, a non-terminal or a
parenthesized group, or nil when TOKEN does not start a factor.
For terminals and non-terminals the text is taken from variable
`ebnf-ebx-lex'.  Signal an error when a group is not closed by `)'."
  (cond
   ;; terminal
   ((eq token 'terminal)
    (ebnf-make-terminal ebnf-ebx-lex))
   ;; non-terminal
   ((eq token 'non-terminal)
    (ebnf-make-non-terminal ebnf-ebx-lex))
   ;; group
   ((eq token 'begin-group)
    (let ((body (ebnf-ebx-expression)))
      ;; `ebnf-ebx-expression' answers (NEXT-TOKEN . ELEMENT).
      (or (eq (car body) 'end-group)
          (error "Missing `)'"))
      (cdr body)))
   ;; no element
   (t
    nil)))
360 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
361 ;; Lexical analyzer
(defconst ebnf-ebx-token-table
  (make-vector 256 'error)
  "Vector indexed by character code (0-255) giving the lexical token.
Every entry starts out as `error'; `ebnf-ebx-initialize' fills in the
entries for the characters that are meaningful to the EBNFX lexer.")
(defun ebnf-ebx-initialize ()
  "Initialize EBNFX token table."
  ;; Entries that are not assigned here (control characters and 8-bit
  ;; control characters among them) are left untouched.
  (let ((table ebnf-ebx-token-table))
    ;; Characters that may start a name: A-Z, a-z and the European
    ;; 8-bit accentuated characters \240-\377 (bounds are inclusive).
    (dolist (bounds '((?\101 . ?\132) (?\141 . ?\172) (?\240 . ?\377)))
      (let ((code (car bounds)))
        (while (<= code (cdr bounds))
          (aset table code 'non-terminal)
          (setq code (1+ code)))))
    ;; Single characters, in the same order as they were listed before.
    (dolist (entry '((?\n   . end-of-rule)  ; [NL] linefeed
                     (?\r   . end-of-rule)  ; [CR] carriage return
                     (?\013 . space)        ; [VT] vertical tab
                     (?\t   . space)        ; [HT] horizontal tab
                     (?\s   . space)        ; [SP] space
                     (?\f   . form-feed)    ; [FF] form feed
                     (?#    . hash)
                     (?\"   . double-quote)
                     (?\'   . single-quote)
                     (?\(   . begin-group)
                     (?\)   . end-group)
                     (?-    . exception)
                     (?:    . colon)
                     (?\[   . begin-square)
                     (?|    . alternative)
                     (?*    . zero-or-more)
                     (?+    . one-or-more)
                     (?\?   . optional)))
      (aset table (car entry) (cdr entry)))
    ;; Comment character; kept as the last form so that the value of
    ;; the function is unchanged.
    (aset table ?/ 'comment)))
412 ;; replace the range "\240-\377" (see `ebnf-range-regexp').
(defconst ebnf-ebx-non-terminal-chars
  (ebnf-range-regexp "-_A-Za-z" ?\240 ?\377)
  "Characters allowed in a rule name: letters, `-', `_' and \\240-\\377.
The value is whatever `ebnf-range-regexp' builds from that character
set and range; `ebnf-ebx-lex' passes it to `ebnf-buffer-substring'.")

(defconst ebnf-ebx-non-terminal-letter-chars
  (ebnf-range-regexp "A-Za-z" ?\240 ?\377)
  "Letter characters of a rule name: A-Z, a-z and \\240-\\377.
The value is whatever `ebnf-range-regexp' builds from that character
set and range.")
(defun ebnf-ebx-lex ()
  "Lexical analyzer for EBNFX.

Return a lexical token.

The token is one of `end-of-input', `end-of-rule', `terminal',
`non-terminal', `production' or, for the single-character operators,
the symbol found in `ebnf-ebx-token-table'.  For `terminal' and
`non-terminal' the scanned text is stored in variable `ebnf-ebx-lex'.

Signal an error on a character that has no token, or on a `:' that
does not start `::='.

See documentation for variable `ebnf-ebx-lex'."
  (if (>= (point) ebnf-limit)
      'end-of-input
    (let (token)
      ;; skip spaces and comments
      ;; Each branch below yields non-nil to keep skipping and nil to
      ;; leave the loop with TOKEN describing the character at point.
      (while (if (> (following-char) 255)
                 ;; outside the 256-entry token table
                 (progn
                   (setq token 'error)
                   nil)
               (setq token (aref ebnf-ebx-token-table (following-char)))
               (cond
                ((eq token 'space)
                 (skip-chars-forward " \013\t" ebnf-limit)
                 (< (point) ebnf-limit))
                ((eq token 'comment)
                 (ebnf-ebx-skip-comment))
                ((eq token 'form-feed)
                 (forward-char)
                 (setq ebnf-action 'form-feed))
                ;; nil here means a real end of rule was found
                ((eq token 'end-of-rule)
                 (ebnf-ebx-skip-end-of-rule))
                ;; constraint: [wfc: ...] or [vc: ...], in any case
                ((and (eq token 'begin-square)
                      (let ((case-fold-search t))
                        (looking-at "\\[\\(wfc\\|vc\\):")))
                 (ebnf-ebx-skip-constraint))
                (t nil))))
      (cond
       ;; end of input
       ((>= (point) ebnf-limit)
        'end-of-input)
       ;; error
       ((eq token 'error)
        (error "Invalid character"))
       ;; end of rule
       ((eq token 'end-of-rule)
        'end-of-rule)
       ;; terminal: #x [0-9A-Fa-f]+
       ((eq token 'hash)
        (setq ebnf-ebx-lex (ebnf-ebx-character))
        'terminal)
       ;; terminal: "string"
       ((eq token 'double-quote)
        (setq ebnf-ebx-lex (ebnf-ebx-string ?\"))
        'terminal)
       ;; terminal: 'string'
       ((eq token 'single-quote)
        (setq ebnf-ebx-lex (ebnf-ebx-string ?\'))
        'terminal)
       ;; terminal: [ ^? ( char ( - char )? )+ ]
       ((eq token 'begin-square)
        (setq ebnf-ebx-lex (ebnf-ebx-range))
        'terminal)
       ;; non-terminal: NAME
       ((eq token 'non-terminal)
        (setq ebnf-ebx-lex
              (ebnf-buffer-substring ebnf-ebx-non-terminal-chars))
        'non-terminal)
       ;; colon: ::=
       ((eq token 'colon)
        (or (looking-at "::=")
            (error "Missing `::=' token"))
        (forward-char 3)
        'production)
       ;; miscellaneous: (, ), *, +, ?, |, -
       (t
        (forward-char)
        token)))))
495 ;; replace the range "\177-\237" (see `ebnf-range-regexp').
(defconst ebnf-ebx-constraint-chars
  (ebnf-range-regexp "^\000-\010\016-\037]" ?\177 ?\237)
  "Character set that `ebnf-ebx-skip-constraint' skips over.
It is handed to `skip-chars-forward'.  The base set is negated: it
leaves out the control characters \\000-\\010 and \\016-\\037 and the
closing `]'; `ebnf-range-regexp' combines it with the range
\\177-\\237.")
(defun ebnf-ebx-skip-constraint ()
  "Skip a constraint, that is, `[wfc: ... ]' or `[vc: ... ]'.

Point must be at the opening `['.  Move point after the closing `]'
and return t, so that the skipping loop in `ebnf-ebx-lex' goes on.
Signal an error when nothing can be skipped or when `]' is missing."
  (or (> (skip-chars-forward ebnf-ebx-constraint-chars ebnf-limit) 0)
      (error "Invalid character"))
  (or (= (following-char) ?\])
      (error "Missing end of constraint `]'"))
  (forward-char)
  t)
(defun ebnf-ebx-skip-end-of-rule ()
  "Skip line ends, spaces and comments that follow a line end.

Return nil when an end of rule was seen, that is, when some run of
two or more consecutive end of line characters was skipped; return t
otherwise."
  (let ((rule-ended nil)
        (scanning t))
    (while scanning
      ;; end of rule ==> 2 or more consecutive end of lines
      (when (> (skip-chars-forward "\r\n" ebnf-limit) 1)
        (setq rule-ended t))
      ;; skip spaces
      (skip-chars-forward " \013\t" ebnf-limit)
      ;; skip comments; go round again only after a skipped comment
      (setq scanning (and (= (following-char) ?/)
                          (ebnf-ebx-skip-comment))))
    (not rule-ended)))
524 ;; replace the range "\177-\237" (see `ebnf-range-regexp').
(defconst ebnf-ebx-comment-chars
  (ebnf-range-regexp "^\000-\010\016-\037\\*" ?\177 ?\237)
  "Character set skipped inside a comment by `ebnf-ebx-skip-comment'.
It is handed to `skip-chars-forward'.  The base set is negated: it
leaves out the control characters \\000-\\010 and \\016-\\037 and `*';
`ebnf-range-regexp' combines it with the range \\177-\\237.")

(defconst ebnf-ebx-filename-chars
  (ebnf-range-regexp "^\000-\037\\*" ?\177 ?\237)
  "Character set used by `ebnf-ebx-eps-filename' to read a file name.
The base set is negated: it leaves out the control characters
\\000-\\037 and `*'; `ebnf-range-regexp' combines it with the range
\\177-\\237.")
(defun ebnf-ebx-skip-comment ()
  "Skip a comment, that is, `/* ... */'.

Point must be at the opening `/'.  The first character after `/*'
selects what the comment does: while `ebnf-eps-executing' is non-nil,
`[' opens and `]' closes an EPS file context; otherwise the character
is looked up in `ebnf-comment-table' and the result is stored in
`ebnf-action'.

Move point after the closing `*/' and return t.  Signal an error when
`/' is not followed by `*' or when the comment is not closed."
  (forward-char)
  (or (= (following-char) ?*)
      (error "Invalid beginning of comment"))
  (forward-char)
  (cond
   ;; open EPS file
   ((and ebnf-eps-executing (= (following-char) ?\[))
    (ebnf-eps-add-context (ebnf-ebx-eps-filename)))
   ;; close EPS file
   ((and ebnf-eps-executing (= (following-char) ?\]))
    (ebnf-eps-remove-context (ebnf-ebx-eps-filename)))
   ;; any other action in comment
   (t
    ;; A character whose code is not a valid index of the table selects
    ;; no action instead of aborting the parse with a range error.
    (setq ebnf-action
          (condition-case nil
              (aref ebnf-comment-table (following-char))
            (args-out-of-range nil)))))
  ;; Advance from `*' to `*' until one is followed by `/'.
  (while (progn
           (skip-chars-forward ebnf-ebx-comment-chars ebnf-limit)
           (or (= (following-char) ?*)
               (error "Missing end of comment"))
           (forward-char)
           (and (/= (following-char) ?/)
                (< (point) ebnf-limit))))
  ;; check for a valid end of comment
  (and (>= (point) ebnf-limit)
       (error "Missing end of comment"))
  (forward-char)
  t)
(defun ebnf-ebx-eps-filename ()
  "Read the EPS file name that follows `/*[' or `/*]' and return it.

Point must be at the `[' or `]'.  The name is made of the characters
in `ebnf-ebx-filename-chars'; a `*' belongs to the name unless it is
the `*' of the closing `*/'.

When the name is ended by `*/', point is left on that closing `*', so
that the scan in `ebnf-ebx-skip-comment' finds the end of this very
comment.  Otherwise point is left where reading stopped."
  (forward-char)
  (let (fname nchar)
    (while (progn
             (setq fname
                   (concat fname
                           (ebnf-buffer-substring ebnf-ebx-filename-chars)))
             ;; Go on only when a run of `*' is followed by something
             ;; other than `/', i.e. the stars are part of the name.
             (and (< (point) ebnf-limit)
                  (> (setq nchar (skip-chars-forward "*" ebnf-limit)) 0)
                  (< (point) ebnf-limit)
                  (/= (following-char) ?/)))
      (setq fname (concat fname (make-string nchar ?*))
            nchar nil))
    (if (or (not nchar) (= nchar 0))
        fname
      (when (and (< (point) ebnf-limit)
                 (= (following-char) ?/))
        ;; The last `*' belongs to the closing `*/': it is not part of
        ;; the name, and point must go back onto it.  Left on `/', the
        ;; caller would skip `/' as comment text and run on to the next
        ;; `*' in the buffer.
        (setq nchar (1- nchar))
        (backward-char))
      (concat fname (make-string nchar ?*)))))
582 ;; replace the range "\240-\377" (see `ebnf-range-regexp').
(defconst ebnf-ebx-double-string-chars
  (ebnf-range-regexp "\t -!#-~" ?\240 ?\377)
  "Character set allowed inside a double-quoted string.
The base set is tab and the printable ASCII characters except the
double quote; `ebnf-range-regexp' combines it with the range
\\240-\\377.  `ebnf-ebx-string' hands it to `skip-chars-forward'.")

(defconst ebnf-ebx-single-string-chars
  (ebnf-range-regexp "\t -&(-~" ?\240 ?\377)
  "Character set allowed inside a single-quoted string.
The base set is tab and the printable ASCII characters except the
single quote; `ebnf-range-regexp' combines it with the range
\\240-\\377.  `ebnf-ebx-string' hands it to `skip-chars-forward'.")
(defun ebnf-ebx-string (delim)
  "Read a string delimited by the character DELIM and return its text.

Point must be at the opening delimiter.  The returned text excludes
both delimiters and point is left after the closing one.  Signal an
error when the allowed characters are not followed by DELIM."
  (forward-char)
  (let ((text-start (point))
        (allowed (if (= delim ?\")
                     ebnf-ebx-double-string-chars
                   ebnf-ebx-single-string-chars))
        text-end)
    (skip-chars-forward allowed ebnf-limit)
    (unless (= (following-char) delim)
      (error "Missing string delimiter `%c'" delim))
    (setq text-end (point))
    (forward-char)
    (buffer-substring-no-properties text-start text-end)))
(defun ebnf-ebx-character ()
  "Read a hexadecimal character, #x [0-9A-Fa-f]+, and return its text.
Point must be at the `#'; the returned text starts with that `#'.
Errors come from `ebnf-ebx-hex-character'."
  (let ((text-start (point)))
    (ebnf-ebx-hex-character)
    (buffer-substring-no-properties text-start (point))))
(defun ebnf-ebx-range ()
  "Read a character range, [ ^? ( char ( - char )? )+ ], and return its text.

Point must be at the opening `['.  The returned text includes both
brackets and point is left after the closing `]'.  A `-' right after
the optional `^' is accepted as is.  Signal an error when `]' is not
found before `ebnf-limit'."
  (let ((text-start (point))
        (more t))
    (forward-char)
    (when (= (following-char) ?^)
      (forward-char))
    (when (= (following-char) ?-)
      (forward-char))
    ;; At least one character is always read.
    (while more
      (ebnf-ebx-any-character)
      (when (= (following-char) ?-)
        (forward-char)
        (ebnf-ebx-any-character))
      (setq more (and (/= (following-char) ?\])
                      (< (point) ebnf-limit))))
    (when (>= (point) ebnf-limit)
      (error "Missing end of character range `]'"))
    (forward-char)
    (buffer-substring-no-properties text-start (point))))
(defun ebnf-ebx-any-character ()
  "Move point over one character of a character range.

A `#' is handed to `ebnf-ebx-hex-character' with errors disabled for a
missing `x', so a `#' that does not start `#x' counts as a plain
character.  Any other character must lie in one of the ranges tested
below (space to `~' without `#', `-' and `]', or \\240 to \\377);
otherwise an error is signaled."
  (let ((char (following-char)))
    (cond ((= char ?#)
           (ebnf-ebx-hex-character t))
          ((or (and (<= ?\s char) (<= char ?\"))       ; stops before #
               (and (<= ?$ char) (<= char ?,))         ; stops before -
               (and (<= ?. char) (<= char ?\\))        ; stops before ]
               (and (<= ?^ char) (<= char ?~))
               (and (<= ?\240 char) (<= char ?\377)))
           (forward-char))
          (t
           (error "Invalid character `%c'" char)))))
(defun ebnf-ebx-hex-character (&optional no-error)
  "Move point over a hexadecimal character, #x [0-9A-Fa-f]+.

Point must be at the `#'.  Return t when `#x' and at least one
hexadecimal digit were skipped.  When `#' is not followed by `x',
only the `#' is skipped: return NO-ERROR if it is non-nil, otherwise
signal an error.  A `#x' without digits is always an error."
  (forward-char)
  (cond
   ((= (following-char) ?x)
    (forward-char)
    (if (> (skip-chars-forward "0-9A-Fa-f" ebnf-limit) 0)
        t
      (error "Invalid hexadecimal character")))
   ;; not `#x': tolerated only on request
   (no-error)
   (t
    (error "Invalid hexadecimal character"))))
663 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
666 (provide 'ebnf-ebx)
668 ;;; arch-tag: bfe2f95b-66bc-4dc6-8b7e-b7831e68f5fb
669 ;;; ebnf-ebx.el ends here