1 ;;; ebnf-ebx.el --- parser for EBNF used to specify XML (EBNFX)
3 ;; Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007
4 ;; Free Software Foundation, Inc.
6 ;; Author: Vinicius Jose Latorre <viniciusjl@ig.com.br>
7 ;; Maintainer: Vinicius Jose Latorre <viniciusjl@ig.com.br>
8 ;; Keywords: wp, ebnf, PostScript
11 ;; This file is part of GNU Emacs.
13 ;; GNU Emacs is free software; you can redistribute it and/or modify
14 ;; it under the terms of the GNU General Public License as published by
15 ;; the Free Software Foundation; either version 3, or (at your option)
18 ;; GNU Emacs is distributed in the hope that it will be useful,
19 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
20 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 ;; GNU General Public License for more details.
23 ;; You should have received a copy of the GNU General Public License
24 ;; along with GNU Emacs; see the file COPYING. If not, write to the
25 ;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
26 ;; Boston, MA 02110-1301, USA.
30 ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
33 ;; This is part of ebnf2ps package.
35 ;; This package defines a parser for EBNF used to specify XML (EBNFX).
37 ;; See ebnf2ps.el for documentation.
44 ;; `http://www.w3.org/TR/2004/REC-xml-20040204/#sec-notation'
45 ;; (Extensible Markup Language (XML) 1.0 (Third Edition))
48 ;; rule ::= symbol '::=' expression
49 ;; /* rules are separated by at least one blank line. */
51 ;; expression ::= concatenation ('|' concatenation)*
53 ;; concatenation ::= exception*
55 ;; exception ::= term ('-' term)?
57 ;; term ::= factor ('*' | '+' | '?')?
59 ;; factor ::= hex-char+
60 ;; | '[' '^'? ( char ( '-' char )? )+ ']'
63 ;; | '(' expression ')'
66 ;; symbol ::= 'upper or lower case letter'
67 ;; ('upper or lower case letter' | '-' | '_')*
68 ;; /* upper and lower 8-bit accentuated characters are included */
70 ;; hex-char ::= '#x' [0-9A-Fa-f]+
72 ;; char ::= hex-char | 'any character except control characters'
73 ;; /* 8-bit accentuated characters are included */
75 ;; any-char ::= char | 'newline' | 'tab'
77 ;; ignore ::= '[' ('wfc' | 'WFC' | 'vc' | 'VC') ':' ( any-char - ']' )* ']'
79 ;; comment ::= '/*' ( any-char - '*/' ) '*/'
82 ;; Below is the Notation section extracted from the URL cited above.
86 ;; The formal grammar of XML is given in this specification using a simple
87 ;; Extended Backus-Naur Form (EBNF) notation. Each rule in the grammar defines
88 ;; one symbol, in the form
90 ;; symbol ::= expression
92 ;; Symbols are written with an initial capital letter if they are the start
93 ;; symbol of a regular language, otherwise with an initial lowercase letter.
94 ;; Literal strings are quoted.
96 ;; Within the expression on the right-hand side of a rule, the following
97 ;; expressions are used to match strings of one or more characters:
101 ;; where N is a hexadecimal integer, the expression matches the character
102 ;; whose number (code point) in ISO/IEC 10646 is N. The number of leading
103 ;; zeros in the #xN form is insignificant.
105 ;; [a-zA-Z], [#xN-#xN]
107 ;; matches any Char with a value in the range(s) indicated (inclusive).
109 ;; [abc], [#xN#xN#xN]
111 ;; matches any Char with a value among the characters enumerated.
112 ;; Enumerations and ranges can be mixed in one set of brackets.
114 ;; [^a-z], [^#xN-#xN]
116 ;; matches any Char with a value outside the range indicated.
118 ;; [^abc], [^#xN#xN#xN]
120 ;; matches any Char with a value not among the characters given.
121 ;; Enumerations and ranges of forbidden values can be mixed in one set of
126 ;; matches a literal string matching that given inside the double quotes.
130 ;; matches a literal string matching that given inside the single quotes.
132 ;; These symbols may be combined to match more complex patterns as follows,
133 ;; where A and B represent simple expressions:
137 ;; expression is treated as a unit and may be combined as described in this
142 ;; matches A or nothing; optional A.
146 ;; matches A followed by B. This operator has higher precedence than
147 ;; alternation; thus A B | C D is identical to (A B) | (C D).
155 ;; matches any string that matches A but does not match B.
159 ;; matches one or more occurrences of A. Concatenation has higher
160 ;; precedence than alternation; thus A+ | B+ is identical to (A+) | (B+).
164 ;; matches zero or more occurrences of A. Concatenation has higher
165 ;; precedence than alternation; thus A* | B* is identical to (A*) | (B*).
167 ;; Other notations used in the productions are:
175 ;; well-formedness constraint; this identifies by name a constraint on
176 ;; well-formed documents associated with a production.
180 ;; validity constraint; this identifies by name a constraint on valid
181 ;; documents associated with a production.
184 ;; Differences Between EBNFX And ebnf2ps EBNFX
185 ;; -------------------------------------------
187 ;; Besides the characters that EBNFX accepts, ebnf2ps EBNFX accepts also the
188 ;; underscore (_) and minus (-) for rule name and european 8-bit accentuated
189 ;; characters (from \240 to \377) for rule name, string and comment. Also
190 ;; rule name can start with upper case letter.
193 ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
201 (defvar ebnf-ebx-lex nil
202 "Value returned by `ebnf-ebx-lex' function.")
205 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
206 ;; Syntactic analyzer
209 ;;; rulelist ::= rule+
211 (defun ebnf-ebx-parser (start)
213 (let ((total (+ (- ebnf-limit start
) 1))
216 rule-list token rule
)
218 (setq token
(ebnf-ebx-lex))
219 (and (eq token
'end-of-input
)
220 (error "Invalid EBNFX file format"))
221 (and (eq token
'end-of-rule
)
222 (setq token
(ebnf-ebx-lex)))
223 (while (not (eq token
'end-of-input
))
226 (/ (* (- (point) bias
) 100.0) total
))
227 (setq token
(ebnf-ebx-rule token
)
230 (or (ebnf-add-empty-rule-list rule
)
231 (setq rule-list
(cons rule rule-list
))))
236 ;;; rule ::= symbol '::=' expression
239 (defun ebnf-ebx-rule (token)
240 (let ((name ebnf-ebx-lex
)
243 (setq ebnf-action nil
)
244 (or (eq token
'non-terminal
)
245 (error "Invalid rule name"))
246 (setq token
(ebnf-ebx-lex))
247 (or (eq token
'production
)
248 (error "Invalid rule: missing `::='"))
249 (setq elements
(ebnf-ebx-expression))
250 (or (memq (car elements
) '(end-of-rule end-of-input
))
251 (error "Invalid rule: there is no end of rule"))
252 (setq elements
(cdr elements
))
253 (ebnf-eps-add-production name
)
255 (ebnf-make-production name elements action
))))
258 ;; expression ::= concatenation ('|' concatenation)*
261 (defun ebnf-ebx-expression ()
262 (let (body concatenation
)
263 (while (eq (car (setq concatenation
264 (ebnf-ebx-concatenation (ebnf-ebx-lex))))
266 (setq body
(cons (cdr concatenation
) body
)))
267 (ebnf-token-alternative body concatenation
)))
270 ;; concatenation ::= exception*
273 (defun ebnf-ebx-concatenation (token)
274 (let ((term (ebnf-ebx-exception token
))
276 (or (setq token
(car term
)
278 (error "Empty element"))
279 (setq seq
(cons term seq
))
280 (while (setq term
(ebnf-ebx-exception token
)
283 (setq seq
(cons term seq
)))
285 (ebnf-token-sequence seq
))))
288 ;;; exception ::= term ('-' term)?
291 (defun ebnf-ebx-exception (token)
292 (let ((term (ebnf-ebx-term token
)))
293 (if (eq (car term
) 'exception
)
294 (let ((except (ebnf-ebx-term (ebnf-ebx-lex))))
296 (ebnf-make-except (cdr term
) (cdr except
))))
301 ;;; term ::= factor ('*' | '+' | '?')?
304 (defun ebnf-ebx-term (token)
305 (let ((factor (ebnf-ebx-factor token
)))
307 (setq token
(ebnf-ebx-lex))
308 (cond ((eq token
'zero-or-more
)
309 (setq factor
(ebnf-make-zero-or-more factor
)
310 token
(ebnf-ebx-lex)))
311 ((eq token
'one-or-more
)
312 (setq factor
(ebnf-make-one-or-more factor
)
313 token
(ebnf-ebx-lex)))
314 ((eq token
'optional
)
315 (setq factor
(ebnf-token-optional factor
)
316 token
(ebnf-ebx-lex)))))
317 (cons token factor
)))
320 ;;; factor ::= hex-char+
321 ;;; | '[' '^'? ( char ( '-' char )? )+ ']'
322 ;;; | '"' 'string' '"'
323 ;;; | "'" "string" "'"
324 ;;; | '(' expression ')'
327 ;;; symbol ::= 'upper or lower case letter'
328 ;;; ('upper or lower case letter' | '-' | '_')*
329 ;;; /* upper and lower 8-bit accentuated characters are included */
331 ;;; hex-char ::= '#x' [0-9A-Fa-f]+
333 ;;; char ::= hex-char | 'any character except control characters'
334 ;;; /* 8-bit accentuated characters are included */
336 ;;; any-char ::= char | 'newline' | 'tab'
339 (defun ebnf-ebx-factor (token)
342 ((eq token
'terminal
)
343 (ebnf-make-terminal ebnf-ebx-lex
))
345 ((eq token
'non-terminal
)
346 (ebnf-make-non-terminal ebnf-ebx-lex
))
348 ((eq token
'begin-group
)
349 (let ((body (ebnf-ebx-expression)))
350 (or (eq (car body
) 'end-group
)
351 (error "Missing `)'"))
359 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
363 (defconst ebnf-ebx-token-table
(make-vector 256 'error
)
364 "Vector used to map characters to a lexical token.")
367 (defun ebnf-ebx-initialize ()
368 "Initialize EBNFX token table."
369 ;; control character & control 8-bit character are set to `error'
371 ;; printable character: A-Z
372 (while (< char ?
\133)
373 (aset ebnf-ebx-token-table char
'non-terminal
)
374 (setq char
(1+ char
)))
375 ;; printable character: a-z
377 (while (< char ?
\173)
378 (aset ebnf-ebx-token-table char
'non-terminal
)
379 (setq char
(1+ char
)))
380 ;; European 8-bit accentuated characters:
382 (while (< char ?
\400)
383 (aset ebnf-ebx-token-table char
'non-terminal
)
384 (setq char
(1+ char
)))
385 ;; Override end of line characters:
386 (aset ebnf-ebx-token-table ?
\n 'end-of-rule
) ; [NL] linefeed
387 (aset ebnf-ebx-token-table ?
\r 'end-of-rule
) ; [CR] carriage return
388 ;; Override space characters:
389 (aset ebnf-ebx-token-table ?
\013 'space
) ; [VT] vertical tab
390 (aset ebnf-ebx-token-table ?
\t 'space
) ; [HT] horizontal tab
391 (aset ebnf-ebx-token-table ?\
'space
) ; [SP] space
392 ;; Override form feed character:
393 (aset ebnf-ebx-token-table ?
\f 'form-feed
) ; [FF] form feed
394 ;; Override other lexical characters:
395 (aset ebnf-ebx-token-table ?
# 'hash
)
396 (aset ebnf-ebx-token-table ?
\" 'double-quote
)
397 (aset ebnf-ebx-token-table ?
\' 'single-quote
)
398 (aset ebnf-ebx-token-table ?\
( 'begin-group
)
399 (aset ebnf-ebx-token-table ?\
) 'end-group
)
400 (aset ebnf-ebx-token-table ?-
'exception
)
401 (aset ebnf-ebx-token-table ?
: 'colon
)
402 (aset ebnf-ebx-token-table ?\
[ 'begin-square
)
403 (aset ebnf-ebx-token-table ?|
'alternative
)
404 (aset ebnf-ebx-token-table ?
* 'zero-or-more
)
405 (aset ebnf-ebx-token-table ?
+ 'one-or-more
)
406 (aset ebnf-ebx-token-table ?
\? 'optional
)
407 ;; Override comment character:
408 (aset ebnf-ebx-token-table ?
/ 'comment
)))
411 ;; replace the range "\240-\377" (see `ebnf-range-regexp').
412 (defconst ebnf-ebx-non-terminal-chars
413 (ebnf-range-regexp "-_A-Za-z" ?
\240 ?
\377))
414 (defconst ebnf-ebx-non-terminal-letter-chars
415 (ebnf-range-regexp "A-Za-z" ?
\240 ?
\377))
418 (defun ebnf-ebx-lex ()
419 "Lexical analyzer for EBNFX.
421 Return a lexical token.
423 See documentation for variable `ebnf-ebx-lex'."
424 (if (>= (point) ebnf-limit
)
427 ;; skip spaces and comments
428 (while (if (> (following-char) 255)
432 (setq token
(aref ebnf-ebx-token-table
(following-char)))
435 (skip-chars-forward " \013\t" ebnf-limit
)
436 (< (point) ebnf-limit
))
438 (ebnf-ebx-skip-comment))
439 ((eq token
'form-feed
)
441 (setq ebnf-action
'form-feed
))
442 ((eq token
'end-of-rule
)
443 (ebnf-ebx-skip-end-of-rule))
444 ((and (eq token
'begin-square
)
445 (let ((case-fold-search t
))
446 (looking-at "\\[\\(wfc\\|vc\\):")))
447 (ebnf-ebx-skip-constraint))
452 ((>= (point) ebnf-limit
)
456 (error "Invalid character"))
458 ((eq token
'end-of-rule
)
460 ;; terminal: #x [0-9A-Fa-f]+
462 (setq ebnf-ebx-lex
(ebnf-ebx-character))
464 ;; terminal: "string"
465 ((eq token
'double-quote
)
466 (setq ebnf-ebx-lex
(ebnf-ebx-string ?
\"))
468 ;; terminal: 'string'
469 ((eq token
'single-quote
)
470 (setq ebnf-ebx-lex
(ebnf-ebx-string ?
\'))
472 ;; terminal: [ ^? ( char ( - char )? )+ ]
473 ((eq token
'begin-square
)
474 (setq ebnf-ebx-lex
(ebnf-ebx-range))
476 ;; non-terminal: NAME
477 ((eq token
'non-terminal
)
479 (ebnf-buffer-substring ebnf-ebx-non-terminal-chars
))
483 (or (looking-at "::=")
484 (error "Missing `::=' token"))
487 ;; miscellaneous: (, ), *, +, ?, |, -
494 ;; replace the range "\177-\237" (see `ebnf-range-regexp').
495 (defconst ebnf-ebx-constraint-chars
496 (ebnf-range-regexp "^\000-\010\016-\037]" ?
\177 ?
\237))
499 (defun ebnf-ebx-skip-constraint ()
500 (or (> (skip-chars-forward ebnf-ebx-constraint-chars ebnf-limit
) 0)
501 (error "Invalid character"))
502 (or (= (following-char) ?\
])
503 (error "Missing end of constraint `]'"))
509 (defun ebnf-ebx-skip-end-of-rule ()
512 ;; end of rule ==> 2 or more consecutive end of lines
513 (setq eor-p
(or (> (skip-chars-forward "\r\n" ebnf-limit
) 1)
516 (skip-chars-forward " \013\t" ebnf-limit
)
518 (and (= (following-char) ?
/)
519 (ebnf-ebx-skip-comment))))
523 ;; replace the range "\177-\237" (see `ebnf-range-regexp').
524 (defconst ebnf-ebx-comment-chars
525 (ebnf-range-regexp "^\000-\010\016-\037\\*" ?
\177 ?
\237))
526 (defconst ebnf-ebx-filename-chars
527 (ebnf-range-regexp "^\000-\037\\*" ?
\177 ?
\237))
530 (defun ebnf-ebx-skip-comment ()
532 (or (= (following-char) ?
*)
533 (error "Invalid beginning of comment"))
537 ((and ebnf-eps-executing
(= (following-char) ?\
[))
538 (ebnf-eps-add-context (ebnf-ebx-eps-filename)))
540 ((and ebnf-eps-executing
(= (following-char) ?\
]))
541 (ebnf-eps-remove-context (ebnf-ebx-eps-filename)))
543 ((and ebnf-eps-executing
(= (following-char) ?H
))
544 (ebnf-eps-header-comment (ebnf-ebx-eps-filename)))
546 ((and ebnf-eps-executing
(= (following-char) ?F
))
547 (ebnf-eps-footer-comment (ebnf-ebx-eps-filename)))
548 ;; any other action in comment
550 (setq ebnf-action
(aref ebnf-comment-table
(following-char))))
553 (skip-chars-forward ebnf-ebx-comment-chars ebnf-limit
)
554 (or (= (following-char) ?
*)
555 (error "Missing end of comment"))
557 (and (/= (following-char) ?
/)
558 (< (point) ebnf-limit
))))
559 ;; check for a valid end of comment
560 (and (>= (point) ebnf-limit
)
561 (error "Missing end of comment"))
566 (defun ebnf-ebx-eps-filename ()
572 (ebnf-buffer-substring ebnf-ebx-filename-chars
)))
573 (and (< (point) ebnf-limit
)
574 (> (setq nchar
(skip-chars-forward "*" ebnf-limit
)) 0)
575 (< (point) ebnf-limit
)
576 (/= (following-char) ?
/)))
577 (setq fname
(concat fname
(make-string nchar ?
*))
579 (if (or (not nchar
) (= nchar
0))
581 (and (< (point) ebnf-limit
)
582 (= (following-char) ?
/)
583 (setq nchar
(1- nchar
)))
584 (concat fname
(make-string nchar ?
*)))))
587 ;; replace the range "\240-\377" (see `ebnf-range-regexp').
588 (defconst ebnf-ebx-double-string-chars
589 (ebnf-range-regexp "\t -!#-~" ?
\240 ?
\377))
590 (defconst ebnf-ebx-single-string-chars
591 (ebnf-range-regexp "\t -&(-~" ?
\240 ?
\377))
594 (defun ebnf-ebx-string (delim)
595 (buffer-substring-no-properties
600 (skip-chars-forward (if (= delim ?
\")
601 ebnf-ebx-double-string-chars
602 ebnf-ebx-single-string-chars
)
604 (or (= (following-char) delim
)
605 (error "Missing string delimiter `%c'" delim
))
611 (defun ebnf-ebx-character ()
613 (buffer-substring-no-properties
616 (ebnf-ebx-hex-character)
620 (defun ebnf-ebx-range ()
621 ;; [ ^? ( char ( - char )? )+ ]
622 (buffer-substring-no-properties
626 (and (= (following-char) ?^
)
628 (and (= (following-char) ?-
)
631 (ebnf-ebx-any-character)
632 (when (= (following-char) ?-
)
634 (ebnf-ebx-any-character))
635 (and (/= (following-char) ?\
])
636 (< (point) ebnf-limit
))))
637 (and (>= (point) ebnf-limit
)
638 (error "Missing end of character range `]'"))
643 (defun ebnf-ebx-any-character ()
644 (let ((char (following-char)))
646 (ebnf-ebx-hex-character t
))
647 ((or (and (<= ?\ char
) (<= char ?
\")) ; #
648 (and (<= ?$ char
) (<= char ?
,)) ; -
649 (and (<= ?. char
) (<= char ?
\\)) ; ]
650 (and (<= ?^ char
) (<= char ?~
))
651 (and (<= ?
\240 char
) (<= char ?
\377)))
654 (error "Invalid character `%c'" char
)))))
657 (defun ebnf-ebx-hex-character (&optional no-error
)
660 (if (/= (following-char) ?x
)
662 (error "Invalid hexadecimal character"))
664 (or (> (skip-chars-forward "0-9A-Fa-f" ebnf-limit
) 0)
665 (error "Invalid hexadecimal character"))))
668 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
673 ;;; arch-tag: bfe2f95b-66bc-4dc6-8b7e-b7831e68f5fb
674 ;;; ebnf-ebx.el ends here