1 ;;; ebnf-ebx.el --- parser for EBNF used to specify XML (EBNFX)
3 ;; Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
4 ;; Free Software Foundation, Inc.
6 ;; Author: Vinicius Jose Latorre <viniciusjl@ig.com.br>
7 ;; Maintainer: Vinicius Jose Latorre <viniciusjl@ig.com.br>
8 ;; Keywords: wp, ebnf, PostScript
12 ;; This file is part of GNU Emacs.
14 ;; GNU Emacs is free software: you can redistribute it and/or modify
15 ;; it under the terms of the GNU General Public License as published by
16 ;; the Free Software Foundation, either version 3 of the License, or
17 ;; (at your option) any later version.
19 ;; GNU Emacs is distributed in the hope that it will be useful,
20 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
21 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 ;; GNU General Public License for more details.
24 ;; You should have received a copy of the GNU General Public License
25 ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
29 ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
32 ;; This is part of ebnf2ps package.
34 ;; This package defines a parser for EBNF used to specify XML (EBNFX).
36 ;; See ebnf2ps.el for documentation.
43 ;; `http://www.w3.org/TR/2004/REC-xml-20040204/#sec-notation'
44 ;; (Extensible Markup Language (XML) 1.0 (Third Edition))
47 ;; rule ::= symbol '::=' expression
48 ;; /* rules are separated by at least one blank line. */
50 ;; expression ::= concatenation ('|' concatenation)*
52 ;; concatenation ::= exception*
54 ;; exception ::= term ('-' term)?
56 ;; term ::= factor ('*' | '+' | '?')?
58 ;; factor ::= hex-char+
;; | '[' '^'? ( char ( '-' char )? )+ ']'
;; | '"' 'string' '"'
;; | "'" "string" "'"
;; | '(' expression ')'
65 ;; symbol ::= 'upper or lower case letter'
66 ;; ('upper or lower case letter' | '-' | '_')*
67 ;; /* upper and lower 8-bit accentuated characters are included */
69 ;; hex-char ::= '#x' [0-9A-Fa-f]+
71 ;; char ::= hex-char | 'any character except control characters'
72 ;; /* 8-bit accentuated characters are included */
74 ;; any-char ::= char | 'newline' | 'tab'
76 ;; ignore ::= '[' ('wfc' | 'WFC' | 'vc' | 'VC') ':' ( any-char - ']' )* ']'
;; comment ::= '/*' ( any-char - '*/' )* '*/'
81 ;; Below is the Notation section extracted from the URL cited above.
85 ;; The formal grammar of XML is given in this specification using a simple
86 ;; Extended Backus-Naur Form (EBNF) notation. Each rule in the grammar defines
87 ;; one symbol, in the form
89 ;; symbol ::= expression
91 ;; Symbols are written with an initial capital letter if they are the start
92 ;; symbol of a regular language, otherwise with an initial lowercase letter.
93 ;; Literal strings are quoted.
95 ;; Within the expression on the right-hand side of a rule, the following
96 ;; expressions are used to match strings of one or more characters:
;; #xN
;; where N is a hexadecimal integer, the expression matches the character
101 ;; whose number (code point) in ISO/IEC 10646 is N. The number of leading
102 ;; zeros in the #xN form is insignificant.
104 ;; [a-zA-Z], [#xN-#xN]
106 ;; matches any Char with a value in the range(s) indicated (inclusive).
108 ;; [abc], [#xN#xN#xN]
110 ;; matches any Char with a value among the characters enumerated.
111 ;; Enumerations and ranges can be mixed in one set of brackets.
113 ;; [^a-z], [^#xN-#xN]
115 ;; matches any Char with a value outside the range indicated.
117 ;; [^abc], [^#xN#xN#xN]
119 ;; matches any Char with a value not among the characters given.
;; Enumerations and ranges of forbidden values can be mixed in one set of
;; brackets.
;; "string"
;; matches a literal string matching that given inside the double quotes.
;; 'string'
;; matches a literal string matching that given inside the single quotes.
131 ;; These symbols may be combined to match more complex patterns as follows,
132 ;; where A and B represent simple expressions:
;; (expression)
;; expression is treated as a unit and may be combined as described in this
;; list.
;; A?
;; matches A or nothing; optional A.
;; A B
;; matches A followed by B. This operator has higher precedence than
;; alternation; thus A B | C D is identical to (A B) | (C D).
;; A | B
;; matches A or B.
;; A - B
;; matches any string that matches A but does not match B.
;; A+
;; matches one or more occurrences of A. Concatenation has higher
;; precedence than alternation; thus A+ | B+ is identical to (A+) | (B+).
;; A*
;; matches zero or more occurrences of A. Concatenation has higher
;; precedence than alternation; thus A* | B* is identical to (A*) | (B*).
;; Other notations used in the productions are:
;; /* ... */
;; comment.
;; [ wfc: ... ]
;; well-formedness constraint; this identifies by name a constraint on
;; well-formed documents associated with a production.
;; [ vc: ... ]
;; validity constraint; this identifies by name a constraint on valid
;; documents associated with a production.
183 ;; Differences Between EBNFX And ebnf2ps EBNFX
184 ;; -------------------------------------------
;; Besides the characters that EBNFX accepts, ebnf2ps EBNFX also accepts the
;; underscore (_) and minus (-) in rule names, and European 8-bit accented
;; characters (from \240 to \377) in rule names, strings and comments. Also,
;; a rule name can start with an upper case letter.
192 ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(defvar ebnf-ebx-lex nil
  "Text of the lexeme most recently scanned by function `ebnf-ebx-lex'.

The lexer stores here the buffer text of each terminal or non-terminal
it recognizes; the parser reads it to name rules and to build terminal
and non-terminal nodes.")
204 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
205 ;; Syntactic analyzer
208 ;;; rulelist ::= rule+
(defun ebnf-ebx-parser (start)
  "EBNFX parser.

Parse the rules found from buffer position START up to `ebnf-limit'.
Return the list of parsed rules, most recently parsed rule first; rules
for which `ebnf-add-empty-rule-list' returns non-nil are left out.
Signal an error when the input holds no rule at all."
  ;; NOTE(review): the bindings of `bias' and `origin', the initial
  ;; `goto-char', the progress-message call and the trailing restore of
  ;; point were missing from the damaged source.  They are restored here
  ;; from the visible uses (`bias', `rule', the dangling progress
  ;; expression) -- confirm against the upstream file.
  (let ((total (+ (- ebnf-limit start) 1)) ; size of the region to parse
        (bias (1- start))                  ; makes progress start near 0%
        (origin (point))                   ; where to put point back
        rule-list token rule)
    (goto-char start)
    (setq token (ebnf-ebx-lex))
    (and (eq token 'end-of-input)
         (error "Invalid EBNFX file format"))
    ;; Tolerate blank lines before the first rule.
    (and (eq token 'end-of-rule)
         (setq token (ebnf-ebx-lex)))
    (while (not (eq token 'end-of-input))
      (ebnf-message-float
       "Parsing...%s%%"
       (/ (* (- (point) bias) 100.0) total))
      ;; `ebnf-ebx-rule' is used here as returning (NEXT-TOKEN . RULE).
      (setq token (ebnf-ebx-rule token)
            rule  (cdr token)
            token (car token))
      (or (ebnf-add-empty-rule-list rule)
          (setq rule-list (cons rule rule-list))))
    (goto-char origin)
    rule-list))
235 ;;; rule ::= symbol '::=' expression
(defun ebnf-ebx-rule (token)
  "Parse one rule of the form: symbol \\='::=\\=' expression.

TOKEN is the token already read by the caller; it must be
`non-terminal', and the rule name is taken from variable `ebnf-ebx-lex'.
Return a cons (NEXT-TOKEN . PRODUCTION), where PRODUCTION is built by
`ebnf-make-production'.
Signal an error on a bad rule name, a missing `::=' or a rule that is
not followed by an end of rule or the end of input."
  ;; NOTE(review): the bindings of `action' and `elements' and the
  ;; outer `cons' with the next token were missing from the damaged
  ;; source; restored from the visible uses and the paren balance.
  (let ((name ebnf-ebx-lex)
        (action ebnf-action)            ; action pending before this rule
        elements)
    (setq ebnf-action nil)
    (or (eq token 'non-terminal)
        (error "Invalid rule name"))
    (setq token (ebnf-ebx-lex))
    (or (eq token 'production)
        (error "Invalid rule: missing `::='"))
    ;; `ebnf-ebx-expression' is used here as returning (TOKEN . BODY).
    (setq elements (ebnf-ebx-expression))
    (or (memq (car elements) '(end-of-rule end-of-input))
        (error "Invalid rule: there is no end of rule"))
    (setq elements (cdr elements))
    (ebnf-eps-add-production name)
    (cons (ebnf-ebx-lex)
          (ebnf-make-production name elements action))))
257 ;; expression ::= concatenation ('|' concatenation)*
(defun ebnf-ebx-expression ()
  "Parse: concatenation (\\='|\\=' concatenation)*.

Read concatenations for as long as each one is followed by an
`alternative' token, then hand the collected bodies and the last
concatenation to `ebnf-token-alternative' and return its value."
  ;; NOTE(review): the `'alternative' operand of `eq' was missing from
  ;; the damaged source; it is the only token that can separate the
  ;; concatenations of this production.
  (let (body concatenation)
    (while (eq (car (setq concatenation
                          (ebnf-ebx-concatenation (ebnf-ebx-lex))))
               'alternative)
      ;; Keep only the node; the token was consumed by the test above.
      (setq body (cons (cdr concatenation) body)))
    (ebnf-token-alternative body concatenation)))
269 ;; concatenation ::= exception*
(defun ebnf-ebx-concatenation (token)
  "Parse: exception*, starting with the already read TOKEN.

Return a cons (NEXT-TOKEN . SEQUENCE), where SEQUENCE is the value of
`ebnf-token-sequence' applied to the parsed elements (collected most
recent first).
Signal an error if the first element is empty."
  ;; NOTE(review): the `seq' binding, the unpacking of each
  ;; (TOKEN . NODE) pair and the final `cons' were missing from the
  ;; damaged source; restored from the visible uses and paren balance.
  (let ((term (ebnf-ebx-exception token))
        seq)
    ;; `setq' returns its last value, so this fails on a nil node.
    (or (setq token (car term)
              term  (cdr term))
        (error "Empty element"))
    (setq seq (cons term seq))
    ;; Keep reading elements until one comes back with a nil node.
    (while (setq term  (ebnf-ebx-exception token)
                 token (car term)
                 term  (cdr term))
      (setq seq (cons term seq)))
    (cons token
          (ebnf-token-sequence seq))))
287 ;;; exception ::= term ('-' term)?
(defun ebnf-ebx-exception (token)
  "Parse: term (\\='-\\=' term)?, starting with the already read TOKEN.

Return a cons (NEXT-TOKEN . NODE).  When the first term is followed by
an `exception' token, NODE is built by `ebnf-make-except' from both
terms; otherwise the result of `ebnf-ebx-term' is returned unchanged."
  ;; NOTE(review): the `cons' carrying the next token and the else
  ;; branch were missing from the damaged source; restored from the
  ;; paren balance and from how callers unpack the result.
  (let ((term (ebnf-ebx-term token)))
    (if (eq (car term) 'exception)
        (let ((except (ebnf-ebx-term (ebnf-ebx-lex))))
          (cons (car except)
                (ebnf-make-except (cdr term) (cdr except))))
      term)))
300 ;;; term ::= factor ('*' | '+' | '?')?
(defun ebnf-ebx-term (token)
  "Parse: factor (\\='*\\=' | \\='+\\=' | \\='?\\=')?, starting with TOKEN.

Return a cons (NEXT-TOKEN . NODE).  NODE is the factor, wrapped by
`ebnf-make-zero-or-more', `ebnf-make-one-or-more' or
`ebnf-token-optional' when the matching suffix follows it.  When TOKEN
starts no factor, NODE is nil and TOKEN is returned unconsumed."
  ;; NOTE(review): the `when factor' guard was missing from the damaged
  ;; source; restored from the paren balance.  Without it a token would
  ;; be consumed even though no factor was parsed.
  (let ((factor (ebnf-ebx-factor token)))
    (when factor
      (setq token (ebnf-ebx-lex))
      (cond ((eq token 'zero-or-more)
             (setq factor (ebnf-make-zero-or-more factor)
                   token  (ebnf-ebx-lex)))
            ((eq token 'one-or-more)
             (setq factor (ebnf-make-one-or-more factor)
                   token  (ebnf-ebx-lex)))
            ((eq token 'optional)
             (setq factor (ebnf-token-optional factor)
                   token  (ebnf-ebx-lex)))))
    (cons token factor)))
319 ;;; factor ::= hex-char+
320 ;;; | '[' '^'? ( char ( '-' char )? )+ ']'
321 ;;; | '"' 'string' '"'
322 ;;; | "'" "string" "'"
323 ;;; | '(' expression ')'
326 ;;; symbol ::= 'upper or lower case letter'
327 ;;; ('upper or lower case letter' | '-' | '_')*
328 ;;; /* upper and lower 8-bit accentuated characters are included */
330 ;;; hex-char ::= '#x' [0-9A-Fa-f]+
332 ;;; char ::= hex-char | 'any character except control characters'
333 ;;; /* 8-bit accentuated characters are included */
335 ;;; any-char ::= char | 'newline' | 'tab'
(defun ebnf-ebx-factor (token)
  "Build the node for the factor introduced by TOKEN.

A `terminal' or `non-terminal' token yields a node made from the text
in variable `ebnf-ebx-lex'.  A `begin-group' token parses a whole
expression and requires an `end-group' token after it, signaling an
error otherwise.  Any other token yields nil."
  ;; NOTE(review): the `cond' head, the value of the group clause and
  ;; the default clause were missing from the damaged source; restored
  ;; from the visible clause shapes.
  (cond
   ;; terminal
   ((eq token 'terminal)
    (ebnf-make-terminal ebnf-ebx-lex))
   ;; non-terminal
   ((eq token 'non-terminal)
    (ebnf-make-non-terminal ebnf-ebx-lex))
   ;; group
   ((eq token 'begin-group)
    (let ((body (ebnf-ebx-expression)))
      (or (eq (car body) 'end-group)
          (error "Missing `)'"))
      (cdr body)))
   ;; no element
   (t
    nil)))
358 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(defconst ebnf-ebx-token-table (make-vector 256 'error)
  "Vector mapping each character code from 0 to 255 to a lexical token.

Every slot starts out as `error'; `ebnf-ebx-initialize' stores the
real token symbols.")
(defun ebnf-ebx-initialize ()
  "Initialize EBNFX token table.

Fill `ebnf-ebx-token-table' so that letters and 8-bit accentuated
characters map to `non-terminal', and the EBNFX punctuation, space and
end-of-line characters map to their own token symbols.  Slots that are
not set here keep their initial value."
  ;; control character & control 8-bit character are set to `error'
  ;; NOTE(review): the `let' binding of `char' and the two resets
  ;; between the loops were missing from the damaged source; the start
  ;; values follow the range comments below (A-Z, a-z) and the
  ;; \240-\377 range used by the constants of this file.
  (let ((char ?\101))
    ;; printable character: A-Z
    (while (< char ?\133)
      (aset ebnf-ebx-token-table char 'non-terminal)
      (setq char (1+ char)))
    ;; printable character: a-z
    (setq char ?\141)
    (while (< char ?\173)
      (aset ebnf-ebx-token-table char 'non-terminal)
      (setq char (1+ char)))
    ;; European 8-bit accentuated characters:
    (setq char ?\240)
    (while (< char ?\400)
      (aset ebnf-ebx-token-table char 'non-terminal)
      (setq char (1+ char)))
    ;; Override end of line characters:
    (aset ebnf-ebx-token-table ?\n 'end-of-rule) ; [NL] linefeed
    (aset ebnf-ebx-token-table ?\r 'end-of-rule) ; [CR] carriage return
    ;; Override space characters:
    (aset ebnf-ebx-token-table ?\013 'space) ; [VT] vertical tab
    (aset ebnf-ebx-token-table ?\t   'space) ; [HT] horizontal tab
    (aset ebnf-ebx-token-table ?\s   'space) ; [SP] space
    ;; Override form feed character:
    (aset ebnf-ebx-token-table ?\f 'form-feed) ; [FF] form feed
    ;; Override other lexical characters:
    (aset ebnf-ebx-token-table ?#  'hash)
    (aset ebnf-ebx-token-table ?\" 'double-quote)
    (aset ebnf-ebx-token-table ?\' 'single-quote)
    (aset ebnf-ebx-token-table ?\( 'begin-group)
    (aset ebnf-ebx-token-table ?\) 'end-group)
    (aset ebnf-ebx-token-table ?-  'exception)
    (aset ebnf-ebx-token-table ?:  'colon)
    (aset ebnf-ebx-token-table ?\[ 'begin-square)
    (aset ebnf-ebx-token-table ?|  'alternative)
    (aset ebnf-ebx-token-table ?*  'zero-or-more)
    (aset ebnf-ebx-token-table ?+  'one-or-more)
    (aset ebnf-ebx-token-table ?\? 'optional)
    ;; Override comment character:
    (aset ebnf-ebx-token-table ?/  'comment)))
410 ;; replace the range "\240-\377" (see `ebnf-range-regexp').
(defconst ebnf-ebx-non-terminal-chars
  ;; Characters allowed in a rule name: `-', `_', ASCII letters and the
  ;; 8-bit range \240-\377, which `ebnf-range-regexp' adds.
  (ebnf-range-regexp "-_A-Za-z" ?\240 ?\377))
(defconst ebnf-ebx-non-terminal-letter-chars
  ;; ASCII letters plus the 8-bit range \240-\377, which
  ;; `ebnf-range-regexp' adds.
  (ebnf-range-regexp "A-Za-z" ?\240 ?\377))
(defun ebnf-ebx-lex ()
  "Lexical analyzer for EBNFX.

Return a lexical token.

Spaces, comments, form feeds, single line breaks and `[wfc: ...]' /
`[vc: ...]' constraints are skipped first.  For `terminal' and
`non-terminal' tokens, the scanned text is stored in variable
`ebnf-ebx-lex'.  Return `end-of-input' once point reaches `ebnf-limit'.
Signal an error on an invalid character or a `:' that does not start
`::='.

See documentation for variable `ebnf-ebx-lex'."
  ;; NOTE(review): the damaged source had lost the `let', both `cond'
  ;; heads, several clause heads (`space', `comment', `error', `hash',
  ;; `colon', default) and every returned token symbol.  They are
  ;; restored from the clause comments, the token names stored by
  ;; `ebnf-ebx-initialize' and the tokens the parser tests for --
  ;; confirm against the upstream file.
  (if (>= (point) ebnf-limit)
      'end-of-input
    (let (token)
      ;; skip spaces and comments
      (while (if (> (following-char) 255)
                 ;; Outside the 256-slot token table: report an error
                 ;; below instead of indexing past the end of the table.
                 (progn
                   (setq token 'error)
                   nil)
               (setq token (aref ebnf-ebx-token-table (following-char)))
               ;; Each clause returns non-nil to keep skipping.
               (cond
                ((eq token 'space)
                 (skip-chars-forward " \013\t" ebnf-limit)
                 (< (point) ebnf-limit))
                ((eq token 'comment)
                 (ebnf-ebx-skip-comment))
                ((eq token 'form-feed)
                 (forward-char)
                 (setq ebnf-action 'form-feed))
                ((eq token 'end-of-rule)
                 (ebnf-ebx-skip-end-of-rule))
                ((and (eq token 'begin-square)
                      (let ((case-fold-search t))
                        (looking-at "\\[\\(wfc\\|vc\\):")))
                 (ebnf-ebx-skip-constraint))
                (t nil))))
      (cond
       ;; end of input
       ((>= (point) ebnf-limit)
        'end-of-input)
       ;; error
       ((eq token 'error)
        (error "Invalid character"))
       ;; end of rule
       ((eq token 'end-of-rule)
        'end-of-rule)
       ;; terminal: #x [0-9A-Fa-f]+
       ((eq token 'hash)
        (setq ebnf-ebx-lex (ebnf-ebx-character))
        'terminal)
       ;; terminal: "string"
       ((eq token 'double-quote)
        (setq ebnf-ebx-lex (ebnf-ebx-string ?\"))
        'terminal)
       ;; terminal: 'string'
       ((eq token 'single-quote)
        (setq ebnf-ebx-lex (ebnf-ebx-string ?\'))
        'terminal)
       ;; terminal: [ ^? ( char ( - char )? )+ ]
       ((eq token 'begin-square)
        (setq ebnf-ebx-lex (ebnf-ebx-range))
        'terminal)
       ;; non-terminal: NAME
       ((eq token 'non-terminal)
        (setq ebnf-ebx-lex
              (ebnf-buffer-substring ebnf-ebx-non-terminal-chars))
        'non-terminal)
       ;; colon: ::=
       ((eq token 'colon)
        (or (looking-at "::=")
            (error "Missing `::=' token"))
        (forward-char 3)
        'production)
       ;; miscellaneous: (, ), *, +, ?, |, -
       (t
        (forward-char)
        token)))))
493 ;; replace the range "\177-\237" (see `ebnf-range-regexp').
(defconst ebnf-ebx-constraint-chars
  ;; Negated set: everything except `]' and the control characters
  ;; \000-\010 and \016-\037, plus the range \177-\237, which
  ;; `ebnf-range-regexp' adds.
  (ebnf-range-regexp "^\000-\010\016-\037]" ?\177 ?\237))
(defun ebnf-ebx-skip-constraint ()
  "Skip a `[wfc: ...]' or `[vc: ...]' constraint starting at point.

Leave point after the closing `]' and return t, so that the skipping
loop of the lexer goes on.
Signal an error if no character can be skipped or if the constraint is
not closed by `]'."
  ;; NOTE(review): the final `forward-char' and the t return value were
  ;; missing from the damaged source; restored so that the `]' that was
  ;; just checked is consumed.
  (or (> (skip-chars-forward ebnf-ebx-constraint-chars ebnf-limit) 0)
      (error "Invalid character"))
  (or (= (following-char) ?\])
      (error "Missing end of constraint `]'"))
  (forward-char)
  t)
(defun ebnf-ebx-skip-end-of-rule ()
  "Skip line breaks, spaces and comments starting at point.

Return nil if two or more consecutive end of line characters were
skipped at some step, which marks an end of rule; return t otherwise."
  ;; NOTE(review): the `let' of `eor-p', the `while'/`progn' head and
  ;; the returned value were missing from the damaged source; restored
  ;; from the visible `setq eor-p' and the paren balance.
  (let (eor-p)
    (while (progn
             ;; end of rule ==> 2 or more consecutive end of lines
             (setq eor-p (or (> (skip-chars-forward "\r\n" ebnf-limit) 1)
                             eor-p))
             ;; skip spaces
             (skip-chars-forward " \013\t" ebnf-limit)
             ;; skip comments; loop again after each one
             (and (= (following-char) ?/)
                  (ebnf-ebx-skip-comment))))
    (not eor-p)))
522 ;; replace the range "\177-\237" (see `ebnf-range-regexp').
(defconst ebnf-ebx-comment-chars
  ;; Negated set: everything except `*' and the control characters
  ;; \000-\010 and \016-\037, plus the range \177-\237, which
  ;; `ebnf-range-regexp' adds.
  (ebnf-range-regexp "^\000-\010\016-\037\\*" ?\177 ?\237))
(defconst ebnf-ebx-filename-chars
  ;; Negated set: everything except `*' and the control characters
  ;; \000-\037, plus the range \177-\237, which `ebnf-range-regexp'
  ;; adds.
  (ebnf-range-regexp "^\000-\037\\*" ?\177 ?\237))
(defun ebnf-ebx-skip-comment ()
  "Skip a `/* ... */' comment starting at point and return t.

When `ebnf-eps-executing' is non-nil and the comment text starts with
`[', `]', `H' or `F', the text read by `ebnf-ebx-eps-filename' is
passed to the matching `ebnf-eps-' function.  Otherwise `ebnf-action'
is set from `ebnf-comment-table', indexed by the first character of
the comment text.
Signal an error if `/' is not followed by `*' or if the comment is not
closed."
  ;; NOTE(review): the `forward-char' steps, the `cond' head, the head
  ;; of the default clause, the `while'/`progn' head and the t return
  ;; value were missing from the damaged source; restored from the
  ;; visible checks, each of which needs point advanced first.
  (forward-char)                        ; over `/'
  (or (= (following-char) ?*)
      (error "Invalid beginning of comment"))
  (forward-char)                        ; over `*'
  (cond
   ;; open EPS file
   ((and ebnf-eps-executing (= (following-char) ?\[))
    (ebnf-eps-add-context (ebnf-ebx-eps-filename)))
   ;; close EPS file
   ((and ebnf-eps-executing (= (following-char) ?\]))
    (ebnf-eps-remove-context (ebnf-ebx-eps-filename)))
   ;; EPS header
   ((and ebnf-eps-executing (= (following-char) ?H))
    (ebnf-eps-header-comment (ebnf-ebx-eps-filename)))
   ;; EPS footer
   ((and ebnf-eps-executing (= (following-char) ?F))
    (ebnf-eps-footer-comment (ebnf-ebx-eps-filename)))
   ;; any other action in comment
   (t
    (setq ebnf-action (aref ebnf-comment-table (following-char)))))
  ;; Move from `*' to `*' until one is directly followed by `/'.
  (while (progn
           (skip-chars-forward ebnf-ebx-comment-chars ebnf-limit)
           (or (= (following-char) ?*)
               (error "Missing end of comment"))
           (forward-char)
           (and (/= (following-char) ?/)
                (< (point) ebnf-limit))))
  ;; check for a valid end of comment
  (and (>= (point) ebnf-limit)
       (error "Missing end of comment"))
  (forward-char)                        ; over the closing `/'
  t)
(defun ebnf-ebx-eps-filename ()
  "Read the text that follows the marker character of an EPS comment.

Skip the character at point, then collect text up to the end of the
comment and return it as a string.  Runs of `*' that are not directly
followed by `/' belong to the text; of a run directly followed by `/',
all but the last `*' belong to the text.  In that case point is left on
that last `*', so that the caller can match the closing `*/'."
  ;; NOTE(review): the initial `forward-char', the `let' of `fname' and
  ;; `nchar', the `while'/`progn' head and the reset of `nchar' were
  ;; missing from the damaged source; restored from the visible uses.
  (forward-char)
  (let (fname nchar)
    (while (progn
             (setq fname
                   (concat fname
                           (ebnf-buffer-substring ebnf-ebx-filename-chars)))
             ;; Loop again only for stars that do not end the comment.
             (and (< (point) ebnf-limit)
                  (> (setq nchar (skip-chars-forward "*" ebnf-limit)) 0)
                  (< (point) ebnf-limit)
                  (/= (following-char) ?/)))
      (setq fname (concat fname (make-string nchar ?*))
            nchar nil))
    (if (or (not nchar) (= nchar 0))
        fname
      (when (and (< (point) ebnf-limit)
                 (= (following-char) ?/))
        ;; The last star is the `*' of `*/', not part of the text.
        (setq nchar (1- nchar))
        ;; Step back onto that star.  Leaving point on `/' would make
        ;; the comment scan in `ebnf-ebx-skip-comment' skip the `/' and
        ;; look for another `*' beyond the end of this comment.
        (backward-char))
      (concat fname (make-string nchar ?*)))))
586 ;; replace the range "\240-\377" (see `ebnf-range-regexp').
(defconst ebnf-ebx-double-string-chars
  ;; Tab and the printable ASCII characters except `\"', plus the 8-bit
  ;; range \240-\377, which `ebnf-range-regexp' adds.
  (ebnf-range-regexp "\t -!#-~" ?\240 ?\377))
(defconst ebnf-ebx-single-string-chars
  ;; Tab and the printable ASCII characters except `\='', plus the
  ;; 8-bit range \240-\377, which `ebnf-range-regexp' adds.
  (ebnf-range-regexp "\t -&(-~" ?\240 ?\377))
(defun ebnf-ebx-string (delim)
  "Read the string that starts at point and is delimited by DELIM.

DELIM is the quote character, `\"' or `\\=''; point must be on the
opening one.  Return the text between the delimiters, without text
properties, and leave point after the closing delimiter.
Signal an error if the closing delimiter is missing."
  ;; NOTE(review): the start-position form and the `prog1' that steps
  ;; over the closing delimiter were missing from the damaged source;
  ;; restored from the two-argument shape of
  ;; `buffer-substring-no-properties'.
  (buffer-substring-no-properties
   (progn
     (forward-char)                     ; over the opening delimiter
     (point))
   (progn
     (skip-chars-forward (if (= delim ?\")
                             ebnf-ebx-double-string-chars
                           ebnf-ebx-single-string-chars)
                         ebnf-limit)
     (or (= (following-char) delim)
         (error "Missing string delimiter `%c'" delim))
     (prog1
         (point)
       (forward-char)))))               ; over the closing delimiter
(defun ebnf-ebx-character ()
  "Read the hexadecimal character at point: #x [0-9A-Fa-f]+.

Return its text, without text properties, and leave point after it.
Errors are those of `ebnf-ebx-hex-character'."
  ;; NOTE(review): the start `(point)' and the `progn' returning the
  ;; end position were missing from the damaged source; restored from
  ;; the two-argument shape of `buffer-substring-no-properties'.
  (buffer-substring-no-properties
   (point)
   (progn
     (ebnf-ebx-hex-character)
     (point))))
(defun ebnf-ebx-range ()
  "Read the character range at point: [ ^? ( char ( - char )? )+ ].

Point must be on the opening `['.  Return the text of the whole range,
brackets included and without text properties, and leave point after
the closing `]'.
Signal an error if the range is not closed by `]'."
  ;; NOTE(review): the start `(point)', the `forward-char' steps and
  ;; the `while'/`progn' head were missing from the damaged source;
  ;; restored from the visible checks, each of which needs point
  ;; advanced when it succeeds.
  (buffer-substring-no-properties
   (point)
   (progn
     (forward-char)                     ; over `['
     (and (= (following-char) ?^)
          (forward-char))
     ;; A `-' right at the start is taken literally.
     (and (= (following-char) ?-)
          (forward-char))
     (while (progn
              (ebnf-ebx-any-character)
              (when (= (following-char) ?-)
                (forward-char)
                (ebnf-ebx-any-character))
              (and (/= (following-char) ?\])
                   (< (point) ebnf-limit))))
     (and (>= (point) ebnf-limit)
          (error "Missing end of character range `]'"))
     (forward-char)                     ; over `]'
     (point))))
(defun ebnf-ebx-any-character ()
  "Move point over one character of a character range.

A `#' is handed to `ebnf-ebx-hex-character', with errors disabled for a
`#' that is not followed by `x'.  Otherwise the character must be a
printable ASCII character other than `-' and `]', or lie in the 8-bit
range \\240-\\377.
Signal an error on any other character."
  ;; NOTE(review): the `cond' head with the `#' test, the
  ;; `forward-char' of the accepting clause and the head of the default
  ;; clause were missing from the damaged source; restored from the
  ;; visible clause bodies.
  (let ((char (following-char)))
    (cond ((= char ?#)
           (ebnf-ebx-hex-character t))
          ;; The end-of-line comments name the character left out
          ;; between one interval and the next.
          ((or (and (<= ?\s   char) (<= char ?\")) ; #
               (and (<= ?$    char) (<= char ?,))  ; -
               (and (<= ?.    char) (<= char ?\\)) ; ]
               (and (<= ?^    char) (<= char ?~))
               (and (<= ?\240 char) (<= char ?\377)))
           (forward-char))
          (t
           (error "Invalid character `%c'" char)))))
(defun ebnf-ebx-hex-character (&optional no-error)
  "Move point over the hexadecimal character at point: #x [0-9A-Fa-f]+.

Point must be on the `#'.  If `x' does not follow it, signal an error,
unless NO-ERROR is non-nil: then return NO-ERROR and leave point just
after the `#'.
Signal an error if no hexadecimal digit follows `#x'."
  ;; NOTE(review): both `forward-char' steps and the `or no-error'
  ;; wrapper were missing from the damaged source; restored from the
  ;; otherwise unused NO-ERROR argument and the paren balance.
  (forward-char)                        ; over `#'
  (if (/= (following-char) ?x)
      (or no-error
          (error "Invalid hexadecimal character"))
    (forward-char)                      ; over `x'
    (or (> (skip-chars-forward "0-9A-Fa-f" ebnf-limit) 0)
        (error "Invalid hexadecimal character"))))
667 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
672 ;; arch-tag: bfe2f95b-66bc-4dc6-8b7e-b7831e68f5fb
673 ;;; ebnf-ebx.el ends here