1 ;;; ebnf-ebx.el --- parser for EBNF used to specify XML (EBNFX)
3 ;; Copyright (C) 2001-2015 Free Software Foundation, Inc.
5 ;; Author: Vinicius Jose Latorre <viniciusjl@ig.com.br>
6 ;; Maintainer: Vinicius Jose Latorre <viniciusjl@ig.com.br>
7 ;; Keywords: wp, ebnf, PostScript
11 ;; This file is part of GNU Emacs.
13 ;; GNU Emacs is free software: you can redistribute it and/or modify
14 ;; it under the terms of the GNU General Public License as published by
15 ;; the Free Software Foundation, either version 3 of the License, or
16 ;; (at your option) any later version.
18 ;; GNU Emacs is distributed in the hope that it will be useful,
19 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
20 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 ;; GNU General Public License for more details.
23 ;; You should have received a copy of the GNU General Public License
24 ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
28 ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
31 ;; This is part of ebnf2ps package.
33 ;; This package defines a parser for EBNF used to specify XML (EBNFX).
35 ;; See ebnf2ps.el for documentation.
42 ;; `http://www.w3.org/TR/2004/REC-xml-20040204/#sec-notation'
43 ;; (Extensible Markup Language (XML) 1.0 (Third Edition))
46 ;; rule ::= symbol '::=' expression
47 ;; /* rules are separated by at least one blank line. */
49 ;; expression ::= concatenation ('|' concatenation)*
51 ;; concatenation ::= exception*
53 ;; exception ::= term ('-' term)?
55 ;; term ::= factor ('*' | '+' | '?')?
57 ;; factor ::= hex-char+
58 ;; | '[' '^'? ( char ( '-' char )? )+ ']'
61 ;; | '(' expression ')'
64 ;; symbol ::= 'upper or lower case letter'
65 ;; ('upper or lower case letter' | '-' | '_')*
66 ;; /* upper and lower 8-bit accentuated characters are included */
68 ;; hex-char ::= '#x' [0-9A-Fa-f]+
70 ;; char ::= hex-char | 'any character except control characters'
71 ;; /* 8-bit accentuated characters are included */
73 ;; any-char ::= char | 'newline' | 'tab'
75 ;; ignore ::= '[' ('wfc' | 'WFC' | 'vc' | 'VC') ':' ( any-char - ']' )* ']'
77 ;; comment ::= '/*' ( any-char - '*/' )* '*/'
80 ;; Below is the Notation section extracted from the URL cited above.
84 ;; The formal grammar of XML is given in this specification using a simple
85 ;; Extended Backus-Naur Form (EBNF) notation. Each rule in the grammar defines
86 ;; one symbol, in the form
88 ;; symbol ::= expression
90 ;; Symbols are written with an initial capital letter if they are the start
91 ;; symbol of a regular language, otherwise with an initial lowercase letter.
92 ;; Literal strings are quoted.
94 ;; Within the expression on the right-hand side of a rule, the following
95 ;; expressions are used to match strings of one or more characters:
99 ;; where N is a hexadecimal integer, the expression matches the character
100 ;; whose number (code point) in ISO/IEC 10646 is N. The number of leading
101 ;; zeros in the #xN form is insignificant.
103 ;; [a-zA-Z], [#xN-#xN]
105 ;; matches any Char with a value in the range(s) indicated (inclusive).
107 ;; [abc], [#xN#xN#xN]
109 ;; matches any Char with a value among the characters enumerated.
110 ;; Enumerations and ranges can be mixed in one set of brackets.
112 ;; [^a-z], [^#xN-#xN]
114 ;; matches any Char with a value outside the range indicated.
116 ;; [^abc], [^#xN#xN#xN]
118 ;; matches any Char with a value not among the characters given.
119 ;; Enumerations and ranges of forbidden values can be mixed in one set of
;; brackets.
124 ;; matches a literal string matching that given inside the double quotes.
128 ;; matches a literal string matching that given inside the single quotes.
130 ;; These symbols may be combined to match more complex patterns as follows,
131 ;; where A and B represent simple expressions:
135 ;; expression is treated as a unit and may be combined as described in this
;; list.
140 ;; matches A or nothing; optional A.
144 ;; matches A followed by B. This operator has higher precedence than
145 ;; alternation; thus A B | C D is identical to (A B) | (C D).
153 ;; matches any string that matches A but does not match B.
157 ;; matches one or more occurrences of A. Concatenation has higher
158 ;; precedence than alternation; thus A+ | B+ is identical to (A+) | (B+).
162 ;; matches zero or more occurrences of A. Concatenation has higher
163 ;; precedence than alternation; thus A* | B* is identical to (A*) | (B*).
165 ;; Other notations used in the productions are:
173 ;; well-formedness constraint; this identifies by name a constraint on
174 ;; well-formed documents associated with a production.
178 ;; validity constraint; this identifies by name a constraint on valid
179 ;; documents associated with a production.
182 ;; Differences Between EBNFX And ebnf2ps EBNFX
183 ;; -------------------------------------------
185 ;; Besides the characters that EBNFX accepts, ebnf2ps EBNFX also accepts the
186 ;; underscore (_) and minus (-) in rule names, and European 8-bit accented
187 ;; characters (from \240 to \377) in rule names, strings and comments. Also,
188 ;; a rule name can start with an upper case letter.
191 ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Shared lexer state.  The function `ebnf-ebx-lex' stores the text of
;; hex-character, string and range terminals here with `setq'; the parser
;; functions read it back when they capture a rule name or build terminal
;; and non-terminal nodes.
199 (defvar ebnf-ebx-lex nil
200 "Value returned by `ebnf-ebx-lex' function.")
203 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
204 ;; Syntactic analyzer
207 ;;; rulelist ::= rule+
;; Entry point of the syntactic analyzer.  START is combined with
;; `ebnf-limit' to compute TOTAL, the divisor of the percentage expression
;; inside the loop.
;; Visible flow: fetch the first token; signal "Invalid EBNFX file format"
;; when the input is already exhausted; skip one leading `end-of-rule';
;; then loop until `end-of-input', handing the current token to
;; `ebnf-ebx-rule' and consing RULE onto RULE-LIST unless
;; `ebnf-add-empty-rule-list' returns non-nil.
;; NOTE(review): several lines of this definition are absent from this view
;; (the binding of `bias', the form that consumes the percentage value, the
;; assignment of RULE, and the value returned) -- confirm against the
;; complete file before relying on this summary.
209 (defun ebnf-ebx-parser (start)
211 (let ((total (+ (- ebnf-limit start
) 1))
214 rule-list token rule
)
216 (setq token
(ebnf-ebx-lex))
217 (and (eq token
'end-of-input
)
218 (error "Invalid EBNFX file format"))
219 (and (eq token
'end-of-rule
)
220 (setq token
(ebnf-ebx-lex)))
221 (while (not (eq token
'end-of-input
))
224 (/ (* (- (point) bias
) 100.0) total
))
225 (setq token
(ebnf-ebx-rule token
)
228 (or (ebnf-add-empty-rule-list rule
)
229 (setq rule-list
(cons rule rule-list
))))
234 ;;; rule ::= symbol '::=' expression
;; Parse a single rule.  TOKEN is the token the caller has already read.
;; NAME captures the current value of the variable `ebnf-ebx-lex' before
;; any further lexing happens.  Checks, in order:
;;   - TOKEN must be `non-terminal', else "Invalid rule name";
;;   - the next token must be `production', else
;;     "Invalid rule: missing `::='";
;;   - the car of the expression result must be `end-of-rule' or
;;     `end-of-input', else "Invalid rule: there is no end of rule".
;; ELEMENTS is then reduced to the cdr of that result, NAME is passed to
;; `ebnf-eps-add-production', and `ebnf-make-production' is called with
;; NAME, ELEMENTS and ACTION.
;; NOTE(review): the bindings of `elements' and `action' and the form that
;; encloses the final `ebnf-make-production' call are not visible here, so
;; the exact return value cannot be stated from this view.
237 (defun ebnf-ebx-rule (token)
238 (let ((name ebnf-ebx-lex
)
241 (setq ebnf-action nil
)
242 (or (eq token
'non-terminal
)
243 (error "Invalid rule name"))
244 (setq token
(ebnf-ebx-lex))
245 (or (eq token
'production
)
246 (error "Invalid rule: missing `::='"))
247 (setq elements
(ebnf-ebx-expression))
248 (or (memq (car elements
) '(end-of-rule end-of-input
))
249 (error "Invalid rule: there is no end of rule"))
250 (setq elements
(cdr elements
))
251 (ebnf-eps-add-production name
)
253 (ebnf-make-production name elements action
))))
256 ;; expression ::= concatenation ('|' concatenation)*
;; Parse an expression.  Each loop iteration reads a fresh token with
;; `ebnf-ebx-lex', parses one concatenation from it, and tests the car of
;; the result with `eq'; while the test holds, the cdr of the result is
;; pushed onto BODY.  The last concatenation result and the accumulated
;; BODY are then handed to `ebnf-token-alternative'.
;; NOTE(review): the symbol the car is compared against is on a line not
;; visible in this view (presumably `alternative', given the grammar
;; comment above) -- confirm.
259 (defun ebnf-ebx-expression ()
260 (let (body concatenation
)
261 (while (eq (car (setq concatenation
262 (ebnf-ebx-concatenation (ebnf-ebx-lex))))
264 (setq body
(cons (cdr concatenation
) body
)))
265 (ebnf-token-alternative body concatenation
)))
268 ;; concatenation ::= exception*
;; Parse a concatenation starting at TOKEN.  The first element comes from
;; `ebnf-ebx-exception'; TOKEN is replaced by the car of that result and
;; "Empty element" is signalled when the enclosing `or' finds nothing.
;; Further elements are read in a `while' loop and pushed onto SEQ, which
;; is finally handed to `ebnf-token-sequence'.
;; NOTE(review): the binding of `seq', the remaining assignments inside the
;; `setq' forms, and the form wrapping the final call are not visible in
;; this view.
271 (defun ebnf-ebx-concatenation (token)
272 (let ((term (ebnf-ebx-exception token
))
274 (or (setq token
(car term
)
276 (error "Empty element"))
277 (setq seq
(cons term seq
))
278 (while (setq term
(ebnf-ebx-exception token
)
281 (setq seq
(cons term seq
)))
283 (ebnf-token-sequence seq
))))
286 ;;; exception ::= term ('-' term)?
;; Parse `term ('-' term)?'.  A term is parsed from TOKEN; when the car of
;; its result is `exception', a second term is parsed from the next lexed
;; token and the cdrs of both results are combined with `ebnf-make-except'.
;; NOTE(review): the form that wraps the `ebnf-make-except' call and the
;; branch taken when no `-' follows are not visible in this view.
289 (defun ebnf-ebx-exception (token)
290 (let ((term (ebnf-ebx-term token
)))
291 (if (eq (car term
) 'exception
)
292 (let ((except (ebnf-ebx-term (ebnf-ebx-lex))))
294 (ebnf-make-except (cdr term
) (cdr except
))))
299 ;;; term ::= factor ('*' | '+' | '?')?
;; Parse `factor ('*' | '+' | '?')?'.  A factor is parsed from TOKEN, then
;; the next token is read.  If that token is `zero-or-more', `one-or-more'
;; or `optional', FACTOR is wrapped with `ebnf-make-zero-or-more',
;; `ebnf-make-one-or-more' or `ebnf-token-optional' respectively and one
;; more token is read.  The result is the cons (TOKEN . FACTOR), i.e. the
;; lookahead token paired with the parsed node.
;; NOTE(review): the closing parentheses after the `cond' suggest an
;; enclosing form on a line not visible in this view -- confirm.
302 (defun ebnf-ebx-term (token)
303 (let ((factor (ebnf-ebx-factor token
)))
305 (setq token
(ebnf-ebx-lex))
306 (cond ((eq token
'zero-or-more
)
307 (setq factor
(ebnf-make-zero-or-more factor
)
308 token
(ebnf-ebx-lex)))
309 ((eq token
'one-or-more
)
310 (setq factor
(ebnf-make-one-or-more factor
)
311 token
(ebnf-ebx-lex)))
312 ((eq token
'optional
)
313 (setq factor
(ebnf-token-optional factor
)
314 token
(ebnf-ebx-lex)))))
315 (cons token factor
)))
318 ;;; factor ::= hex-char+
319 ;;; | '[' '^'? ( char ( '-' char )? )+ ']'
320 ;;; | '"' 'string' '"'
321 ;;; | "'" "string" "'"
322 ;;; | '(' expression ')'
325 ;;; symbol ::= 'upper or lower case letter'
326 ;;; ('upper or lower case letter' | '-' | '_')*
327 ;;; /* upper and lower 8-bit accentuated characters are included */
329 ;;; hex-char ::= '#x' [0-9A-Fa-f]+
331 ;;; char ::= hex-char | 'any character except control characters'
332 ;;; /* 8-bit accentuated characters are included */
334 ;;; any-char ::= char | 'newline' | 'tab'
;; Parse a factor, dispatching on TOKEN.  Clauses visible in this view:
;;   - `terminal'     -> `ebnf-make-terminal' on the variable `ebnf-ebx-lex';
;;   - `non-terminal' -> `ebnf-make-non-terminal' on the same variable;
;;   - `begin-group'  -> parse a nested expression and require its car to
;;                       be `end-group', else signal "Missing `)'".
;; NOTE(review): the head of the dispatch form, the rest of the
;; `begin-group' clause and any fall-through clause are not visible here.
337 (defun ebnf-ebx-factor (token)
340 ((eq token
'terminal
)
341 (ebnf-make-terminal ebnf-ebx-lex
))
343 ((eq token
'non-terminal
)
344 (ebnf-make-non-terminal ebnf-ebx-lex
))
346 ((eq token
'begin-group
)
347 (let ((body (ebnf-ebx-expression)))
348 (or (eq (car body
) 'end-group
)
349 (error "Missing `)'"))
357 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; 256-entry vector indexed by character code.  Every slot starts out as
;; `error'; `ebnf-ebx-initialize' overwrites the slots of accepted
;; characters, and the lexer looks tokens up here with `aref' on
;; `following-char'.
361 (defconst ebnf-ebx-token-table
(make-vector 256 'error
)
362 "Vector used to map characters to a lexical token.")
;; Populate `ebnf-ebx-token-table'.  Three `while' loops mark runs of
;; character codes as `non-terminal'; per the section comments below they
;; cover A-Z, a-z and the European 8-bit range, with exclusive upper
;; bounds \133, \173 and \400.  The `aset' calls that follow override
;; single characters with their own token symbols: line ends, blanks, form
;; feed, `#', both quote characters, parentheses, `-', `:', `[', `|', `*',
;; `+', `?' and `/'.
;; NOTE(review): the forms that bind `char' and set its starting value
;; before each loop are not visible in this view.
365 (defun ebnf-ebx-initialize ()
366 "Initialize EBNFX token table."
367 ;; control character & control 8-bit character are set to `error'
369 ;; printable character: A-Z
370 (while (< char ?
\133)
371 (aset ebnf-ebx-token-table char
'non-terminal
)
372 (setq char
(1+ char
)))
373 ;; printable character: a-z
375 (while (< char ?
\173)
376 (aset ebnf-ebx-token-table char
'non-terminal
)
377 (setq char
(1+ char
)))
378 ;; European 8-bit accentuated characters:
380 (while (< char ?
\400)
381 (aset ebnf-ebx-token-table char
'non-terminal
)
382 (setq char
(1+ char
)))
383 ;; Override end of line characters:
384 (aset ebnf-ebx-token-table ?
\n 'end-of-rule
) ; [NL] linefeed
385 (aset ebnf-ebx-token-table ?
\r 'end-of-rule
) ; [CR] carriage return
386 ;; Override space characters:
387 (aset ebnf-ebx-token-table ?
\013 'space
) ; [VT] vertical tab
388 (aset ebnf-ebx-token-table ?
\t 'space
) ; [HT] horizontal tab
389 (aset ebnf-ebx-token-table ?\
'space
) ; [SP] space
390 ;; Override form feed character:
391 (aset ebnf-ebx-token-table ?
\f 'form-feed
) ; [FF] form feed
392 ;; Override other lexical characters:
393 (aset ebnf-ebx-token-table ?
# 'hash
)
394 (aset ebnf-ebx-token-table ?
\" 'double-quote
)
395 (aset ebnf-ebx-token-table ?
\' 'single-quote
)
396 (aset ebnf-ebx-token-table ?\
( 'begin-group
)
397 (aset ebnf-ebx-token-table ?\
) 'end-group
)
398 (aset ebnf-ebx-token-table ?-
'exception
)
399 (aset ebnf-ebx-token-table ?
: 'colon
)
400 (aset ebnf-ebx-token-table ?\
[ 'begin-square
)
401 (aset ebnf-ebx-token-table ?|
'alternative
)
402 (aset ebnf-ebx-token-table ?
* 'zero-or-more
)
403 (aset ebnf-ebx-token-table ?
+ 'one-or-more
)
404 (aset ebnf-ebx-token-table ?
\? 'optional
)
405 ;; Override comment character:
406 (aset ebnf-ebx-token-table ?
/ 'comment
)))
409 ;; replace the range "\240-\377" (see `ebnf-range-regexp').
;; Character-set strings for rule names, each produced by
;; `ebnf-range-regexp' from a base set plus the range \240-\377.  The first
;; set also admits `-' and `_' and is the one the lexer passes to
;; `ebnf-buffer-substring' when it reads a non-terminal; the second holds
;; letters only (no use of it is visible in this view).
410 (defconst ebnf-ebx-non-terminal-chars
411 (ebnf-range-regexp "-_A-Za-z" ?
\240 ?
\377))
412 (defconst ebnf-ebx-non-terminal-letter-chars
413 (ebnf-range-regexp "A-Za-z" ?
\240 ?
\377))
;; Lexer.  Visible structure:
;;   1. A check of point against `ebnf-limit' guards the whole body.
;;   2. A skip loop classifies `following-char' through
;;      `ebnf-ebx-token-table' (character codes above 255 are handled by a
;;      separate branch), then: skips blanks with `skip-chars-forward',
;;      calls `ebnf-ebx-skip-comment', records a form feed by setting
;;      `ebnf-action' to `form-feed', calls `ebnf-ebx-skip-end-of-rule',
;;      or calls `ebnf-ebx-skip-constraint' when a `[' is followed,
;;      case-insensitively, by "wfc:" or "vc:".
;;   3. A dispatch on the resulting token stores terminal text in the
;;      variable `ebnf-ebx-lex' via `ebnf-ebx-character',
;;      `ebnf-ebx-string' (for `"' and `'') or `ebnf-ebx-range', and reads
;;      non-terminal text with `ebnf-buffer-substring'.
;; Errors visible here: "Invalid character" and "Missing `::=' token".
;; NOTE(review): many lines are absent from this view, including the heads
;; of several clauses (for example the one that owns the `looking-at'
;; "::=" check), the binding of `token', and the value each clause
;; returns -- confirm against the complete file.
416 (defun ebnf-ebx-lex ()
417 "Lexical analyzer for EBNFX.
419 Return a lexical token.
421 See documentation for variable `ebnf-ebx-lex'."
422 (if (>= (point) ebnf-limit
)
425 ;; skip spaces and comments
426 (while (if (> (following-char) 255)
430 (setq token
(aref ebnf-ebx-token-table
(following-char)))
433 (skip-chars-forward " \013\t" ebnf-limit
)
434 (< (point) ebnf-limit
))
436 (ebnf-ebx-skip-comment))
437 ((eq token
'form-feed
)
439 (setq ebnf-action
'form-feed
))
440 ((eq token
'end-of-rule
)
441 (ebnf-ebx-skip-end-of-rule))
442 ((and (eq token
'begin-square
)
443 (let ((case-fold-search t
))
444 (looking-at "\\[\\(wfc\\|vc\\):")))
445 (ebnf-ebx-skip-constraint))
450 ((>= (point) ebnf-limit
)
454 (error "Invalid character"))
456 ((eq token
'end-of-rule
)
458 ;; terminal: #x [0-9A-Fa-f]+
460 (setq ebnf-ebx-lex
(ebnf-ebx-character))
462 ;; terminal: "string"
463 ((eq token
'double-quote
)
464 (setq ebnf-ebx-lex
(ebnf-ebx-string ?
\"))
466 ;; terminal: 'string'
467 ((eq token
'single-quote
)
468 (setq ebnf-ebx-lex
(ebnf-ebx-string ?
\'))
470 ;; terminal: [ ^? ( char ( - char )? )+ ]
471 ((eq token
'begin-square
)
472 (setq ebnf-ebx-lex
(ebnf-ebx-range))
474 ;; non-terminal: NAME
475 ((eq token
'non-terminal
)
477 (ebnf-buffer-substring ebnf-ebx-non-terminal-chars
))
481 (or (looking-at "::=")
482 (error "Missing `::=' token"))
485 ;; miscellaneous: (, ), *, +, ?, |, -
492 ;; replace the range "\177-\237" (see `ebnf-range-regexp').
;; Character set used by `ebnf-ebx-skip-constraint' with
;; `skip-chars-forward'.  The base string is a negated set (leading `^')
;; listing the control ranges \000-\010 and \016-\037 plus `]', so the
;; skip stops at the closing bracket; \177-\237 is supplied separately
;; through `ebnf-range-regexp'.
493 (defconst ebnf-ebx-constraint-chars
494 (ebnf-range-regexp "^\000-\010\016-\037]" ?
\177 ?
\237))
;; Skip a `[wfc: ...]' / `[vc: ...]' constraint.  The skip over
;; `ebnf-ebx-constraint-chars' must advance by at least one character,
;; else "Invalid character"; the character after the skipped run must be
;; `]', else "Missing end of constraint `]'".
;; NOTE(review): the forms after the second check (consuming `]' and the
;; return value) are not visible in this view.
497 (defun ebnf-ebx-skip-constraint ()
498 (or (> (skip-chars-forward ebnf-ebx-constraint-chars ebnf-limit
) 0)
499 (error "Invalid character"))
500 (or (= (following-char) ?\
])
501 (error "Missing end of constraint `]'"))
;; Decide whether a run of line ends terminates the current rule.  EOR-P is
;; set from an `or' whose first test is true when `skip-chars-forward'
;; over "\r\n" advanced by more than one character (two or more
;; consecutive line ends).  Blanks are then skipped, and a following `/'
;; is handed to `ebnf-ebx-skip-comment'.
;; NOTE(review): the binding of `eor-p', the loop or form enclosing the
;; blank/comment skipping, and the return value are not visible here.
507 (defun ebnf-ebx-skip-end-of-rule ()
510 ;; end of rule ==> 2 or more consecutive end of lines
511 (setq eor-p
(or (> (skip-chars-forward "\r\n" ebnf-limit
) 1)
514 (skip-chars-forward " \013\t" ebnf-limit
)
516 (and (= (following-char) ?
/)
517 (ebnf-ebx-skip-comment))))
521 ;; replace the range "\177-\237" (see `ebnf-range-regexp').
;; Negated character sets for scanning inside `/* ... */'.  Both exclude
;; `*' so that a skip stops at a possible comment terminator.  The comment
;; set excludes the control ranges \000-\010 and \016-\037; the file-name
;; set excludes all of \000-\037.  \177-\237 is supplied separately
;; through `ebnf-range-regexp'.
522 (defconst ebnf-ebx-comment-chars
523 (ebnf-range-regexp "^\000-\010\016-\037\\*" ?
\177 ?
\237))
524 (defconst ebnf-ebx-filename-chars
525 (ebnf-range-regexp "^\000-\037\\*" ?
\177 ?
\237))
;; Skip a `/* ... */' comment.  The character at point must be `*', else
;; "Invalid beginning of comment".  When `ebnf-eps-executing' is non-nil,
;; the next character selects an EPS action, each taking the result of
;; `ebnf-ebx-eps-filename':
;;   `['  -> `ebnf-eps-add-context'
;;   `]'  -> `ebnf-eps-remove-context'
;;   `H'  -> `ebnf-eps-header-comment'
;;   `F'  -> `ebnf-eps-footer-comment'
;; Otherwise `ebnf-action' is looked up in `ebnf-comment-table' for that
;; character.  The scan then repeatedly skips `ebnf-ebx-comment-chars' and
;; requires a `*' ("Missing end of comment"), continuing while the
;; following character is not `/' and point is below `ebnf-limit';
;; reaching `ebnf-limit' signals "Missing end of comment" again.
;; NOTE(review): the `forward-char' calls, the loop head and the return
;; value are on lines not visible in this view.
528 (defun ebnf-ebx-skip-comment ()
530 (or (= (following-char) ?
*)
531 (error "Invalid beginning of comment"))
535 ((and ebnf-eps-executing
(= (following-char) ?\
[))
536 (ebnf-eps-add-context (ebnf-ebx-eps-filename)))
538 ((and ebnf-eps-executing
(= (following-char) ?\
]))
539 (ebnf-eps-remove-context (ebnf-ebx-eps-filename)))
541 ((and ebnf-eps-executing
(= (following-char) ?H
))
542 (ebnf-eps-header-comment (ebnf-ebx-eps-filename)))
544 ((and ebnf-eps-executing
(= (following-char) ?F
))
545 (ebnf-eps-footer-comment (ebnf-ebx-eps-filename)))
546 ;; any other action in comment
548 (setq ebnf-action
(aref ebnf-comment-table
(following-char))))
551 (skip-chars-forward ebnf-ebx-comment-chars ebnf-limit
)
552 (or (= (following-char) ?
*)
553 (error "Missing end of comment"))
555 (and (/= (following-char) ?
/)
556 (< (point) ebnf-limit
))))
557 ;; check for a valid end of comment
558 (and (>= (point) ebnf-limit
)
559 (error "Missing end of comment"))
;; Read the EPS file name that follows an action character inside a
;; comment.  Text is collected with `ebnf-buffer-substring' over
;; `ebnf-ebx-filename-chars', which stops at `*'.  Runs of `*' are counted
;; into NCHAR; while such a run is not followed by `/' (and point is below
;; `ebnf-limit') the stars are appended to FNAME as part of the name.
;; After the loop, a run that is followed by `/' has NCHAR reduced by one
;; before the remaining stars are appended, so the `*' of the closing `*/'
;; is left out of the name.
;; NOTE(review): the bindings of `fname' and `nchar', the loop head, and
;; the branch taken when NCHAR is nil or 0 are not visible in this view.
564 (defun ebnf-ebx-eps-filename ()
570 (ebnf-buffer-substring ebnf-ebx-filename-chars
)))
571 (and (< (point) ebnf-limit
)
572 (> (setq nchar
(skip-chars-forward "*" ebnf-limit
)) 0)
573 (< (point) ebnf-limit
)
574 (/= (following-char) ?
/)))
575 (setq fname
(concat fname
(make-string nchar ?
*))
577 (if (or (not nchar
) (= nchar
0))
579 (and (< (point) ebnf-limit
)
580 (= (following-char) ?
/)
581 (setq nchar
(1- nchar
)))
582 (concat fname
(make-string nchar ?
*)))))
585 ;; replace the range "\240-\377" (see `ebnf-range-regexp').
;; Characters allowed inside string literals, used by `ebnf-ebx-string'.
;; The double-quoted set is tab plus space through `~' with `"' left out
;; (" -!" then "#-~"); the single-quoted set leaves out `'' instead
;; (" -&" then "(-~").  \240-\377 is supplied separately through
;; `ebnf-range-regexp'.
586 (defconst ebnf-ebx-double-string-chars
587 (ebnf-range-regexp "\t -!#-~" ?
\240 ?
\377))
588 (defconst ebnf-ebx-single-string-chars
589 (ebnf-range-regexp "\t -&(-~" ?
\240 ?
\377))
;; Read a quoted string delimited by DELIM and return its text via
;; `buffer-substring-no-properties'.  The body is skipped with
;; `ebnf-ebx-double-string-chars' when DELIM is `"' and with
;; `ebnf-ebx-single-string-chars' otherwise; the character after the
;; skipped run must be DELIM, else "Missing string delimiter `%c'".
;; NOTE(review): the forms that compute the substring bounds and step over
;; the delimiters are not visible in this view.
592 (defun ebnf-ebx-string (delim)
593 (buffer-substring-no-properties
598 (skip-chars-forward (if (= delim ?
\")
599 ebnf-ebx-double-string-chars
600 ebnf-ebx-single-string-chars
)
602 (or (= (following-char) delim
)
603 (error "Missing string delimiter `%c'" delim
))
;; Read a `#xN' hex-character terminal: `ebnf-ebx-hex-character' advances
;; over it and the text is returned via `buffer-substring-no-properties'.
;; NOTE(review): the forms that supply the substring bounds are not
;; visible in this view.
609 (defun ebnf-ebx-character ()
611 (buffer-substring-no-properties
614 (ebnf-ebx-hex-character)
;; Read a bracketed character range and return its text via
;; `buffer-substring-no-properties'.  Visible steps: test for a leading
;; `^' and then for a leading `-'; then repeatedly read one character with
;; `ebnf-ebx-any-character', reading a second one when a `-' follows,
;; while the following character is not `]' and point is below
;; `ebnf-limit'.  Reaching `ebnf-limit' signals
;; "Missing end of character range `]'".
;; NOTE(review): the actions taken on `^' and on a leading `-', the loop
;; head and the substring bounds are not visible in this view.
618 (defun ebnf-ebx-range ()
619 ;; [ ^? ( char ( - char )? )+ ]
620 (buffer-substring-no-properties
624 (and (= (following-char) ?^
)
626 (and (= (following-char) ?-
)
629 (ebnf-ebx-any-character)
630 (when (= (following-char) ?-
)
632 (ebnf-ebx-any-character))
633 (and (/= (following-char) ?\
])
634 (< (point) ebnf-limit
))))
635 (and (>= (point) ebnf-limit
)
636 (error "Missing end of character range `]'"))
;; Accept one character inside a bracketed range.  One clause delegates to
;; `ebnf-ebx-hex-character' with NO-ERROR set to t.  A plain character is
;; accepted when it lies in one of the listed intervals: space through `"',
;; `$' through `,', `.' through `\', `^' through `~', or \240 through
;; \377.  The trailing comments name the character each gap leaves out:
;; `#', `-' and `]'.  Anything else signals "Invalid character `%c'".
;; NOTE(review): the dispatch head, the test guarding the hex clause and
;; the action taken on an accepted character are not visible in this view.
641 (defun ebnf-ebx-any-character ()
642 (let ((char (following-char)))
644 (ebnf-ebx-hex-character t
))
645 ((or (and (<= ?\ char
) (<= char ?
\")) ; #
646 (and (<= ?$ char
) (<= char ?
,)) ; -
647 (and (<= ?. char
) (<= char ?
\\)) ; ]
648 (and (<= ?^ char
) (<= char ?~
))
649 (and (<= ?
\240 char
) (<= char ?
\377)))
652 (error "Invalid character `%c'" char
)))))
;; Advance over a hex-character.  The following character is compared
;; with `x'; the mismatch path contains an "Invalid hexadecimal character"
;; error.  On the matching path `skip-chars-forward' must consume at least
;; one of "0-9A-Fa-f", else the same error is signalled.
;; NOTE(review): how the optional NO-ERROR argument alters the mismatch
;; path, and the forms that step over `#' and `x', are on lines not
;; visible in this view.
655 (defun ebnf-ebx-hex-character (&optional no-error
)
658 (if (/= (following-char) ?x
)
660 (error "Invalid hexadecimal character"))
662 (or (> (skip-chars-forward "0-9A-Fa-f" ebnf-limit
) 0)
663 (error "Invalid hexadecimal character"))))
666 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
671 ;;; ebnf-ebx.el ends here