1 ;;; ebnf-ebx.el --- parser for EBNF used to specify XML (EBNFX)
3 ;; Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006
4 ;; Free Software Foundation, Inc.
6 ;; Author: Vinicius Jose Latorre <viniciusjl@ig.com.br>
7 ;; Maintainer: Vinicius Jose Latorre <viniciusjl@ig.com.br>
8 ;; Time-stamp: <2004/04/03 16:45:34 vinicius>
9 ;; Keywords: wp, ebnf, PostScript
12 ;; This file is part of GNU Emacs.
14 ;; GNU Emacs is free software; you can redistribute it and/or modify
15 ;; it under the terms of the GNU General Public License as published by
16 ;; the Free Software Foundation; either version 2, or (at your option)
19 ;; GNU Emacs is distributed in the hope that it will be useful,
20 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
21 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 ;; GNU General Public License for more details.
24 ;; You should have received a copy of the GNU General Public License
25 ;; along with GNU Emacs; see the file COPYING. If not, write to the
26 ;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
27 ;; Boston, MA 02110-1301, USA.
31 ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
34 ;; This is part of ebnf2ps package.
36 ;; This package defines a parser for EBNF used to specify XML (EBNFX).
38 ;; See ebnf2ps.el for documentation.
45 ;; `http://www.w3.org/TR/2004/REC-xml-20040204/#sec-notation'
46 ;; (Extensible Markup Language (XML) 1.0 (Third Edition))
49 ;; rule ::= symbol '::=' expression
50 ;; /* rules are separated by at least one blank line. */
52 ;; expression ::= concatenation ('|' concatenation)*
54 ;; concatenation ::= exception*
56 ;; exception ::= term ('-' term)?
58 ;; term ::= factor ('*' | '+' | '?')?
60 ;; factor ::= hex-char+
61 ;; | '[' '^'? ( char ( '-' char )? )+ ']'
64 ;; | '(' expression ')'
67 ;; symbol ::= 'upper or lower case letter'
68 ;; ('upper or lower case letter' | '-' | '_')*
69 ;; /* upper and lower 8-bit accentuated characters are included */
71 ;; hex-char ::= '#x' [0-9A-Fa-f]+
73 ;; char ::= hex-char | 'any character except control characters'
74 ;; /* 8-bit accentuated characters are included */
76 ;; any-char ::= char | 'newline' | 'tab'
78 ;; ignore ::= '[' ('wfc' | 'WFC' | 'vc' | 'VC') ':' ( any-char - ']' )* ']'
80 ;; comment ::= '/*' ( any-char - '*/' )* '*/'
83 ;; Below is the Notation section extracted from the URL cited above.
87 ;; The formal grammar of XML is given in this specification using a simple
88 ;; Extended Backus-Naur Form (EBNF) notation. Each rule in the grammar defines
89 ;; one symbol, in the form
91 ;; symbol ::= expression
93 ;; Symbols are written with an initial capital letter if they are the start
94 ;; symbol of a regular language, otherwise with an initial lowercase letter.
95 ;; Literal strings are quoted.
97 ;; Within the expression on the right-hand side of a rule, the following
98 ;; expressions are used to match strings of one or more characters:
102 ;; where N is a hexadecimal integer, the expression matches the character
103 ;; whose number (code point) in ISO/IEC 10646 is N. The number of leading
104 ;; zeros in the #xN form is insignificant.
106 ;; [a-zA-Z], [#xN-#xN]
108 ;; matches any Char with a value in the range(s) indicated (inclusive).
110 ;; [abc], [#xN#xN#xN]
112 ;; matches any Char with a value among the characters enumerated.
113 ;; Enumerations and ranges can be mixed in one set of brackets.
115 ;; [^a-z], [^#xN-#xN]
117 ;; matches any Char with a value outside the range indicated.
119 ;; [^abc], [^#xN#xN#xN]
121 ;; matches any Char with a value not among the characters given.
122 ;; Enumerations and ranges of forbidden values can be mixed in one set of
127 ;; matches a literal string matching that given inside the double quotes.
131 ;; matches a literal string matching that given inside the single quotes.
133 ;; These symbols may be combined to match more complex patterns as follows,
134 ;; where A and B represent simple expressions:
138 ;; expression is treated as a unit and may be combined as described in this
143 ;; matches A or nothing; optional A.
147 ;; matches A followed by B. This operator has higher precedence than
148 ;; alternation; thus A B | C D is identical to (A B) | (C D).
156 ;; matches any string that matches A but does not match B.
160 ;; matches one or more occurrences of A. Concatenation has higher
161 ;; precedence than alternation; thus A+ | B+ is identical to (A+) | (B+).
165 ;; matches zero or more occurrences of A. Concatenation has higher
166 ;; precedence than alternation; thus A* | B* is identical to (A*) | (B*).
168 ;; Other notations used in the productions are:
176 ;; well-formedness constraint; this identifies by name a constraint on
177 ;; well-formed documents associated with a production.
181 ;; validity constraint; this identifies by name a constraint on valid
182 ;; documents associated with a production.
185 ;; Differences Between EBNFX And ebnf2ps EBNFX
186 ;; -------------------------------------------
188 ;; Besides the characters that EBNFX accepts, ebnf2ps EBNFX also accepts the
189 ;; underscore (_) and minus (-) in rule names, and European 8-bit accented
190 ;; characters (from \240 to \377) in rule names, strings and comments.  Also,
191 ;; a rule name can start with an upper case letter.
194 ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Shares its name with the function `ebnf-ebx-lex'.  That function stores
;; here the text of the token it has just scanned (for example the string
;; returned by `ebnf-ebx-string' or `ebnf-ebx-range'); the parser reads it
;; when it needs the name of a rule, terminal or non-terminal.
202 (defvar ebnf-ebx-lex nil
203 "Value returned by `ebnf-ebx-lex' function.")
206 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
207 ;; Syntactic analyzer
210 ;;; rulelist ::= rule+
;; Parse a list of rules (rulelist ::= rule+).
;; START is combined with `ebnf-limit' to compute TOTAL, the number of
;; characters in the region; TOTAL is the divisor of the progress
;; percentage computed inside the loop.
;; Signals "Invalid EBNFX file format" when the very first token read is
;; `end-of-input'.
212 (defun ebnf-ebx-parser (start)
214 (let ((total (+ (- ebnf-limit start
) 1))
217 rule-list token rule
)
;; read the first token
219 (setq token
(ebnf-ebx-lex))
;; input with no token at all is rejected
220 (and (eq token
'end-of-input
)
221 (error "Invalid EBNFX file format"))
;; a leading `end-of-rule' is skipped by reading one more token
222 (and (eq token
'end-of-rule
)
223 (setq token
(ebnf-ebx-lex)))
;; consume rules until the lexer reports `end-of-input'
224 (while (not (eq token
'end-of-input
))
;; percentage of the region consumed so far
;; NOTE(review): the binding of `bias' is not visible here -- confirm
227 (/ (* (- (point) bias
) 100.0) total
))
228 (setq token
(ebnf-ebx-rule token
)
;; RULE is pushed onto RULE-LIST only when `ebnf-add-empty-rule-list'
;; returns nil for it
231 (or (ebnf-add-empty-rule-list rule
)
232 (setq rule-list
(cons rule rule-list
))))
237 ;;; rule ::= symbol '::=' expression
;; Parse one rule (rule ::= symbol '::=' expression).
;; TOKEN is the token already read by the caller; it must be `non-terminal'.
;; Signals an error when the rule name is invalid, when `::=' is missing,
;; or when the expression is not followed by `end-of-rule' or
;; `end-of-input'.
240 (defun ebnf-ebx-rule (token)
;; the rule name is the text of the token just scanned
241 (let ((name ebnf-ebx-lex
)
;; reset any action left over from a previous rule
244 (setq ebnf-action nil
)
245 (or (eq token
'non-terminal
)
246 (error "Invalid rule name"))
;; the token after the rule name must be `production' (the `::=' sign)
247 (setq token
(ebnf-ebx-lex))
248 (or (eq token
'production
)
249 (error "Invalid rule: missing `::='"))
;; `ebnf-ebx-expression' returns a cons whose car is the token that
;; ended the expression
250 (setq elements
(ebnf-ebx-expression))
251 (or (memq (car elements
) '(end-of-rule end-of-input
))
252 (error "Invalid rule: there is no end of rule"))
;; drop the terminating token, keeping only the parsed expression
253 (setq elements
(cdr elements
))
254 (ebnf-eps-add-production name
)
;; NOTE(review): the binding of `action' is not visible here -- confirm
256 (ebnf-make-production name elements action
))))
259 ;; expression ::= concatenation ('|' concatenation)*
;; Parse an expression (expression ::= concatenation ('|' concatenation)*).
;; Each call to `ebnf-ebx-concatenation' yields a cons; while the loop test
;; on its car succeeds, its cdr is accumulated onto BODY.  The accumulated
;; BODY and the last CONCATENATION are handed to `ebnf-token-alternative'.
262 (defun ebnf-ebx-expression ()
263 (let (body concatenation
)
;; NOTE(review): the symbol the car is compared against is not visible
;; here; presumably `alternative' -- confirm
264 (while (eq (car (setq concatenation
265 (ebnf-ebx-concatenation (ebnf-ebx-lex))))
267 (setq body
(cons (cdr concatenation
) body
)))
268 (ebnf-token-alternative body concatenation
)))
271 ;; concatenation ::= exception*
;; Parse a concatenation (concatenation ::= exception*).
;; TOKEN is the first token of the concatenation.  Every element parsed by
;; `ebnf-ebx-exception' is pushed onto SEQ, which is finally passed to
;; `ebnf-token-sequence'.
;; Signals "Empty element" when the first element yields no token.
274 (defun ebnf-ebx-concatenation (token)
275 (let ((term (ebnf-ebx-exception token
))
;; the car of TERM is the next token to process
277 (or (setq token
(car term
)
279 (error "Empty element"))
280 (setq seq
(cons term seq
))
;; keep collecting elements for as long as the loop test yields non-nil
281 (while (setq term
(ebnf-ebx-exception token
)
284 (setq seq
(cons term seq
)))
286 (ebnf-token-sequence seq
))))
289 ;;; exception ::= term ('-' term)?
;; Parse an exception (exception ::= term ('-' term)?).
;; TOKEN is the first token of the left-hand term.
292 (defun ebnf-ebx-exception (token)
293 (let ((term (ebnf-ebx-term token
)))
;; the car of TERM is the token that followed it; `exception' is the
;; token for `-', so a second term is parsed after it
294 (if (eq (car term
) 'exception
)
295 (let ((except (ebnf-ebx-term (ebnf-ebx-lex))))
;; combine the parsed parts (the cdrs) of both terms
297 (ebnf-make-except (cdr term
) (cdr except
))))
302 ;;; term ::= factor ('*' | '+' | '?')?
;; Parse a term (term ::= factor ('*' | '+' | '?')?).
;; TOKEN is the first token of the factor.  The last visible form builds
;; the cons (TOKEN . FACTOR), where TOKEN is the token following the term.
305 (defun ebnf-ebx-term (token)
306 (let ((factor (ebnf-ebx-factor token
)))
;; look at the token after the factor for a repetition/option suffix
308 (setq token
(ebnf-ebx-lex))
;; each suffix wraps FACTOR in the matching node and reads one more token
309 (cond ((eq token
'zero-or-more
)
310 (setq factor
(ebnf-make-zero-or-more factor
)
311 token
(ebnf-ebx-lex)))
312 ((eq token
'one-or-more
)
313 (setq factor
(ebnf-make-one-or-more factor
)
314 token
(ebnf-ebx-lex)))
315 ((eq token
'optional
)
316 (setq factor
(ebnf-token-optional factor
)
317 token
(ebnf-ebx-lex)))))
318 (cons token factor
)))
321 ;;; factor ::= hex-char+
322 ;;; | '[' '^'? ( char ( '-' char )? )+ ']'
323 ;;; | '"' 'string' '"'
324 ;;; | "'" "string" "'"
325 ;;; | '(' expression ')'
328 ;;; symbol ::= 'upper or lower case letter'
329 ;;; ('upper or lower case letter' | '-' | '_')*
330 ;;; /* upper and lower 8-bit accentuated characters are included */
332 ;;; hex-char ::= '#x' [0-9A-Fa-f]+
334 ;;; char ::= hex-char | 'any character except control characters'
335 ;;; /* 8-bit accentuated characters are included */
337 ;;; any-char ::= char | 'newline' | 'tab'
;; Parse a factor, dispatching on TOKEN.  The visible clauses handle:
;;   `terminal'     -> `ebnf-make-terminal' on the scanned token text
;;   `non-terminal' -> `ebnf-make-non-terminal' on the scanned token text
;;   `begin-group'  -> a parenthesized expression, which must be closed by
;;                     `end-group' or "Missing `)'" is signaled
;; NOTE(review): the remainder of this definition is not visible here.
340 (defun ebnf-ebx-factor (token)
343 ((eq token
'terminal
)
344 (ebnf-make-terminal ebnf-ebx-lex
))
346 ((eq token
'non-terminal
)
347 (ebnf-make-non-terminal ebnf-ebx-lex
))
349 ((eq token
'begin-group
)
;; the car of BODY is the token that ended the inner expression
350 (let ((body (ebnf-ebx-expression)))
351 (or (eq (car body
) 'end-group
)
352 (error "Missing `)'"))
360 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; 256-entry vector indexed by character code.  Every slot starts out as
;; `error'; `ebnf-ebx-initialize' overwrites the slots of the characters
;; that are meaningful to the lexer.
364 (defconst ebnf-ebx-token-table
(make-vector 256 'error
)
365 "Vector used to map characters to a lexical token.")
;; Fill `ebnf-ebx-token-table' in place: letter ranges are first mapped to
;; `non-terminal', then individual characters are overridden with their
;; own token symbols.
;; NOTE(review): the forms that give `char' its starting value before each
;; loop are not visible here.
368 (defun ebnf-ebx-initialize ()
369 "Initialize EBNFX token table."
370 ;; control character & control 8-bit character are set to `error'
372 ;; printable character: A-Z
;; the loop bound \133 (octal) is one past ?Z (\132)
373 (while (< char ?
\133)
374 (aset ebnf-ebx-token-table char
'non-terminal
)
375 (setq char
(1+ char
)))
376 ;; printable character: a-z
;; the loop bound \173 (octal) is one past ?z (\172)
378 (while (< char ?
\173)
379 (aset ebnf-ebx-token-table char
'non-terminal
)
380 (setq char
(1+ char
)))
381 ;; European 8-bit accentuated characters:
;; the loop bound \400 (octal) is 256, one past the last table index
383 (while (< char ?
\400)
384 (aset ebnf-ebx-token-table char
'non-terminal
)
385 (setq char
(1+ char
)))
386 ;; Override end of line characters:
387 (aset ebnf-ebx-token-table ?
\n 'end-of-rule
) ; [NL] linefeed
388 (aset ebnf-ebx-token-table ?
\r 'end-of-rule
) ; [CR] carriage return
389 ;; Override space characters:
390 (aset ebnf-ebx-token-table ?
\013 'space
) ; [VT] vertical tab
391 (aset ebnf-ebx-token-table ?
\t 'space
) ; [HT] horizontal tab
392 (aset ebnf-ebx-token-table ?\
'space
) ; [SP] space
393 ;; Override form feed character:
394 (aset ebnf-ebx-token-table ?
\f 'form-feed
) ; [FF] form feed
395 ;; Override other lexical characters:
396 (aset ebnf-ebx-token-table ?
# 'hash
)
397 (aset ebnf-ebx-token-table ?
\" 'double-quote
)
398 (aset ebnf-ebx-token-table ?
\' 'single-quote
)
399 (aset ebnf-ebx-token-table ?\
( 'begin-group
)
400 (aset ebnf-ebx-token-table ?\
) 'end-group
)
401 (aset ebnf-ebx-token-table ?-
'exception
)
402 (aset ebnf-ebx-token-table ?
: 'colon
)
403 (aset ebnf-ebx-token-table ?\
[ 'begin-square
)
404 (aset ebnf-ebx-token-table ?|
'alternative
)
405 (aset ebnf-ebx-token-table ?
* 'zero-or-more
)
406 (aset ebnf-ebx-token-table ?
+ 'one-or-more
)
407 (aset ebnf-ebx-token-table ?
\? 'optional
)
408 ;; Override comment character:
409 (aset ebnf-ebx-token-table ?
/ 'comment
)))
412 ;; replace the range "\240-\377" (see `ebnf-range-regexp').
;; Characters accepted inside a non-terminal name: `-', `_', ASCII letters
;; and the range \240-\377, which is handed to `ebnf-range-regexp'
;; separately.  `ebnf-ebx-lex' passes this value to `ebnf-buffer-substring'.
413 (defconst ebnf-ebx-non-terminal-chars
414 (ebnf-range-regexp "-_A-Za-z" ?
\240 ?
\377))
;; Letters only: ASCII letters and the range \240-\377, without the `-'
;; and `_' that `ebnf-ebx-non-terminal-chars' also allows.
415 (defconst ebnf-ebx-non-terminal-letter-chars
416 (ebnf-range-regexp "A-Za-z" ?
\240 ?
\377))
;; Scan the next token starting at point, never looking past `ebnf-limit'.
;; Blanks, comments, form feeds, end-of-rule markers and `[wfc:...]' /
;; `[vc:...]' constraints are handled in the skipping loop; the token is
;; then classified through `ebnf-ebx-token-table'.  For tokens that carry
;; text, that text is stored in the variable `ebnf-ebx-lex'.
;; NOTE(review): many lines of this definition are not visible here.
419 (defun ebnf-ebx-lex ()
420 "Lexical analyzer for EBNFX.
422 Return a lexical token.
424 See documentation for variable `ebnf-ebx-lex'."
425 (if (>= (point) ebnf-limit
)
428 ;; skip spaces and comments
;; characters above 255 have no slot in the 256-entry token table
429 (while (if (> (following-char) 255)
433 (setq token
(aref ebnf-ebx-token-table
(following-char)))
;; skip blanks: space, vertical tab and horizontal tab
436 (skip-chars-forward " \013\t" ebnf-limit
)
437 (< (point) ebnf-limit
))
439 (ebnf-ebx-skip-comment))
440 ((eq token
'form-feed
)
;; record the form feed as the pending action
442 (setq ebnf-action
'form-feed
))
443 ((eq token
'end-of-rule
)
444 (ebnf-ebx-skip-end-of-rule))
;; a `[' that starts `[wfc:' or `[vc:' (matched case-insensitively)
;; opens a constraint, which is skipped rather than returned
445 ((and (eq token
'begin-square
)
446 (let ((case-fold-search t
))
447 (looking-at "\\[\\(wfc\\|vc\\):")))
448 (ebnf-ebx-skip-constraint))
453 ((>= (point) ebnf-limit
)
457 (error "Invalid character"))
459 ((eq token
'end-of-rule
)
461 ;; terminal: #x [0-9A-Fa-f]+
463 (setq ebnf-ebx-lex
(ebnf-ebx-character))
465 ;; terminal: "string"
466 ((eq token
'double-quote
)
467 (setq ebnf-ebx-lex
(ebnf-ebx-string ?
\"))
469 ;; terminal: 'string'
470 ((eq token
'single-quote
)
471 (setq ebnf-ebx-lex
(ebnf-ebx-string ?
\'))
473 ;; terminal: [ ^? ( char ( - char )? )+ ]
474 ((eq token
'begin-square
)
475 (setq ebnf-ebx-lex
(ebnf-ebx-range))
477 ;; non-terminal: NAME
478 ((eq token
'non-terminal
)
480 (ebnf-buffer-substring ebnf-ebx-non-terminal-chars
))
;; the text at point must be exactly `::='
484 (or (looking-at "::=")
485 (error "Missing `::=' token"))
488 ;; miscellaneous: (, ), *, +, ?, |, -
495 ;; replace the range "\177-\237" (see `ebnf-range-regexp').
;; Negated character set used by `ebnf-ebx-skip-constraint': it excludes
;; the control characters \000-\010 and \016-\037 and the closing `]'.
;; The range \177-\237 is handed to `ebnf-range-regexp' separately.
496 (defconst ebnf-ebx-constraint-chars
497 (ebnf-range-regexp "^\000-\010\016-\037]" ?
\177 ?
\237))
;; Skip the text of a `[wfc:...]' / `[vc:...]' constraint.
;; Signals "Invalid character" when no character can be skipped, and
;; "Missing end of constraint `]'" when the skipped text is not followed
;; by `]'.
500 (defun ebnf-ebx-skip-constraint ()
;; at least one character must be accepted by `ebnf-ebx-constraint-chars'
501 (or (> (skip-chars-forward ebnf-ebx-constraint-chars ebnf-limit
) 0)
502 (error "Invalid character"))
;; the skip stops at the first excluded character, which must be `]'
503 (or (= (following-char) ?\
])
504 (error "Missing end of constraint `]'"))
;; Skip line terminators and what follows them, recording in EOR-P whether
;; a real end of rule (two or more consecutive end of lines) was seen.
;; NOTE(review): the enclosing loop and the return value are not visible
;; here.
510 (defun ebnf-ebx-skip-end-of-rule ()
513 ;; end of rule ==> 2 or more consecutive end of lines
;; `skip-chars-forward' returns the distance moved, so a result greater
;; than 1 means more than one CR/LF character was skipped
514 (setq eor-p
(or (> (skip-chars-forward "\r\n" ebnf-limit
) 1)
;; skip blanks: space, vertical tab and horizontal tab
517 (skip-chars-forward " \013\t" ebnf-limit
)
;; a `/' at point starts a comment, which is skipped as well
519 (and (= (following-char) ?
/)
520 (ebnf-ebx-skip-comment))))
524 ;; replace the range "\177-\237" (see `ebnf-range-regexp').
;; Negated character set used by `ebnf-ebx-skip-comment' to skip comment
;; text: it excludes the control characters \000-\010 and \016-\037 and
;; `*'.  The range \177-\237 is handed to `ebnf-range-regexp' separately.
525 (defconst ebnf-ebx-comment-chars
526 (ebnf-range-regexp "^\000-\010\016-\037\\*" ?
\177 ?
\237))
;; Negated character set used by `ebnf-ebx-eps-filename': it excludes all
;; control characters \000-\037 and `*'.  The range \177-\237 is handed to
;; `ebnf-range-regexp' separately.
527 (defconst ebnf-ebx-filename-chars
528 (ebnf-range-regexp "^\000-\037\\*" ?
\177 ?
\237))
;; Skip a `/* ... */' comment.
;; Signals "Invalid beginning of comment" when `*' is not found where the
;; comment opener expects it, and "Missing end of comment" when the
;; comment is not closed before `ebnf-limit'.
;; NOTE(review): several lines of this definition are not visible here.
531 (defun ebnf-ebx-skip-comment ()
533 (or (= (following-char) ?
*)
534 (error "Invalid beginning of comment"))
;; while `ebnf-eps-executing' is non-nil, a `[' at point adds the EPS
;; context named by the file name that follows ...
538 ((and ebnf-eps-executing
(= (following-char) ?\
[))
539 (ebnf-eps-add-context (ebnf-ebx-eps-filename)))
;; ... and a `]' at point removes it
541 ((and ebnf-eps-executing
(= (following-char) ?\
]))
542 (ebnf-eps-remove-context (ebnf-ebx-eps-filename)))
543 ;; any other action in comment
;; the action is looked up in `ebnf-comment-table' by the character at
;; point
545 (setq ebnf-action
(aref ebnf-comment-table
(following-char))))
;; skip comment text; `*' is excluded from `ebnf-ebx-comment-chars', so
;; the skip stops in front of it
548 (skip-chars-forward ebnf-ebx-comment-chars ebnf-limit
)
549 (or (= (following-char) ?
*)
550 (error "Missing end of comment"))
;; a `*' not followed by `/' does not close the comment
552 (and (/= (following-char) ?
/)
553 (< (point) ebnf-limit
))))
554 ;; check for a valid end of comment
555 (and (>= (point) ebnf-limit
)
556 (error "Missing end of comment"))
;; Collect the file name that follows a `[' or `]' action character inside
;; a comment (see the callers in `ebnf-ebx-skip-comment').
;; The name is built in FNAME from runs of characters accepted by
;; `ebnf-ebx-filename-chars', plus any `*' characters that do not close
;; the comment.
;; NOTE(review): several lines of this definition are not visible here.
561 (defun ebnf-ebx-eps-filename ()
567 (ebnf-buffer-substring ebnf-ebx-filename-chars
)))
;; NCHAR is the number of consecutive `*' just skipped; the test holds
;; when at least one `*' was skipped and the next character is not `/'
568 (and (< (point) ebnf-limit
)
569 (> (setq nchar
(skip-chars-forward "*" ebnf-limit
)) 0)
570 (< (point) ebnf-limit
)
571 (/= (following-char) ?
/)))
;; those `*' characters are appended to the file name
572 (setq fname
(concat fname
(make-string nchar ?
*))
574 (if (or (not nchar
) (= nchar
0))
;; when `/' follows, one `*' is not counted -- presumably because it
;; belongs to the closing `*/' (confirm)
576 (and (< (point) ebnf-limit
)
577 (= (following-char) ?
/)
578 (setq nchar
(1- nchar
)))
579 (concat fname
(make-string nchar ?
*)))))
582 ;; replace the range "\240-\377" (see `ebnf-range-regexp').
;; Characters accepted inside a double-quoted string: tab, space through
;; `!', and `#' through `~' -- printable ASCII without `"' -- plus the
;; range \240-\377, which is handed to `ebnf-range-regexp' separately.
583 (defconst ebnf-ebx-double-string-chars
584 (ebnf-range-regexp "\t -!#-~" ?
\240 ?
\377))
;; Characters accepted inside a single-quoted string: tab, space through
;; `&', and `(' through `~' -- printable ASCII without `'' -- plus the
;; range \240-\377, which is handed to `ebnf-range-regexp' separately.
585 (defconst ebnf-ebx-single-string-chars
586 (ebnf-range-regexp "\t -&(-~" ?
\240 ?
\377))
;; Scan a quoted string whose delimiter character is DELIM and return its
;; text as a string without text properties.
;; Signals "Missing string delimiter" when the accepted characters are not
;; followed by DELIM.
;; NOTE(review): the forms delimiting the substring are not visible here.
589 (defun ebnf-ebx-string (delim)
590 (buffer-substring-no-properties
;; the set of accepted characters depends on which quote opened the
;; string: each set leaves out its own delimiter
595 (skip-chars-forward (if (= delim ?
\")
596 ebnf-ebx-double-string-chars
597 ebnf-ebx-single-string-chars
)
599 (or (= (following-char) delim
)
600 (error "Missing string delimiter `%c'" delim
))
;; Scan a hexadecimal character terminal (#x [0-9A-Fa-f]+) with
;; `ebnf-ebx-hex-character' and return buffer text as a string without
;; text properties.
;; NOTE(review): the forms delimiting the substring are not visible here.
606 (defun ebnf-ebx-character ()
608 (buffer-substring-no-properties
611 (ebnf-ebx-hex-character)
;; Scan a character range terminal and return buffer text as a string
;; without text properties.
;; Signals "Missing end of character range `]'" when `ebnf-limit' is
;; reached before the closing `]'.
;; NOTE(review): several lines of this definition are not visible here.
615 (defun ebnf-ebx-range ()
616 ;; [ ^? ( char ( - char )? )+ ]
617 (buffer-substring-no-properties
;; optional leading `^' (negated set)
621 (and (= (following-char) ?^
)
;; a `-' at this position is tested separately from the range operator
;; handled below
623 (and (= (following-char) ?-
)
626 (ebnf-ebx-any-character)
;; `char - char': a `-' after a character introduces the upper bound
627 (when (= (following-char) ?-
)
629 (ebnf-ebx-any-character))
;; keep going until `]' or the end of the region
630 (and (/= (following-char) ?\
])
631 (< (point) ebnf-limit
))))
632 (and (>= (point) ebnf-limit
)
633 (error "Missing end of character range `]'"))
;; Check one character of a character range.
;; `ebnf-ebx-hex-character' is called with a non-nil NO-ERROR argument in
;; one clause; otherwise the character at point must fall in one of the
;; ranges listed below, or "Invalid character" is signaled.
;; NOTE(review): several lines of this definition are not visible here.
638 (defun ebnf-ebx-any-character ()
639 (let ((char (following-char)))
641 (ebnf-ebx-hex-character t
))
;; accepted ranges; the trailing comment on each line names the single
;; character left out between that range and the next one
642 ((or (and (<= ?\ char
) (<= char ?
\")) ; #
643 (and (<= ?$ char
) (<= char ?
,)) ; -
644 (and (<= ?. char
) (<= char ?
\\)) ; ]
645 (and (<= ?^ char
) (<= char ?~
))
646 (and (<= ?
\240 char
) (<= char ?
\377)))
649 (error "Invalid character `%c'" char
)))))
;; Scan the `x' and the hexadecimal digits of a `#x...' character.
;; Signals "Invalid hexadecimal character" when no hexadecimal digit can
;; be skipped.
;; NOTE(review): how the optional NO-ERROR argument is used is not visible
;; here; presumably it suppresses the error when `x' is missing -- confirm.
652 (defun ebnf-ebx-hex-character (&optional no-error
)
;; the character at point must be `x'
655 (if (/= (following-char) ?x
)
657 (error "Invalid hexadecimal character"))
;; at least one hexadecimal digit is required
659 (or (> (skip-chars-forward "0-9A-Fa-f" ebnf-limit
) 0)
660 (error "Invalid hexadecimal character"))))
663 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
668 ;;; arch-tag: bfe2f95b-66bc-4dc6-8b7e-b7831e68f5fb
669 ;;; ebnf-ebx.el ends here