*** empty log message ***
[emacs.git] / lisp / progmodes / ebnf-ebx.el
blobd7dfa7af89fad4b93e96957143d7ddc3b62d7338
1 ;;; ebnf-ebx.el --- parser for EBNF used to specify XML (EBNFX)
3 ;; Copyright (C) 2004 Free Sofware Foundation, Inc.
5 ;; Author: Vinicius Jose Latorre <viniciusjl@ig.com.br>
6 ;; Maintainer: Vinicius Jose Latorre <viniciusjl@ig.com.br>
7 ;; Time-stamp: <2004/04/03 16:45:34 vinicius>
8 ;; Keywords: wp, ebnf, PostScript
9 ;; Version: 1.1
11 ;; This file is part of GNU Emacs.
13 ;; GNU Emacs is free software; you can redistribute it and/or modify
14 ;; it under the terms of the GNU General Public License as published by
15 ;; the Free Software Foundation; either version 2, or (at your option)
16 ;; any later version.
18 ;; GNU Emacs is distributed in the hope that it will be useful,
19 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
20 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 ;; GNU General Public License for more details.
23 ;; You should have received a copy of the GNU General Public License
24 ;; along with GNU Emacs; see the file COPYING. If not, write to the
25 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
26 ;; Boston, MA 02111-1307, USA.
28 ;;; Commentary:
30 ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
33 ;; This is part of ebnf2ps package.
35 ;; This package defines a parser for EBNF used to specify XML (EBNFX).
37 ;; See ebnf2ps.el for documentation.
40 ;; EBNFX Syntax
41 ;; ------------
43 ;; See the URL:
44 ;; `http://www.w3.org/TR/2004/REC-xml-20040204/#sec-notation'
45 ;; (Extensible Markup Language (XML) 1.0 (Third Edition))
48 ;; rule ::= symbol '::=' expression
49 ;; /* rules are separated by at least one blank line. */
51 ;; expression ::= concatenation ('|' concatenation)*
53 ;; concatenation ::= exception*
55 ;; exception ::= term ('-' term)?
57 ;; term ::= factor ('*' | '+' | '?')?
59 ;; factor ::= hex-char+
60 ;; | '[' '^'? ( char ( '-' char )? )+ ']'
61 ;; | '"' 'string' '"'
62 ;; | "'" "string" "'"
63 ;; | '(' expression ')'
64 ;; | symbol
66 ;; symbol ::= 'upper or lower case letter'
67 ;; ('upper or lower case letter' | '-' | '_')*
68 ;; /* upper and lower 8-bit accentuated characters are included */
70 ;; hex-char ::= '#x' [0-9A-Fa-f]+
72 ;; char ::= hex-char | 'any character except control characters'
73 ;; /* 8-bit accentuated characters are included */
75 ;; any-char ::= char | 'newline' | 'tab'
77 ;; ignore ::= '[' ('wfc' | 'WFC' | 'vc' | 'VC') ':' ( any-char - ']' )* ']'
79 ;; comment ::= '/*' ( any-char - '*/' ) '*/'
82 ;; Below is the Notation section extracted from the URL cited above.
84 ;; 6 Notation
86 ;; The formal grammar of XML is given in this specification using a simple
87 ;; Extended Backus-Naur Form (EBNF) notation. Each rule in the grammar defines
88 ;; one symbol, in the form
90 ;; symbol ::= expression
92 ;; Symbols are written with an initial capital letter if they are the start
93 ;; symbol of a regular language, otherwise with an initial lowercase letter.
94 ;; Literal strings are quoted.
96 ;; Within the expression on the right-hand side of a rule, the following
97 ;; expressions are used to match strings of one or more characters:
99 ;; #xN
101 ;; where N is a hexadecimal integer, the expression matches the character
102 ;; whose number (code point) in ISO/IEC 10646 is N. The number of leading
103 ;; zeros in the #xN form is insignificant.
105 ;; [a-zA-Z], [#xN-#xN]
107 ;; matches any Char with a value in the range(s) indicated (inclusive).
109 ;; [abc], [#xN#xN#xN]
111 ;; matches any Char with a value among the characters enumerated.
112 ;; Enumerations and ranges can be mixed in one set of brackets.
114 ;; [^a-z], [^#xN-#xN]
116 ;; matches any Char with a value outside the range indicated.
118 ;; [^abc], [^#xN#xN#xN]
120 ;; matches any Char with a value not among the characters given.
121 ;; Enumerations and ranges of forbidden values can be mixed in one set of
122 ;; brackets.
124 ;; "string"
126 ;; matches a literal string matching that given inside the double quotes.
128 ;; 'string'
130 ;; matches a literal string matching that given inside the single quotes.
132 ;; These symbols may be combined to match more complex patterns as follows,
133 ;; where A and B represent simple expressions:
135 ;; (expression)
137 ;; expression is treated as a unit and may be combined as described in this
138 ;; list.
140 ;; A?
142 ;; matches A or nothing; optional A.
144 ;; A B
146 ;; matches A followed by B. This operator has higher precedence than
147 ;; alternation; thus A B | C D is identical to (A B) | (C D).
149 ;; A | B
151 ;; matches A or B.
153 ;; A - B
155 ;; matches any string that matches A but does not match B.
157 ;; A+
159 ;; matches one or more occurrences of A. Concatenation has higher
160 ;; precedence than alternation; thus A+ | B+ is identical to (A+) | (B+).
162 ;; A*
164 ;; matches zero or more occurrences of A. Concatenation has higher
165 ;; precedence than alternation; thus A* | B* is identical to (A*) | (B*).
167 ;; Other notations used in the productions are:
169 ;; /* ... */
171 ;; comment.
173 ;; [ wfc: ... ]
175 ;; well-formedness constraint; this identifies by name a constraint on
176 ;; well-formed documents associated with a production.
178 ;; [ vc: ... ]
180 ;; validity constraint; this identifies by name a constraint on valid
181 ;; documents associated with a production.
184 ;; Differences Between EBNFX And ebnf2ps EBNFX
185 ;; -------------------------------------------
187 ;; Besides the characters that EBNFX accepts, ebnf2ps EBNFX accepts also the
188 ;; underscore (_) and minus (-) for rule name and european 8-bit accentuated
189 ;; characters (from \240 to \377) for rule name, string and comment. Also
190 ;; rule name can start with upper case letter.
193 ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
195 ;;; Code:
198 (require 'ebnf-otz)
201 (defvar ebnf-ebx-lex nil
202 "Value returned by `ebnf-ebx-lex' function.")
205 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
206 ;; Syntactic analyzer
209 ;;; rulelist ::= rule+
211 (defun ebnf-ebx-parser (start)
212 "EBNFX parser."
213 (let ((total (+ (- ebnf-limit start) 1))
214 (bias (1- start))
215 (origin (point))
216 rule-list token rule)
217 (goto-char start)
218 (setq token (ebnf-ebx-lex))
219 (and (eq token 'end-of-input)
220 (error "Invalid EBNFX file format"))
221 (and (eq token 'end-of-rule)
222 (setq token (ebnf-ebx-lex)))
223 (while (not (eq token 'end-of-input))
224 (ebnf-message-float
225 "Parsing...%s%%"
226 (/ (* (- (point) bias) 100.0) total))
227 (setq token (ebnf-ebx-rule token)
228 rule (cdr token)
229 token (car token))
230 (or (ebnf-add-empty-rule-list rule)
231 (setq rule-list (cons rule rule-list))))
232 (goto-char origin)
233 rule-list))
236 ;;; rule ::= symbol '::=' expression
239 (defun ebnf-ebx-rule (token)
240 (let ((name ebnf-ebx-lex)
241 (action ebnf-action)
242 elements)
243 (setq ebnf-action nil)
244 (or (eq token 'non-terminal)
245 (error "Invalid rule name"))
246 (setq token (ebnf-ebx-lex))
247 (or (eq token 'production)
248 (error "Invalid rule: missing `::='"))
249 (setq elements (ebnf-ebx-expression))
250 (or (memq (car elements) '(end-of-rule end-of-input))
251 (error "Invalid rule: there is no end of rule"))
252 (setq elements (cdr elements))
253 (ebnf-eps-add-production name)
254 (cons (ebnf-ebx-lex)
255 (ebnf-make-production name elements action))))
258 ;; expression ::= concatenation ('|' concatenation)*
261 (defun ebnf-ebx-expression ()
262 (let (body concatenation)
263 (while (eq (car (setq concatenation
264 (ebnf-ebx-concatenation (ebnf-ebx-lex))))
265 'alternative)
266 (setq body (cons (cdr concatenation) body)))
267 (ebnf-token-alternative body concatenation)))
270 ;; concatenation ::= exception*
273 (defun ebnf-ebx-concatenation (token)
274 (let ((term (ebnf-ebx-exception token))
275 seq)
276 (or (setq token (car term)
277 term (cdr term))
278 (error "Empty element"))
279 (setq seq (cons term seq))
280 (while (setq term (ebnf-ebx-exception token)
281 token (car term)
282 term (cdr term))
283 (setq seq (cons term seq)))
284 (cons token
285 (ebnf-token-sequence seq))))
288 ;;; exception ::= term ('-' term)?
291 (defun ebnf-ebx-exception (token)
292 (let ((term (ebnf-ebx-term token)))
293 (if (eq (car term) 'exception)
294 (let ((except (ebnf-ebx-term (ebnf-ebx-lex))))
295 (cons (car except)
296 (ebnf-make-except (cdr term) (cdr except))))
297 term)))
301 ;;; term ::= factor ('*' | '+' | '?')?
304 (defun ebnf-ebx-term (token)
305 (let ((factor (ebnf-ebx-factor token)))
306 (when factor
307 (setq token (ebnf-ebx-lex))
308 (cond ((eq token 'zero-or-more)
309 (setq factor (ebnf-make-zero-or-more factor)
310 token (ebnf-ebx-lex)))
311 ((eq token 'one-or-more)
312 (setq factor (ebnf-make-one-or-more factor)
313 token (ebnf-ebx-lex)))
314 ((eq token 'optional)
315 (setq factor (ebnf-token-optional factor)
316 token (ebnf-ebx-lex)))))
317 (cons token factor)))
320 ;;; factor ::= hex-char+
321 ;;; | '[' '^'? ( char ( '-' char )? )+ ']'
322 ;;; | '"' 'string' '"'
323 ;;; | "'" "string" "'"
324 ;;; | '(' expression ')'
325 ;;; | symbol
327 ;;; symbol ::= 'upper or lower case letter'
328 ;;; ('upper or lower case letter' | '-' | '_')*
329 ;;; /* upper and lower 8-bit accentuated characters are included */
331 ;;; hex-char ::= '#x' [0-9A-Fa-f]+
333 ;;; char ::= hex-char | 'any character except control characters'
334 ;;; /* 8-bit accentuated characters are included */
336 ;;; any-char ::= char | 'newline' | 'tab'
339 (defun ebnf-ebx-factor (token)
340 (cond
341 ;; terminal
342 ((eq token 'terminal)
343 (ebnf-make-terminal ebnf-ebx-lex))
344 ;; non-terminal
345 ((eq token 'non-terminal)
346 (ebnf-make-non-terminal ebnf-ebx-lex))
347 ;; group
348 ((eq token 'begin-group)
349 (let ((body (ebnf-ebx-expression)))
350 (or (eq (car body) 'end-group)
351 (error "Missing `)'"))
352 (cdr body)))
353 ;; no element
355 nil)
359 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
360 ;; Lexical analyzer
363 (defconst ebnf-ebx-token-table (make-vector 256 'error)
364 "Vector used to map characters to a lexical token.")
367 (defun ebnf-ebx-initialize ()
368 "Initialize EBNFX token table."
369 ;; control character & control 8-bit character are set to `error'
370 (let ((char ?\101))
371 ;; printable character: A-Z
372 (while (< char ?\133)
373 (aset ebnf-ebx-token-table char 'non-terminal)
374 (setq char (1+ char)))
375 ;; printable character: a-z
376 (setq char ?\141)
377 (while (< char ?\173)
378 (aset ebnf-ebx-token-table char 'non-terminal)
379 (setq char (1+ char)))
380 ;; European 8-bit accentuated characters:
381 (setq char ?\240)
382 (while (< char ?\400)
383 (aset ebnf-ebx-token-table char 'non-terminal)
384 (setq char (1+ char)))
385 ;; Override end of line characters:
386 (aset ebnf-ebx-token-table ?\n 'end-of-rule) ; [NL] linefeed
387 (aset ebnf-ebx-token-table ?\r 'end-of-rule) ; [CR] carriage return
388 ;; Override space characters:
389 (aset ebnf-ebx-token-table ?\013 'space) ; [VT] vertical tab
390 (aset ebnf-ebx-token-table ?\t 'space) ; [HT] horizontal tab
391 (aset ebnf-ebx-token-table ?\ 'space) ; [SP] space
392 ;; Override form feed character:
393 (aset ebnf-ebx-token-table ?\f 'form-feed) ; [FF] form feed
394 ;; Override other lexical characters:
395 (aset ebnf-ebx-token-table ?# 'hash)
396 (aset ebnf-ebx-token-table ?\" 'double-quote)
397 (aset ebnf-ebx-token-table ?\' 'single-quote)
398 (aset ebnf-ebx-token-table ?\( 'begin-group)
399 (aset ebnf-ebx-token-table ?\) 'end-group)
400 (aset ebnf-ebx-token-table ?- 'exception)
401 (aset ebnf-ebx-token-table ?: 'colon)
402 (aset ebnf-ebx-token-table ?\[ 'begin-square)
403 (aset ebnf-ebx-token-table ?| 'alternative)
404 (aset ebnf-ebx-token-table ?* 'zero-or-more)
405 (aset ebnf-ebx-token-table ?+ 'one-or-more)
406 (aset ebnf-ebx-token-table ?\? 'optional)
407 ;; Override comment character:
408 (aset ebnf-ebx-token-table ?/ 'comment)))
411 ;; replace the range "\240-\377" (see `ebnf-range-regexp').
412 (defconst ebnf-ebx-non-terminal-chars
413 (ebnf-range-regexp "-_A-Za-z" ?\240 ?\377))
414 (defconst ebnf-ebx-non-terminal-letter-chars
415 (ebnf-range-regexp "A-Za-z" ?\240 ?\377))
418 (defun ebnf-ebx-lex ()
419 "Lexical analyser for EBNFX.
421 Return a lexical token.
423 See documentation for variable `ebnf-ebx-lex'."
424 (if (>= (point) ebnf-limit)
425 'end-of-input
426 (let (token)
427 ;; skip spaces and comments
428 (while (if (> (following-char) 255)
429 (progn
430 (setq token 'error)
431 nil)
432 (setq token (aref ebnf-ebx-token-table (following-char)))
433 (cond
434 ((eq token 'space)
435 (skip-chars-forward " \013\t" ebnf-limit)
436 (< (point) ebnf-limit))
437 ((eq token 'comment)
438 (ebnf-ebx-skip-comment))
439 ((eq token 'form-feed)
440 (forward-char)
441 (setq ebnf-action 'form-feed))
442 ((eq token 'end-of-rule)
443 (ebnf-ebx-skip-end-of-rule))
444 ((and (eq token 'begin-square)
445 (let ((case-fold-search t))
446 (looking-at "\\[\\(wfc\\|vc\\):")))
447 (ebnf-ebx-skip-constraint))
448 (t nil)
450 (cond
451 ;; end of input
452 ((>= (point) ebnf-limit)
453 'end-of-input)
454 ;; error
455 ((eq token 'error)
456 (error "Illegal character"))
457 ;; end of rule
458 ((eq token 'end-of-rule)
459 'end-of-rule)
460 ;; terminal: #x [0-9A-Fa-f]+
461 ((eq token 'hash)
462 (setq ebnf-ebx-lex (ebnf-ebx-character))
463 'terminal)
464 ;; terminal: "string"
465 ((eq token 'double-quote)
466 (setq ebnf-ebx-lex (ebnf-ebx-string ?\"))
467 'terminal)
468 ;; terminal: 'string'
469 ((eq token 'single-quote)
470 (setq ebnf-ebx-lex (ebnf-ebx-string ?\'))
471 'terminal)
472 ;; terminal: [ ^? ( char ( - char )? )+ ]
473 ((eq token 'begin-square)
474 (setq ebnf-ebx-lex (ebnf-ebx-range))
475 'terminal)
476 ;; non-terminal: NAME
477 ((eq token 'non-terminal)
478 (setq ebnf-ebx-lex
479 (ebnf-buffer-substring ebnf-ebx-non-terminal-chars))
480 'non-terminal)
481 ;; colon: ::=
482 ((eq token 'colon)
483 (or (looking-at "::=")
484 (error "Missing `::=' token"))
485 (forward-char 3)
486 'production)
487 ;; miscellaneous: (, ), *, +, ?, |, -
489 (forward-char)
490 token)
491 ))))
494 ;; replace the range "\177-\237" (see `ebnf-range-regexp').
495 (defconst ebnf-ebx-constraint-chars
496 (ebnf-range-regexp "^\000-\010\016-\037]" ?\177 ?\237))
499 (defun ebnf-ebx-skip-constraint ()
500 (or (> (skip-chars-forward ebnf-ebx-constraint-chars ebnf-limit) 0)
501 (error "Invalid character"))
502 (or (= (following-char) ?\])
503 (error "Missing end of constraint `]'"))
504 (forward-char)
509 (defun ebnf-ebx-skip-end-of-rule ()
510 (let (eor-p)
511 (while (progn
512 ;; end of rule ==> 2 or more consecutive end of lines
513 (setq eor-p (or (> (skip-chars-forward "\r\n" ebnf-limit) 1)
514 eor-p))
515 ;; skip spaces
516 (skip-chars-forward " \013\t" ebnf-limit)
517 ;; skip comments
518 (and (= (following-char) ?/)
519 (ebnf-ebx-skip-comment))))
520 (not eor-p)))
523 ;; replace the range "\177-\237" (see `ebnf-range-regexp').
524 (defconst ebnf-ebx-comment-chars
525 (ebnf-range-regexp "^\000-\010\016-\037\\*" ?\177 ?\237))
526 (defconst ebnf-ebx-filename-chars
527 (ebnf-range-regexp "^\000-\037\\*" ?\177 ?\237))
530 (defun ebnf-ebx-skip-comment ()
531 (forward-char)
532 (or (= (following-char) ?*)
533 (error "Invalid beginning of comment"))
534 (forward-char)
535 (cond
536 ;; open EPS file
537 ((and ebnf-eps-executing (= (following-char) ?\[))
538 (ebnf-eps-add-context (ebnf-ebx-eps-filename)))
539 ;; close EPS file
540 ((and ebnf-eps-executing (= (following-char) ?\]))
541 (ebnf-eps-remove-context (ebnf-ebx-eps-filename)))
542 ;; any other action in comment
544 (setq ebnf-action (aref ebnf-comment-table (following-char))))
546 (while (progn
547 (skip-chars-forward ebnf-ebx-comment-chars ebnf-limit)
548 (or (= (following-char) ?*)
549 (error "Missing end of comment"))
550 (forward-char)
551 (and (/= (following-char) ?/)
552 (< (point) ebnf-limit))))
553 ;; check for a valid end of comment
554 (and (>= (point) ebnf-limit)
555 (error "Missing end of comment"))
556 (forward-char)
560 (defun ebnf-ebx-eps-filename ()
561 (forward-char)
562 (let (fname nchar)
563 (while (progn
564 (setq fname
565 (concat fname
566 (ebnf-buffer-substring ebnf-ebx-filename-chars)))
567 (and (< (point) ebnf-limit)
568 (> (setq nchar (skip-chars-forward "*" ebnf-limit)) 0)
569 (< (point) ebnf-limit)
570 (/= (following-char) ?/)))
571 (setq fname (concat fname (make-string nchar ?*))
572 nchar nil))
573 (if (or (not nchar) (= nchar 0))
574 fname
575 (and (< (point) ebnf-limit)
576 (= (following-char) ?/)
577 (setq nchar (1- nchar)))
578 (concat fname (make-string nchar ?*)))))
581 ;; replace the range "\240-\377" (see `ebnf-range-regexp').
582 (defconst ebnf-ebx-double-string-chars
583 (ebnf-range-regexp "\t -!#-~" ?\240 ?\377))
584 (defconst ebnf-ebx-single-string-chars
585 (ebnf-range-regexp "\t -&(-~" ?\240 ?\377))
588 (defun ebnf-ebx-string (delim)
589 (buffer-substring-no-properties
590 (progn
591 (forward-char)
592 (point))
593 (progn
594 (skip-chars-forward (if (= delim ?\")
595 ebnf-ebx-double-string-chars
596 ebnf-ebx-single-string-chars)
597 ebnf-limit)
598 (or (= (following-char) delim)
599 (error "Missing string delimiter `%c'" delim))
600 (prog1
601 (point)
602 (forward-char)))))
605 (defun ebnf-ebx-character ()
606 ;; #x [0-9A-Fa-f]+
607 (buffer-substring-no-properties
608 (point)
609 (progn
610 (ebnf-ebx-hex-character)
611 (point))))
614 (defun ebnf-ebx-range ()
615 ;; [ ^? ( char ( - char )? )+ ]
616 (buffer-substring-no-properties
617 (point)
618 (progn
619 (forward-char)
620 (and (= (following-char) ?^)
621 (forward-char))
622 (and (= (following-char) ?-)
623 (forward-char))
624 (while (progn
625 (ebnf-ebx-any-character)
626 (when (= (following-char) ?-)
627 (forward-char)
628 (ebnf-ebx-any-character))
629 (and (/= (following-char) ?\])
630 (< (point) ebnf-limit))))
631 (and (>= (point) ebnf-limit)
632 (error "Missing end of character range `]'"))
633 (forward-char)
634 (point))))
637 (defun ebnf-ebx-any-character ()
638 (let ((char (following-char)))
639 (cond ((= char ?#)
640 (ebnf-ebx-hex-character t))
641 ((or (and (<= ?\ char) (<= char ?\")) ; #
642 (and (<= ?$ char) (<= char ?,)) ; -
643 (and (<= ?. char) (<= char ?\\)) ; ]
644 (and (<= ?^ char) (<= char ?~))
645 (and (<= ?\240 char) (<= char ?\377)))
646 (forward-char))
648 (error "Invalid character `%c'" char)))))
651 (defun ebnf-ebx-hex-character (&optional no-error)
652 ;; #x [0-9A-Fa-f]+
653 (forward-char)
654 (if (/= (following-char) ?x)
655 (or no-error
656 (error "Invalid hexadecimal character"))
657 (forward-char)
658 (or (> (skip-chars-forward "0-9A-Fa-f" ebnf-limit) 0)
659 (error "Invalid hexadecimal character"))))
662 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
665 (provide 'ebnf-ebx)
667 ;;; arch-tag: bfe2f95b-66bc-4dc6-8b7e-b7831e68f5fb
668 ;;; ebnf-ebx.el ends here