Update copyright year to 2014 by running admin/update-copyright.
[emacs.git] / lisp / progmodes / ebnf-ebx.el
blob607abd6011f9e0c023d8877d0c1688a95bf7d3f1
1 ;;; ebnf-ebx.el --- parser for EBNF used to specify XML (EBNFX)
3 ;; Copyright (C) 2001-2014 Free Software Foundation, Inc.
5 ;; Author: Vinicius Jose Latorre <viniciusjl@ig.com.br>
6 ;; Maintainer: Vinicius Jose Latorre <viniciusjl@ig.com.br>
7 ;; Keywords: wp, ebnf, PostScript
8 ;; Old-Version: 1.2
9 ;; Package: ebnf2ps
11 ;; This file is part of GNU Emacs.
13 ;; GNU Emacs is free software: you can redistribute it and/or modify
14 ;; it under the terms of the GNU General Public License as published by
15 ;; the Free Software Foundation, either version 3 of the License, or
16 ;; (at your option) any later version.
18 ;; GNU Emacs is distributed in the hope that it will be useful,
19 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
20 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 ;; GNU General Public License for more details.
23 ;; You should have received a copy of the GNU General Public License
24 ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
26 ;;; Commentary:
28 ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
31 ;; This is part of ebnf2ps package.
33 ;; This package defines a parser for EBNF used to specify XML (EBNFX).
35 ;; See ebnf2ps.el for documentation.
38 ;; EBNFX Syntax
39 ;; ------------
41 ;; See the URL:
42 ;; `http://www.w3.org/TR/2004/REC-xml-20040204/#sec-notation'
43 ;; (Extensible Markup Language (XML) 1.0 (Third Edition))
46 ;; rule ::= symbol '::=' expression
47 ;; /* rules are separated by at least one blank line. */
49 ;; expression ::= concatenation ('|' concatenation)*
51 ;; concatenation ::= exception*
53 ;; exception ::= term ('-' term)?
55 ;; term ::= factor ('*' | '+' | '?')?
57 ;; factor ::= hex-char+
58 ;; | '[' '^'? ( char ( '-' char )? )+ ']'
59 ;; | '"' 'string' '"'
60 ;; | "'" "string" "'"
61 ;; | '(' expression ')'
62 ;; | symbol
64 ;; symbol ::= 'upper or lower case letter'
65 ;; ('upper or lower case letter' | '-' | '_')*
66 ;; /* upper and lower 8-bit accentuated characters are included */
68 ;; hex-char ::= '#x' [0-9A-Fa-f]+
70 ;; char ::= hex-char | 'any character except control characters'
71 ;; /* 8-bit accentuated characters are included */
73 ;; any-char ::= char | 'newline' | 'tab'
75 ;; ignore ::= '[' ('wfc' | 'WFC' | 'vc' | 'VC') ':' ( any-char - ']' )* ']'
77 ;; comment ::= '/*' ( any-char - '*/' ) '*/'
80 ;; Below is the Notation section extracted from the URL cited above.
82 ;; 6 Notation
84 ;; The formal grammar of XML is given in this specification using a simple
85 ;; Extended Backus-Naur Form (EBNF) notation. Each rule in the grammar defines
86 ;; one symbol, in the form
88 ;; symbol ::= expression
90 ;; Symbols are written with an initial capital letter if they are the start
91 ;; symbol of a regular language, otherwise with an initial lowercase letter.
92 ;; Literal strings are quoted.
94 ;; Within the expression on the right-hand side of a rule, the following
95 ;; expressions are used to match strings of one or more characters:
97 ;; #xN
99 ;; where N is a hexadecimal integer, the expression matches the character
100 ;; whose number (code point) in ISO/IEC 10646 is N. The number of leading
101 ;; zeros in the #xN form is insignificant.
103 ;; [a-zA-Z], [#xN-#xN]
105 ;; matches any Char with a value in the range(s) indicated (inclusive).
107 ;; [abc], [#xN#xN#xN]
109 ;; matches any Char with a value among the characters enumerated.
110 ;; Enumerations and ranges can be mixed in one set of brackets.
112 ;; [^a-z], [^#xN-#xN]
114 ;; matches any Char with a value outside the range indicated.
116 ;; [^abc], [^#xN#xN#xN]
118 ;; matches any Char with a value not among the characters given.
119 ;; Enumerations and ranges of forbidden values can be mixed in one set of
120 ;; brackets.
122 ;; "string"
124 ;; matches a literal string matching that given inside the double quotes.
126 ;; 'string'
128 ;; matches a literal string matching that given inside the single quotes.
130 ;; These symbols may be combined to match more complex patterns as follows,
131 ;; where A and B represent simple expressions:
133 ;; (expression)
135 ;; expression is treated as a unit and may be combined as described in this
136 ;; list.
138 ;; A?
140 ;; matches A or nothing; optional A.
142 ;; A B
144 ;; matches A followed by B. This operator has higher precedence than
145 ;; alternation; thus A B | C D is identical to (A B) | (C D).
147 ;; A | B
149 ;; matches A or B.
151 ;; A - B
153 ;; matches any string that matches A but does not match B.
155 ;; A+
157 ;; matches one or more occurrences of A. Concatenation has higher
158 ;; precedence than alternation; thus A+ | B+ is identical to (A+) | (B+).
160 ;; A*
162 ;; matches zero or more occurrences of A. Concatenation has higher
163 ;; precedence than alternation; thus A* | B* is identical to (A*) | (B*).
165 ;; Other notations used in the productions are:
167 ;; /* ... */
169 ;; comment.
171 ;; [ wfc: ... ]
173 ;; well-formedness constraint; this identifies by name a constraint on
174 ;; well-formed documents associated with a production.
176 ;; [ vc: ... ]
178 ;; validity constraint; this identifies by name a constraint on valid
179 ;; documents associated with a production.
182 ;; Differences Between EBNFX And ebnf2ps EBNFX
183 ;; -------------------------------------------
185 ;; Besides the characters that EBNFX accepts, ebnf2ps EBNFX accepts also the
186 ;; underscore (_) and minus (-) for rule name and european 8-bit accentuated
187 ;; characters (from \240 to \377) for rule name, string and comment. Also
188 ;; rule name can start with upper case letter.
191 ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
193 ;;; Code:
196 (require 'ebnf-otz)
199 (defvar ebnf-ebx-lex nil
200 "Value returned by `ebnf-ebx-lex' function.")
203 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
204 ;; Syntactic analyzer
207 ;;; rulelist ::= rule+
209 (defun ebnf-ebx-parser (start)
210 "EBNFX parser."
211 (let ((total (+ (- ebnf-limit start) 1))
212 (bias (1- start))
213 (origin (point))
214 rule-list token rule)
215 (goto-char start)
216 (setq token (ebnf-ebx-lex))
217 (and (eq token 'end-of-input)
218 (error "Invalid EBNFX file format"))
219 (and (eq token 'end-of-rule)
220 (setq token (ebnf-ebx-lex)))
221 (while (not (eq token 'end-of-input))
222 (ebnf-message-float
223 "Parsing...%s%%"
224 (/ (* (- (point) bias) 100.0) total))
225 (setq token (ebnf-ebx-rule token)
226 rule (cdr token)
227 token (car token))
228 (or (ebnf-add-empty-rule-list rule)
229 (setq rule-list (cons rule rule-list))))
230 (goto-char origin)
231 rule-list))
234 ;;; rule ::= symbol '::=' expression
237 (defun ebnf-ebx-rule (token)
238 (let ((name ebnf-ebx-lex)
239 (action ebnf-action)
240 elements)
241 (setq ebnf-action nil)
242 (or (eq token 'non-terminal)
243 (error "Invalid rule name"))
244 (setq token (ebnf-ebx-lex))
245 (or (eq token 'production)
246 (error "Invalid rule: missing `::='"))
247 (setq elements (ebnf-ebx-expression))
248 (or (memq (car elements) '(end-of-rule end-of-input))
249 (error "Invalid rule: there is no end of rule"))
250 (setq elements (cdr elements))
251 (ebnf-eps-add-production name)
252 (cons (ebnf-ebx-lex)
253 (ebnf-make-production name elements action))))
256 ;; expression ::= concatenation ('|' concatenation)*
259 (defun ebnf-ebx-expression ()
260 (let (body concatenation)
261 (while (eq (car (setq concatenation
262 (ebnf-ebx-concatenation (ebnf-ebx-lex))))
263 'alternative)
264 (setq body (cons (cdr concatenation) body)))
265 (ebnf-token-alternative body concatenation)))
268 ;; concatenation ::= exception*
271 (defun ebnf-ebx-concatenation (token)
272 (let ((term (ebnf-ebx-exception token))
273 seq)
274 (or (setq token (car term)
275 term (cdr term))
276 (error "Empty element"))
277 (setq seq (cons term seq))
278 (while (setq term (ebnf-ebx-exception token)
279 token (car term)
280 term (cdr term))
281 (setq seq (cons term seq)))
282 (cons token
283 (ebnf-token-sequence seq))))
286 ;;; exception ::= term ('-' term)?
289 (defun ebnf-ebx-exception (token)
290 (let ((term (ebnf-ebx-term token)))
291 (if (eq (car term) 'exception)
292 (let ((except (ebnf-ebx-term (ebnf-ebx-lex))))
293 (cons (car except)
294 (ebnf-make-except (cdr term) (cdr except))))
295 term)))
299 ;;; term ::= factor ('*' | '+' | '?')?
302 (defun ebnf-ebx-term (token)
303 (let ((factor (ebnf-ebx-factor token)))
304 (when factor
305 (setq token (ebnf-ebx-lex))
306 (cond ((eq token 'zero-or-more)
307 (setq factor (ebnf-make-zero-or-more factor)
308 token (ebnf-ebx-lex)))
309 ((eq token 'one-or-more)
310 (setq factor (ebnf-make-one-or-more factor)
311 token (ebnf-ebx-lex)))
312 ((eq token 'optional)
313 (setq factor (ebnf-token-optional factor)
314 token (ebnf-ebx-lex)))))
315 (cons token factor)))
318 ;;; factor ::= hex-char+
319 ;;; | '[' '^'? ( char ( '-' char )? )+ ']'
320 ;;; | '"' 'string' '"'
321 ;;; | "'" "string" "'"
322 ;;; | '(' expression ')'
323 ;;; | symbol
325 ;;; symbol ::= 'upper or lower case letter'
326 ;;; ('upper or lower case letter' | '-' | '_')*
327 ;;; /* upper and lower 8-bit accentuated characters are included */
329 ;;; hex-char ::= '#x' [0-9A-Fa-f]+
331 ;;; char ::= hex-char | 'any character except control characters'
332 ;;; /* 8-bit accentuated characters are included */
334 ;;; any-char ::= char | 'newline' | 'tab'
337 (defun ebnf-ebx-factor (token)
338 (cond
339 ;; terminal
340 ((eq token 'terminal)
341 (ebnf-make-terminal ebnf-ebx-lex))
342 ;; non-terminal
343 ((eq token 'non-terminal)
344 (ebnf-make-non-terminal ebnf-ebx-lex))
345 ;; group
346 ((eq token 'begin-group)
347 (let ((body (ebnf-ebx-expression)))
348 (or (eq (car body) 'end-group)
349 (error "Missing `)'"))
350 (cdr body)))
351 ;; no element
353 nil)
357 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
358 ;; Lexical analyzer
361 (defconst ebnf-ebx-token-table (make-vector 256 'error)
362 "Vector used to map characters to a lexical token.")
365 (defun ebnf-ebx-initialize ()
366 "Initialize EBNFX token table."
367 ;; control character & control 8-bit character are set to `error'
368 (let ((char ?\101))
369 ;; printable character: A-Z
370 (while (< char ?\133)
371 (aset ebnf-ebx-token-table char 'non-terminal)
372 (setq char (1+ char)))
373 ;; printable character: a-z
374 (setq char ?\141)
375 (while (< char ?\173)
376 (aset ebnf-ebx-token-table char 'non-terminal)
377 (setq char (1+ char)))
378 ;; European 8-bit accentuated characters:
379 (setq char ?\240)
380 (while (< char ?\400)
381 (aset ebnf-ebx-token-table char 'non-terminal)
382 (setq char (1+ char)))
383 ;; Override end of line characters:
384 (aset ebnf-ebx-token-table ?\n 'end-of-rule) ; [NL] linefeed
385 (aset ebnf-ebx-token-table ?\r 'end-of-rule) ; [CR] carriage return
386 ;; Override space characters:
387 (aset ebnf-ebx-token-table ?\013 'space) ; [VT] vertical tab
388 (aset ebnf-ebx-token-table ?\t 'space) ; [HT] horizontal tab
389 (aset ebnf-ebx-token-table ?\ 'space) ; [SP] space
390 ;; Override form feed character:
391 (aset ebnf-ebx-token-table ?\f 'form-feed) ; [FF] form feed
392 ;; Override other lexical characters:
393 (aset ebnf-ebx-token-table ?# 'hash)
394 (aset ebnf-ebx-token-table ?\" 'double-quote)
395 (aset ebnf-ebx-token-table ?\' 'single-quote)
396 (aset ebnf-ebx-token-table ?\( 'begin-group)
397 (aset ebnf-ebx-token-table ?\) 'end-group)
398 (aset ebnf-ebx-token-table ?- 'exception)
399 (aset ebnf-ebx-token-table ?: 'colon)
400 (aset ebnf-ebx-token-table ?\[ 'begin-square)
401 (aset ebnf-ebx-token-table ?| 'alternative)
402 (aset ebnf-ebx-token-table ?* 'zero-or-more)
403 (aset ebnf-ebx-token-table ?+ 'one-or-more)
404 (aset ebnf-ebx-token-table ?\? 'optional)
405 ;; Override comment character:
406 (aset ebnf-ebx-token-table ?/ 'comment)))
409 ;; replace the range "\240-\377" (see `ebnf-range-regexp').
410 (defconst ebnf-ebx-non-terminal-chars
411 (ebnf-range-regexp "-_A-Za-z" ?\240 ?\377))
412 (defconst ebnf-ebx-non-terminal-letter-chars
413 (ebnf-range-regexp "A-Za-z" ?\240 ?\377))
416 (defun ebnf-ebx-lex ()
417 "Lexical analyzer for EBNFX.
419 Return a lexical token.
421 See documentation for variable `ebnf-ebx-lex'."
422 (if (>= (point) ebnf-limit)
423 'end-of-input
424 (let (token)
425 ;; skip spaces and comments
426 (while (if (> (following-char) 255)
427 (progn
428 (setq token 'error)
429 nil)
430 (setq token (aref ebnf-ebx-token-table (following-char)))
431 (cond
432 ((eq token 'space)
433 (skip-chars-forward " \013\t" ebnf-limit)
434 (< (point) ebnf-limit))
435 ((eq token 'comment)
436 (ebnf-ebx-skip-comment))
437 ((eq token 'form-feed)
438 (forward-char)
439 (setq ebnf-action 'form-feed))
440 ((eq token 'end-of-rule)
441 (ebnf-ebx-skip-end-of-rule))
442 ((and (eq token 'begin-square)
443 (let ((case-fold-search t))
444 (looking-at "\\[\\(wfc\\|vc\\):")))
445 (ebnf-ebx-skip-constraint))
446 (t nil)
448 (cond
449 ;; end of input
450 ((>= (point) ebnf-limit)
451 'end-of-input)
452 ;; error
453 ((eq token 'error)
454 (error "Invalid character"))
455 ;; end of rule
456 ((eq token 'end-of-rule)
457 'end-of-rule)
458 ;; terminal: #x [0-9A-Fa-f]+
459 ((eq token 'hash)
460 (setq ebnf-ebx-lex (ebnf-ebx-character))
461 'terminal)
462 ;; terminal: "string"
463 ((eq token 'double-quote)
464 (setq ebnf-ebx-lex (ebnf-ebx-string ?\"))
465 'terminal)
466 ;; terminal: 'string'
467 ((eq token 'single-quote)
468 (setq ebnf-ebx-lex (ebnf-ebx-string ?\'))
469 'terminal)
470 ;; terminal: [ ^? ( char ( - char )? )+ ]
471 ((eq token 'begin-square)
472 (setq ebnf-ebx-lex (ebnf-ebx-range))
473 'terminal)
474 ;; non-terminal: NAME
475 ((eq token 'non-terminal)
476 (setq ebnf-ebx-lex
477 (ebnf-buffer-substring ebnf-ebx-non-terminal-chars))
478 'non-terminal)
479 ;; colon: ::=
480 ((eq token 'colon)
481 (or (looking-at "::=")
482 (error "Missing `::=' token"))
483 (forward-char 3)
484 'production)
485 ;; miscellaneous: (, ), *, +, ?, |, -
487 (forward-char)
488 token)
489 ))))
492 ;; replace the range "\177-\237" (see `ebnf-range-regexp').
493 (defconst ebnf-ebx-constraint-chars
494 (ebnf-range-regexp "^\000-\010\016-\037]" ?\177 ?\237))
497 (defun ebnf-ebx-skip-constraint ()
498 (or (> (skip-chars-forward ebnf-ebx-constraint-chars ebnf-limit) 0)
499 (error "Invalid character"))
500 (or (= (following-char) ?\])
501 (error "Missing end of constraint `]'"))
502 (forward-char)
507 (defun ebnf-ebx-skip-end-of-rule ()
508 (let (eor-p)
509 (while (progn
510 ;; end of rule ==> 2 or more consecutive end of lines
511 (setq eor-p (or (> (skip-chars-forward "\r\n" ebnf-limit) 1)
512 eor-p))
513 ;; skip spaces
514 (skip-chars-forward " \013\t" ebnf-limit)
515 ;; skip comments
516 (and (= (following-char) ?/)
517 (ebnf-ebx-skip-comment))))
518 (not eor-p)))
521 ;; replace the range "\177-\237" (see `ebnf-range-regexp').
522 (defconst ebnf-ebx-comment-chars
523 (ebnf-range-regexp "^\000-\010\016-\037\\*" ?\177 ?\237))
524 (defconst ebnf-ebx-filename-chars
525 (ebnf-range-regexp "^\000-\037\\*" ?\177 ?\237))
528 (defun ebnf-ebx-skip-comment ()
529 (forward-char)
530 (or (= (following-char) ?*)
531 (error "Invalid beginning of comment"))
532 (forward-char)
533 (cond
534 ;; open EPS file
535 ((and ebnf-eps-executing (= (following-char) ?\[))
536 (ebnf-eps-add-context (ebnf-ebx-eps-filename)))
537 ;; close EPS file
538 ((and ebnf-eps-executing (= (following-char) ?\]))
539 (ebnf-eps-remove-context (ebnf-ebx-eps-filename)))
540 ;; EPS header
541 ((and ebnf-eps-executing (= (following-char) ?H))
542 (ebnf-eps-header-comment (ebnf-ebx-eps-filename)))
543 ;; EPS footer
544 ((and ebnf-eps-executing (= (following-char) ?F))
545 (ebnf-eps-footer-comment (ebnf-ebx-eps-filename)))
546 ;; any other action in comment
548 (setq ebnf-action (aref ebnf-comment-table (following-char))))
550 (while (progn
551 (skip-chars-forward ebnf-ebx-comment-chars ebnf-limit)
552 (or (= (following-char) ?*)
553 (error "Missing end of comment"))
554 (forward-char)
555 (and (/= (following-char) ?/)
556 (< (point) ebnf-limit))))
557 ;; check for a valid end of comment
558 (and (>= (point) ebnf-limit)
559 (error "Missing end of comment"))
560 (forward-char)
564 (defun ebnf-ebx-eps-filename ()
565 (forward-char)
566 (let (fname nchar)
567 (while (progn
568 (setq fname
569 (concat fname
570 (ebnf-buffer-substring ebnf-ebx-filename-chars)))
571 (and (< (point) ebnf-limit)
572 (> (setq nchar (skip-chars-forward "*" ebnf-limit)) 0)
573 (< (point) ebnf-limit)
574 (/= (following-char) ?/)))
575 (setq fname (concat fname (make-string nchar ?*))
576 nchar nil))
577 (if (or (not nchar) (= nchar 0))
578 fname
579 (and (< (point) ebnf-limit)
580 (= (following-char) ?/)
581 (setq nchar (1- nchar)))
582 (concat fname (make-string nchar ?*)))))
585 ;; replace the range "\240-\377" (see `ebnf-range-regexp').
586 (defconst ebnf-ebx-double-string-chars
587 (ebnf-range-regexp "\t -!#-~" ?\240 ?\377))
588 (defconst ebnf-ebx-single-string-chars
589 (ebnf-range-regexp "\t -&(-~" ?\240 ?\377))
592 (defun ebnf-ebx-string (delim)
593 (buffer-substring-no-properties
594 (progn
595 (forward-char)
596 (point))
597 (progn
598 (skip-chars-forward (if (= delim ?\")
599 ebnf-ebx-double-string-chars
600 ebnf-ebx-single-string-chars)
601 ebnf-limit)
602 (or (= (following-char) delim)
603 (error "Missing string delimiter `%c'" delim))
604 (prog1
605 (point)
606 (forward-char)))))
609 (defun ebnf-ebx-character ()
610 ;; #x [0-9A-Fa-f]+
611 (buffer-substring-no-properties
612 (point)
613 (progn
614 (ebnf-ebx-hex-character)
615 (point))))
618 (defun ebnf-ebx-range ()
619 ;; [ ^? ( char ( - char )? )+ ]
620 (buffer-substring-no-properties
621 (point)
622 (progn
623 (forward-char)
624 (and (= (following-char) ?^)
625 (forward-char))
626 (and (= (following-char) ?-)
627 (forward-char))
628 (while (progn
629 (ebnf-ebx-any-character)
630 (when (= (following-char) ?-)
631 (forward-char)
632 (ebnf-ebx-any-character))
633 (and (/= (following-char) ?\])
634 (< (point) ebnf-limit))))
635 (and (>= (point) ebnf-limit)
636 (error "Missing end of character range `]'"))
637 (forward-char)
638 (point))))
641 (defun ebnf-ebx-any-character ()
642 (let ((char (following-char)))
643 (cond ((= char ?#)
644 (ebnf-ebx-hex-character t))
645 ((or (and (<= ?\ char) (<= char ?\")) ; #
646 (and (<= ?$ char) (<= char ?,)) ; -
647 (and (<= ?. char) (<= char ?\\)) ; ]
648 (and (<= ?^ char) (<= char ?~))
649 (and (<= ?\240 char) (<= char ?\377)))
650 (forward-char))
652 (error "Invalid character `%c'" char)))))
655 (defun ebnf-ebx-hex-character (&optional no-error)
656 ;; #x [0-9A-Fa-f]+
657 (forward-char)
658 (if (/= (following-char) ?x)
659 (or no-error
660 (error "Invalid hexadecimal character"))
661 (forward-char)
662 (or (> (skip-chars-forward "0-9A-Fa-f" ebnf-limit) 0)
663 (error "Invalid hexadecimal character"))))
666 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
669 (provide 'ebnf-ebx)
671 ;;; ebnf-ebx.el ends here