Add "Package:" file headers to denote built-in packages.
[emacs.git] / lisp / progmodes / ebnf-ebx.el
blobdd94f9e638afc3311300f366f305d47b4a9975c1
1 ;;; ebnf-ebx.el --- parser for EBNF used to specify XML (EBNFX)
3 ;; Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
4 ;; Free Software Foundation, Inc.
6 ;; Author: Vinicius Jose Latorre <viniciusjl@ig.com.br>
7 ;; Maintainer: Vinicius Jose Latorre <viniciusjl@ig.com.br>
8 ;; Keywords: wp, ebnf, PostScript
9 ;; Version: 1.2
10 ;; Package: ebnf2ps
12 ;; This file is part of GNU Emacs.
14 ;; GNU Emacs is free software: you can redistribute it and/or modify
15 ;; it under the terms of the GNU General Public License as published by
16 ;; the Free Software Foundation, either version 3 of the License, or
17 ;; (at your option) any later version.
19 ;; GNU Emacs is distributed in the hope that it will be useful,
20 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
21 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 ;; GNU General Public License for more details.
24 ;; You should have received a copy of the GNU General Public License
25 ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
27 ;;; Commentary:
29 ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
32 ;; This is part of ebnf2ps package.
34 ;; This package defines a parser for EBNF used to specify XML (EBNFX).
36 ;; See ebnf2ps.el for documentation.
39 ;; EBNFX Syntax
40 ;; ------------
42 ;; See the URL:
43 ;; `http://www.w3.org/TR/2004/REC-xml-20040204/#sec-notation'
44 ;; (Extensible Markup Language (XML) 1.0 (Third Edition))
47 ;; rule ::= symbol '::=' expression
48 ;; /* rules are separated by at least one blank line. */
50 ;; expression ::= concatenation ('|' concatenation)*
52 ;; concatenation ::= exception*
54 ;; exception ::= term ('-' term)?
56 ;; term ::= factor ('*' | '+' | '?')?
58 ;; factor ::= hex-char+
59 ;; | '[' '^'? ( char ( '-' char )? )+ ']'
60 ;; | '"' 'string' '"'
61 ;; | "'" "string" "'"
62 ;; | '(' expression ')'
63 ;; | symbol
65 ;; symbol ::= 'upper or lower case letter'
66 ;; ('upper or lower case letter' | '-' | '_')*
67 ;; /* upper and lower 8-bit accentuated characters are included */
69 ;; hex-char ::= '#x' [0-9A-Fa-f]+
71 ;; char ::= hex-char | 'any character except control characters'
72 ;; /* 8-bit accentuated characters are included */
74 ;; any-char ::= char | 'newline' | 'tab'
76 ;; ignore ::= '[' ('wfc' | 'WFC' | 'vc' | 'VC') ':' ( any-char - ']' )* ']'
78 ;; comment ::= '/*' ( any-char - '*/' ) '*/'
81 ;; Below is the Notation section extracted from the URL cited above.
83 ;; 6 Notation
85 ;; The formal grammar of XML is given in this specification using a simple
86 ;; Extended Backus-Naur Form (EBNF) notation. Each rule in the grammar defines
87 ;; one symbol, in the form
89 ;; symbol ::= expression
91 ;; Symbols are written with an initial capital letter if they are the start
92 ;; symbol of a regular language, otherwise with an initial lowercase letter.
93 ;; Literal strings are quoted.
95 ;; Within the expression on the right-hand side of a rule, the following
96 ;; expressions are used to match strings of one or more characters:
98 ;; #xN
100 ;; where N is a hexadecimal integer, the expression matches the character
101 ;; whose number (code point) in ISO/IEC 10646 is N. The number of leading
102 ;; zeros in the #xN form is insignificant.
104 ;; [a-zA-Z], [#xN-#xN]
106 ;; matches any Char with a value in the range(s) indicated (inclusive).
108 ;; [abc], [#xN#xN#xN]
110 ;; matches any Char with a value among the characters enumerated.
111 ;; Enumerations and ranges can be mixed in one set of brackets.
113 ;; [^a-z], [^#xN-#xN]
115 ;; matches any Char with a value outside the range indicated.
117 ;; [^abc], [^#xN#xN#xN]
119 ;; matches any Char with a value not among the characters given.
120 ;; Enumerations and ranges of forbidden values can be mixed in one set of
121 ;; brackets.
123 ;; "string"
125 ;; matches a literal string matching that given inside the double quotes.
127 ;; 'string'
129 ;; matches a literal string matching that given inside the single quotes.
131 ;; These symbols may be combined to match more complex patterns as follows,
132 ;; where A and B represent simple expressions:
134 ;; (expression)
136 ;; expression is treated as a unit and may be combined as described in this
137 ;; list.
139 ;; A?
141 ;; matches A or nothing; optional A.
143 ;; A B
145 ;; matches A followed by B. This operator has higher precedence than
146 ;; alternation; thus A B | C D is identical to (A B) | (C D).
148 ;; A | B
150 ;; matches A or B.
152 ;; A - B
154 ;; matches any string that matches A but does not match B.
156 ;; A+
158 ;; matches one or more occurrences of A. Concatenation has higher
159 ;; precedence than alternation; thus A+ | B+ is identical to (A+) | (B+).
161 ;; A*
163 ;; matches zero or more occurrences of A. Concatenation has higher
164 ;; precedence than alternation; thus A* | B* is identical to (A*) | (B*).
166 ;; Other notations used in the productions are:
168 ;; /* ... */
170 ;; comment.
172 ;; [ wfc: ... ]
174 ;; well-formedness constraint; this identifies by name a constraint on
175 ;; well-formed documents associated with a production.
177 ;; [ vc: ... ]
179 ;; validity constraint; this identifies by name a constraint on valid
180 ;; documents associated with a production.
183 ;; Differences Between EBNFX And ebnf2ps EBNFX
184 ;; -------------------------------------------
186 ;; Besides the characters that EBNFX accepts, ebnf2ps EBNFX accepts also the
187 ;; underscore (_) and minus (-) for rule name and european 8-bit accentuated
188 ;; characters (from \240 to \377) for rule name, string and comment. Also
189 ;; rule name can start with upper case letter.
192 ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
194 ;;; Code:
197 (require 'ebnf-otz)
200 (defvar ebnf-ebx-lex nil
201 "Value returned by `ebnf-ebx-lex' function.")
204 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
205 ;; Syntactic analyzer
208 ;;; rulelist ::= rule+
210 (defun ebnf-ebx-parser (start)
211 "EBNFX parser."
212 (let ((total (+ (- ebnf-limit start) 1))
213 (bias (1- start))
214 (origin (point))
215 rule-list token rule)
216 (goto-char start)
217 (setq token (ebnf-ebx-lex))
218 (and (eq token 'end-of-input)
219 (error "Invalid EBNFX file format"))
220 (and (eq token 'end-of-rule)
221 (setq token (ebnf-ebx-lex)))
222 (while (not (eq token 'end-of-input))
223 (ebnf-message-float
224 "Parsing...%s%%"
225 (/ (* (- (point) bias) 100.0) total))
226 (setq token (ebnf-ebx-rule token)
227 rule (cdr token)
228 token (car token))
229 (or (ebnf-add-empty-rule-list rule)
230 (setq rule-list (cons rule rule-list))))
231 (goto-char origin)
232 rule-list))
235 ;;; rule ::= symbol '::=' expression
238 (defun ebnf-ebx-rule (token)
239 (let ((name ebnf-ebx-lex)
240 (action ebnf-action)
241 elements)
242 (setq ebnf-action nil)
243 (or (eq token 'non-terminal)
244 (error "Invalid rule name"))
245 (setq token (ebnf-ebx-lex))
246 (or (eq token 'production)
247 (error "Invalid rule: missing `::='"))
248 (setq elements (ebnf-ebx-expression))
249 (or (memq (car elements) '(end-of-rule end-of-input))
250 (error "Invalid rule: there is no end of rule"))
251 (setq elements (cdr elements))
252 (ebnf-eps-add-production name)
253 (cons (ebnf-ebx-lex)
254 (ebnf-make-production name elements action))))
257 ;; expression ::= concatenation ('|' concatenation)*
260 (defun ebnf-ebx-expression ()
261 (let (body concatenation)
262 (while (eq (car (setq concatenation
263 (ebnf-ebx-concatenation (ebnf-ebx-lex))))
264 'alternative)
265 (setq body (cons (cdr concatenation) body)))
266 (ebnf-token-alternative body concatenation)))
269 ;; concatenation ::= exception*
272 (defun ebnf-ebx-concatenation (token)
273 (let ((term (ebnf-ebx-exception token))
274 seq)
275 (or (setq token (car term)
276 term (cdr term))
277 (error "Empty element"))
278 (setq seq (cons term seq))
279 (while (setq term (ebnf-ebx-exception token)
280 token (car term)
281 term (cdr term))
282 (setq seq (cons term seq)))
283 (cons token
284 (ebnf-token-sequence seq))))
287 ;;; exception ::= term ('-' term)?
290 (defun ebnf-ebx-exception (token)
291 (let ((term (ebnf-ebx-term token)))
292 (if (eq (car term) 'exception)
293 (let ((except (ebnf-ebx-term (ebnf-ebx-lex))))
294 (cons (car except)
295 (ebnf-make-except (cdr term) (cdr except))))
296 term)))
300 ;;; term ::= factor ('*' | '+' | '?')?
303 (defun ebnf-ebx-term (token)
304 (let ((factor (ebnf-ebx-factor token)))
305 (when factor
306 (setq token (ebnf-ebx-lex))
307 (cond ((eq token 'zero-or-more)
308 (setq factor (ebnf-make-zero-or-more factor)
309 token (ebnf-ebx-lex)))
310 ((eq token 'one-or-more)
311 (setq factor (ebnf-make-one-or-more factor)
312 token (ebnf-ebx-lex)))
313 ((eq token 'optional)
314 (setq factor (ebnf-token-optional factor)
315 token (ebnf-ebx-lex)))))
316 (cons token factor)))
319 ;;; factor ::= hex-char+
320 ;;; | '[' '^'? ( char ( '-' char )? )+ ']'
321 ;;; | '"' 'string' '"'
322 ;;; | "'" "string" "'"
323 ;;; | '(' expression ')'
324 ;;; | symbol
326 ;;; symbol ::= 'upper or lower case letter'
327 ;;; ('upper or lower case letter' | '-' | '_')*
328 ;;; /* upper and lower 8-bit accentuated characters are included */
330 ;;; hex-char ::= '#x' [0-9A-Fa-f]+
332 ;;; char ::= hex-char | 'any character except control characters'
333 ;;; /* 8-bit accentuated characters are included */
335 ;;; any-char ::= char | 'newline' | 'tab'
338 (defun ebnf-ebx-factor (token)
339 (cond
340 ;; terminal
341 ((eq token 'terminal)
342 (ebnf-make-terminal ebnf-ebx-lex))
343 ;; non-terminal
344 ((eq token 'non-terminal)
345 (ebnf-make-non-terminal ebnf-ebx-lex))
346 ;; group
347 ((eq token 'begin-group)
348 (let ((body (ebnf-ebx-expression)))
349 (or (eq (car body) 'end-group)
350 (error "Missing `)'"))
351 (cdr body)))
352 ;; no element
354 nil)
358 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
359 ;; Lexical analyzer
362 (defconst ebnf-ebx-token-table (make-vector 256 'error)
363 "Vector used to map characters to a lexical token.")
366 (defun ebnf-ebx-initialize ()
367 "Initialize EBNFX token table."
368 ;; control character & control 8-bit character are set to `error'
369 (let ((char ?\101))
370 ;; printable character: A-Z
371 (while (< char ?\133)
372 (aset ebnf-ebx-token-table char 'non-terminal)
373 (setq char (1+ char)))
374 ;; printable character: a-z
375 (setq char ?\141)
376 (while (< char ?\173)
377 (aset ebnf-ebx-token-table char 'non-terminal)
378 (setq char (1+ char)))
379 ;; European 8-bit accentuated characters:
380 (setq char ?\240)
381 (while (< char ?\400)
382 (aset ebnf-ebx-token-table char 'non-terminal)
383 (setq char (1+ char)))
384 ;; Override end of line characters:
385 (aset ebnf-ebx-token-table ?\n 'end-of-rule) ; [NL] linefeed
386 (aset ebnf-ebx-token-table ?\r 'end-of-rule) ; [CR] carriage return
387 ;; Override space characters:
388 (aset ebnf-ebx-token-table ?\013 'space) ; [VT] vertical tab
389 (aset ebnf-ebx-token-table ?\t 'space) ; [HT] horizontal tab
390 (aset ebnf-ebx-token-table ?\ 'space) ; [SP] space
391 ;; Override form feed character:
392 (aset ebnf-ebx-token-table ?\f 'form-feed) ; [FF] form feed
393 ;; Override other lexical characters:
394 (aset ebnf-ebx-token-table ?# 'hash)
395 (aset ebnf-ebx-token-table ?\" 'double-quote)
396 (aset ebnf-ebx-token-table ?\' 'single-quote)
397 (aset ebnf-ebx-token-table ?\( 'begin-group)
398 (aset ebnf-ebx-token-table ?\) 'end-group)
399 (aset ebnf-ebx-token-table ?- 'exception)
400 (aset ebnf-ebx-token-table ?: 'colon)
401 (aset ebnf-ebx-token-table ?\[ 'begin-square)
402 (aset ebnf-ebx-token-table ?| 'alternative)
403 (aset ebnf-ebx-token-table ?* 'zero-or-more)
404 (aset ebnf-ebx-token-table ?+ 'one-or-more)
405 (aset ebnf-ebx-token-table ?\? 'optional)
406 ;; Override comment character:
407 (aset ebnf-ebx-token-table ?/ 'comment)))
410 ;; replace the range "\240-\377" (see `ebnf-range-regexp').
411 (defconst ebnf-ebx-non-terminal-chars
412 (ebnf-range-regexp "-_A-Za-z" ?\240 ?\377))
413 (defconst ebnf-ebx-non-terminal-letter-chars
414 (ebnf-range-regexp "A-Za-z" ?\240 ?\377))
417 (defun ebnf-ebx-lex ()
418 "Lexical analyzer for EBNFX.
420 Return a lexical token.
422 See documentation for variable `ebnf-ebx-lex'."
423 (if (>= (point) ebnf-limit)
424 'end-of-input
425 (let (token)
426 ;; skip spaces and comments
427 (while (if (> (following-char) 255)
428 (progn
429 (setq token 'error)
430 nil)
431 (setq token (aref ebnf-ebx-token-table (following-char)))
432 (cond
433 ((eq token 'space)
434 (skip-chars-forward " \013\t" ebnf-limit)
435 (< (point) ebnf-limit))
436 ((eq token 'comment)
437 (ebnf-ebx-skip-comment))
438 ((eq token 'form-feed)
439 (forward-char)
440 (setq ebnf-action 'form-feed))
441 ((eq token 'end-of-rule)
442 (ebnf-ebx-skip-end-of-rule))
443 ((and (eq token 'begin-square)
444 (let ((case-fold-search t))
445 (looking-at "\\[\\(wfc\\|vc\\):")))
446 (ebnf-ebx-skip-constraint))
447 (t nil)
449 (cond
450 ;; end of input
451 ((>= (point) ebnf-limit)
452 'end-of-input)
453 ;; error
454 ((eq token 'error)
455 (error "Invalid character"))
456 ;; end of rule
457 ((eq token 'end-of-rule)
458 'end-of-rule)
459 ;; terminal: #x [0-9A-Fa-f]+
460 ((eq token 'hash)
461 (setq ebnf-ebx-lex (ebnf-ebx-character))
462 'terminal)
463 ;; terminal: "string"
464 ((eq token 'double-quote)
465 (setq ebnf-ebx-lex (ebnf-ebx-string ?\"))
466 'terminal)
467 ;; terminal: 'string'
468 ((eq token 'single-quote)
469 (setq ebnf-ebx-lex (ebnf-ebx-string ?\'))
470 'terminal)
471 ;; terminal: [ ^? ( char ( - char )? )+ ]
472 ((eq token 'begin-square)
473 (setq ebnf-ebx-lex (ebnf-ebx-range))
474 'terminal)
475 ;; non-terminal: NAME
476 ((eq token 'non-terminal)
477 (setq ebnf-ebx-lex
478 (ebnf-buffer-substring ebnf-ebx-non-terminal-chars))
479 'non-terminal)
480 ;; colon: ::=
481 ((eq token 'colon)
482 (or (looking-at "::=")
483 (error "Missing `::=' token"))
484 (forward-char 3)
485 'production)
486 ;; miscellaneous: (, ), *, +, ?, |, -
488 (forward-char)
489 token)
490 ))))
493 ;; replace the range "\177-\237" (see `ebnf-range-regexp').
494 (defconst ebnf-ebx-constraint-chars
495 (ebnf-range-regexp "^\000-\010\016-\037]" ?\177 ?\237))
498 (defun ebnf-ebx-skip-constraint ()
499 (or (> (skip-chars-forward ebnf-ebx-constraint-chars ebnf-limit) 0)
500 (error "Invalid character"))
501 (or (= (following-char) ?\])
502 (error "Missing end of constraint `]'"))
503 (forward-char)
508 (defun ebnf-ebx-skip-end-of-rule ()
509 (let (eor-p)
510 (while (progn
511 ;; end of rule ==> 2 or more consecutive end of lines
512 (setq eor-p (or (> (skip-chars-forward "\r\n" ebnf-limit) 1)
513 eor-p))
514 ;; skip spaces
515 (skip-chars-forward " \013\t" ebnf-limit)
516 ;; skip comments
517 (and (= (following-char) ?/)
518 (ebnf-ebx-skip-comment))))
519 (not eor-p)))
522 ;; replace the range "\177-\237" (see `ebnf-range-regexp').
523 (defconst ebnf-ebx-comment-chars
524 (ebnf-range-regexp "^\000-\010\016-\037\\*" ?\177 ?\237))
525 (defconst ebnf-ebx-filename-chars
526 (ebnf-range-regexp "^\000-\037\\*" ?\177 ?\237))
529 (defun ebnf-ebx-skip-comment ()
530 (forward-char)
531 (or (= (following-char) ?*)
532 (error "Invalid beginning of comment"))
533 (forward-char)
534 (cond
535 ;; open EPS file
536 ((and ebnf-eps-executing (= (following-char) ?\[))
537 (ebnf-eps-add-context (ebnf-ebx-eps-filename)))
538 ;; close EPS file
539 ((and ebnf-eps-executing (= (following-char) ?\]))
540 (ebnf-eps-remove-context (ebnf-ebx-eps-filename)))
541 ;; EPS header
542 ((and ebnf-eps-executing (= (following-char) ?H))
543 (ebnf-eps-header-comment (ebnf-ebx-eps-filename)))
544 ;; EPS footer
545 ((and ebnf-eps-executing (= (following-char) ?F))
546 (ebnf-eps-footer-comment (ebnf-ebx-eps-filename)))
547 ;; any other action in comment
549 (setq ebnf-action (aref ebnf-comment-table (following-char))))
551 (while (progn
552 (skip-chars-forward ebnf-ebx-comment-chars ebnf-limit)
553 (or (= (following-char) ?*)
554 (error "Missing end of comment"))
555 (forward-char)
556 (and (/= (following-char) ?/)
557 (< (point) ebnf-limit))))
558 ;; check for a valid end of comment
559 (and (>= (point) ebnf-limit)
560 (error "Missing end of comment"))
561 (forward-char)
565 (defun ebnf-ebx-eps-filename ()
566 (forward-char)
567 (let (fname nchar)
568 (while (progn
569 (setq fname
570 (concat fname
571 (ebnf-buffer-substring ebnf-ebx-filename-chars)))
572 (and (< (point) ebnf-limit)
573 (> (setq nchar (skip-chars-forward "*" ebnf-limit)) 0)
574 (< (point) ebnf-limit)
575 (/= (following-char) ?/)))
576 (setq fname (concat fname (make-string nchar ?*))
577 nchar nil))
578 (if (or (not nchar) (= nchar 0))
579 fname
580 (and (< (point) ebnf-limit)
581 (= (following-char) ?/)
582 (setq nchar (1- nchar)))
583 (concat fname (make-string nchar ?*)))))
586 ;; replace the range "\240-\377" (see `ebnf-range-regexp').
587 (defconst ebnf-ebx-double-string-chars
588 (ebnf-range-regexp "\t -!#-~" ?\240 ?\377))
589 (defconst ebnf-ebx-single-string-chars
590 (ebnf-range-regexp "\t -&(-~" ?\240 ?\377))
593 (defun ebnf-ebx-string (delim)
594 (buffer-substring-no-properties
595 (progn
596 (forward-char)
597 (point))
598 (progn
599 (skip-chars-forward (if (= delim ?\")
600 ebnf-ebx-double-string-chars
601 ebnf-ebx-single-string-chars)
602 ebnf-limit)
603 (or (= (following-char) delim)
604 (error "Missing string delimiter `%c'" delim))
605 (prog1
606 (point)
607 (forward-char)))))
610 (defun ebnf-ebx-character ()
611 ;; #x [0-9A-Fa-f]+
612 (buffer-substring-no-properties
613 (point)
614 (progn
615 (ebnf-ebx-hex-character)
616 (point))))
619 (defun ebnf-ebx-range ()
620 ;; [ ^? ( char ( - char )? )+ ]
621 (buffer-substring-no-properties
622 (point)
623 (progn
624 (forward-char)
625 (and (= (following-char) ?^)
626 (forward-char))
627 (and (= (following-char) ?-)
628 (forward-char))
629 (while (progn
630 (ebnf-ebx-any-character)
631 (when (= (following-char) ?-)
632 (forward-char)
633 (ebnf-ebx-any-character))
634 (and (/= (following-char) ?\])
635 (< (point) ebnf-limit))))
636 (and (>= (point) ebnf-limit)
637 (error "Missing end of character range `]'"))
638 (forward-char)
639 (point))))
642 (defun ebnf-ebx-any-character ()
643 (let ((char (following-char)))
644 (cond ((= char ?#)
645 (ebnf-ebx-hex-character t))
646 ((or (and (<= ?\ char) (<= char ?\")) ; #
647 (and (<= ?$ char) (<= char ?,)) ; -
648 (and (<= ?. char) (<= char ?\\)) ; ]
649 (and (<= ?^ char) (<= char ?~))
650 (and (<= ?\240 char) (<= char ?\377)))
651 (forward-char))
653 (error "Invalid character `%c'" char)))))
656 (defun ebnf-ebx-hex-character (&optional no-error)
657 ;; #x [0-9A-Fa-f]+
658 (forward-char)
659 (if (/= (following-char) ?x)
660 (or no-error
661 (error "Invalid hexadecimal character"))
662 (forward-char)
663 (or (> (skip-chars-forward "0-9A-Fa-f" ebnf-limit) 0)
664 (error "Invalid hexadecimal character"))))
667 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
670 (provide 'ebnf-ebx)
672 ;; arch-tag: bfe2f95b-66bc-4dc6-8b7e-b7831e68f5fb
673 ;;; ebnf-ebx.el ends here