1 ;;; xmltok.el --- XML tokenization
3 ;; Copyright (C) 2003, 2007, 2008, 2009, 2010 Free Software Foundation, Inc.
8 ;; This file is part of GNU Emacs.
10 ;; GNU Emacs is free software: you can redistribute it and/or modify
11 ;; it under the terms of the GNU General Public License as published by
12 ;; the Free Software Foundation, either version 3 of the License, or
13 ;; (at your option) any later version.
15 ;; GNU Emacs is distributed in the hope that it will be useful,
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;; GNU General Public License for more details.
20 ;; You should have received a copy of the GNU General Public License
21 ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
25 ;; This implements an XML 1.0 parser. It also implements the XML
26 ;; Namespaces Recommendation. It is designed to be conforming, but it
27 ;; works a bit differently from a normal XML parser. An XML document
28 ;; consists of the prolog and an instance. The prolog is parsed as a
29 ;; single unit using `xmltok-forward-prolog'. The instance is
30 ;; considered as a sequence of tokens, where a token is something like
31 ;; a start-tag, a comment, a chunk of data or a CDATA section. The
32 ;; tokenization of the instance is stateless: the tokenization of one
33 ;; part of the instance does not depend on tokenization of the
34 ;; preceding part of the instance. This allows the instance to be
35 ;; parsed incrementally. The main entry point is `xmltok-forward':
36 ;; this can be called at any point in the instance provided it is
37 ;; between tokens. The other entry point is `xmltok-forward-special'
38 ;; which skips over tokens other comments, processing instructions or
39 ;; CDATA sections (i.e. the constructs in an instance that can contain
40 ;; less than signs that don't start a token).
42 ;; This is a non-validating XML 1.0 processor. It does not resolve
43 ;; parameter entities (including the external DTD subset) and it does
44 ;; not resolve external general entities.
46 ;; It is non-conformant by design in the following respects.
48 ;; 1. It expects the client to detect aspects of well-formedness that
49 ;; are not internal to a single token, specifically checking that
50 ;; end-tags match start-tags and that the instance contains exactly
53 ;; 2. It expects the client to detect duplicate attributes. Detection
54 ;; of duplicate attributes after expansion of namespace prefixes
55 ;; requires the namespace processing state. Detection of duplicate
56 ;; attributes before expansion of namespace prefixes does not, but is
57 ;; redundant given that the client will do detection of duplicate
58 ;; attributes after expansion of namespace prefixes.
60 ;; 3. It allows the client to recover from well-formedness errors.
61 ;; This is essential for use in applications where the document is
62 ;; being parsed during the editing process.
64 ;; 4. It does not support documents that do not conform to the lexical
65 ;; requirements of the XML Namespaces Recommendation (e.g. a document
66 ;; with a colon in an entity name).
68 ;; There are also a number of things that have not yet been
69 ;; implemented that make it non-conformant.
71 ;; 1. It does not implement default attributes. ATTLIST declarations
72 ;; are parsed, but no checking is done on the content of attribute
73 ;; value literals specifying default attribute values, and default
74 ;; attribute values are not reported to the client.
76 ;; 2. It does not implement internal entities containing elements. If
77 ;; an internal entity is referenced and parsing its replacement text
78 ;; yields one or more tags, then it will skip the reference and
79 ;; report this to the client.
81 ;; 3. It does not check the syntax of public identifiers in the DTD.
83 ;; 4. It allows some non-ASCII characters in certain situations where
84 ;; it should not. For example, it only enforces XML 1.0's
85 ;; restrictions on name characters strictly for ASCII characters. The
86 ;; problem here is XML's character model is based squarely on Unicode,
87 ;; whereas Emacs's is not (as of version 21). It is not clear what
88 ;; the right thing to do is.
92 (defvar xmltok-type nil
)
93 (defvar xmltok-start nil
)
94 (defvar xmltok-name-colon nil
)
95 (defvar xmltok-name-end nil
)
96 (defvar xmltok-replacement nil
97 "String containing replacement for a character or entity reference.")
99 (defvar xmltok-attributes nil
100 "List containing attributes of last scanned element.
101 Each member of the list is a vector representing an attribute, which
102 can be accessed using the functions `xmltok-attribute-name-start',
103 `xmltok-attribute-name-colon', `xmltok-attribute-name-end',
104 `xmltok-attribute-value-start', `xmltok-attribute-value-end',
105 `xmltok-attribute-raw-normalized-value', `xmltok-attribute-refs'.")
107 (defvar xmltok-namespace-attributes nil
108 "List containing namespace declarations of last scanned element.
109 List has same format as `xmltok-attributes'.")
111 (defvar xmltok-dtd nil
112 "Information about the DTD used by `xmltok-forward'.
113 `xmltok-forward-prolog' sets this up.
115 It consists of an alist of general entity names vs definitions. The
116 first member of the alist is t if references to entities not in the
117 alist are well-formed \(e.g. because there's an external subset that
120 Each general entity name is a string. The definition is either nil,
121 a symbol, a string, a cons cell. If the definition is nil, then it
122 means that it's an internal entity but the result of parsing it is
123 unknown. If it is a symbol, then the symbol is either `unparsed',
124 meaning the entity is an unparsed entity, `external', meaning the
125 entity is or references an external entity, `element', meaning the
126 entity includes one or more elements, or `not-well-formed', meaning
127 the replacement text is not well-formed. If the definition is a
128 string, then the replacement text of the entity is that string; this
129 happens only during the parsing of the prolog. If the definition is
130 a cons cell \(ER . AR), then ER specifies the string that results
131 from referencing the entity in element content and AR is either nil,
132 meaning the replacement text included a <, or a string which is the
133 normalized attribute value.")
135 (defvar xmltok-dependent-regions nil
136 "List of descriptors of regions that a parsed token depends on.
138 A token depends on a region if the region occurs after the token and a
139 change in the region may require the token to be reparsed. This only
140 happens with markup that is not well-formed. For example, if a <?
141 occurs without a matching ?>, then the <? is returned as a
142 not-well-formed token. However, this token is dependent on region
143 from the end of the token to the end of the buffer: if this ever
144 contains ?> then the buffer must be reparsed from the <?.
146 A region descriptor is a list (FUN START END ARG ...), where FUN is a
147 function to be called when the region changes, START and END are
148 integers giving the start and end of the region, and ARG... are
149 additional arguments to be passed to FUN. FUN will be called with 5
150 arguments followed by the additional arguments if any: the position of
151 the start of the changed area in the region, the position of the end
152 of the changed area in the region, the length of the changed area
153 before the change, the position of the start of the region, the
154 position of the end of the region. FUN must return non-nil if the
155 region needs reparsing. FUN will be called in a `save-excursion'
156 with match-data saved.
158 `xmltok-forward', `xmltok-forward-special' and `xmltok-forward-prolog'
159 may add entries to the beginning of this list, but will not clear it.
160 `xmltok-forward' and `xmltok-forward-special' will only add entries
161 when returning tokens of type not-well-formed.")
163 (defvar xmltok-errors nil
164 "List of errors detected by `xmltok-forward' and `xmltok-forward-prolog'.
165 When `xmltok-forward' and `xmltok-forward-prolog' detect a
166 well-formedness error, they will add an entry to the beginning of this
167 list. Each entry is a vector [MESSAGE START END], where MESSAGE is a
168 string giving the error message and START and END are integers
169 indicating the position of the error.")
171 (defmacro xmltok-save
(&rest body
)
178 xmltok-namespace-attributes
179 xmltok-dependent-regions
183 (put 'xmltok-save
'lisp-indent-function
0)
184 (def-edebug-spec xmltok-save t
)
186 (defsubst xmltok-attribute-name-start
(att)
189 (defsubst xmltok-attribute-name-colon
(att)
192 (defsubst xmltok-attribute-name-end
(att)
195 (defsubst xmltok-attribute-value-start
(att)
198 (defsubst xmltok-attribute-value-end
(att)
201 (defsubst xmltok-attribute-raw-normalized-value
(att)
202 "Return an object representing the normalized value of ATT.
203 This can be t indicating that the normalized value is the same as
204 the buffer substring from the start to the end of the value, or nil
205 indicating that the value is not well-formed or a string."
208 (defsubst xmltok-attribute-refs
(att)
209 "Return a list of the entity and character references in ATT.
210 Each member is a vector [TYPE START END] where TYPE is either char-ref
211 or entity-ref and START and END are integers giving the start and end of
212 the reference. Nested entity references are not included in the list."
215 (defun xmltok-attribute-prefix (att)
216 (let ((colon (xmltok-attribute-name-colon att
)))
218 (buffer-substring-no-properties (xmltok-attribute-name-start att
)
221 (defun xmltok-attribute-local-name (att)
222 (let ((colon (xmltok-attribute-name-colon att
)))
223 (buffer-substring-no-properties (if colon
225 (xmltok-attribute-name-start att
))
226 (xmltok-attribute-name-end att
))))
228 (defun xmltok-attribute-value (att)
229 (let ((rnv (xmltok-attribute-raw-normalized-value att
)))
233 (buffer-substring-no-properties (xmltok-attribute-value-start att
)
234 (xmltok-attribute-value-end att
))))))
236 (defun xmltok-start-tag-prefix ()
237 (and xmltok-name-colon
238 (buffer-substring-no-properties (1+ xmltok-start
)
241 (defun xmltok-start-tag-local-name ()
242 (buffer-substring-no-properties (1+ (or xmltok-name-colon
246 (defun xmltok-end-tag-prefix ()
247 (and xmltok-name-colon
248 (buffer-substring-no-properties (+ 2 xmltok-start
)
251 (defun xmltok-end-tag-local-name ()
252 (buffer-substring-no-properties (if xmltok-name-colon
253 (1+ xmltok-name-colon
)
257 (defun xmltok-start-tag-qname ()
258 (buffer-substring-no-properties (+ xmltok-start
1) xmltok-name-end
))
260 (defun xmltok-end-tag-qname ()
261 (buffer-substring-no-properties (+ xmltok-start
2) xmltok-name-end
))
263 (defsubst xmltok-make-attribute
(name-begin
269 raw-normalized-value
)
271 RAW-NORMALIZED-VALUE is nil if the value is not well-formed,
272 t if the normalized value is the string between VALUE-BEGIN
273 and VALUE-END, otherwise a STRING giving the value."
282 (defsubst xmltok-error-message
(err)
285 (defsubst xmltok-error-start
(err)
288 (defsubst xmltok-error-end
(err)
291 (defsubst xmltok-make-error
(message start end
)
292 (vector message start end
))
294 (defun xmltok-add-error (message &optional start end
)
296 (cons (xmltok-make-error message
297 (or start xmltok-start
)
301 (defun xmltok-add-dependent (fun &optional start end
&rest args
)
302 (setq xmltok-dependent-regions
304 (cons (or start xmltok-start
)
305 (cons (or end
(point-max))
307 xmltok-dependent-regions
)))
309 (defun xmltok-forward ()
310 (setq xmltok-start
(point))
311 (let* ((case-fold-search nil
)
312 (space-count (skip-chars-forward " \t\r\n"))
315 (cond ((> space-count
0)
316 (setq xmltok-type
'space
))
319 (xmltok-scan-after-lt))))
321 (cond ((> space-count
0)
322 (setq xmltok-type
'space
))
325 (xmltok-scan-after-amp 'xmltok-handle-entity
))))
326 ((re-search-forward "[<&]\\|\\(]]>\\)" nil t
)
327 (cond ((not (match-beginning 1))
328 (goto-char (match-beginning 0))
329 ;; must have got a non-space char
330 (setq xmltok-type
'data
))
331 ((= (match-beginning 1) xmltok-start
)
332 (xmltok-add-error "Found `]]>' not closing a CDATA section")
333 (setq xmltok-type
'not-well-formed
))
335 (goto-char (match-beginning 0))
337 (if (= (point) (+ xmltok-start space-count
))
342 (if (> space-count
0)
346 (goto-char (point-max))
347 (setq xmltok-type
'data
)))))
349 (defun xmltok-forward-special (bound)
350 "Scan forward past the first special token starting at or after point.
351 Return nil if there is no special token that starts before BOUND.
352 CDATA sections, processing instructions and comments (and indeed
353 anything starting with < following by ? or !) count as special.
354 Return the type of the token."
355 (when (re-search-forward "<[?!]" (1+ bound
) t
)
356 (setq xmltok-start
(match-beginning 0))
357 (goto-char (1+ xmltok-start
))
358 (let ((case-fold-search nil
))
359 (xmltok-scan-after-lt))))
363 ;; A symbolic regexp is represented by a list whose CAR is the string
364 ;; containing the regexp and whose cdr is a list of symbolic names
365 ;; for the groups in the string.
367 ;; Construct a symbolic regexp from a regexp.
368 (defun xmltok-r (str)
371 ;; Concatenate zero of more regexps and symbolic regexps.
372 (defun xmltok+ (&rest args
)
375 (let ((arg (car args
)))
377 (setq strs
(cons arg strs
))
378 (setq strs
(cons (car arg
) strs
))
379 (setq names
(cons (cdr arg
) names
)))
380 (setq args
(cdr args
))))
381 (cons (apply 'concat
(nreverse strs
))
382 (apply 'append
(nreverse names
))))))
385 ;; Make a symbolic group named NAME from the regexp R.
386 ;; R may be a symbolic regexp or an ordinary regexp.
387 (defmacro xmltok-g
(name &rest r
)
388 (let ((sym (make-symbol "r")))
389 `(let ((,sym
(xmltok+ ,@r
)))
391 (cons (concat "\\(" ,sym
"\\)") (cons ',name nil
))
392 (cons (concat "\\(" (car ,sym
) "\\)") (cons ',name
(cdr ,sym
)))))))
394 (defun xmltok-p (&rest r
) (xmltok+ "\\(?:"
398 ;; Get the group index of ELEM in a LIST of symbols.
399 (defun xmltok-get-index (elem list
)
401 (error "Missing group name"))
405 (cond ((eq elem
(car list
))
410 (setq list
(cdr list
)))))
412 (error "Bad group name %s" elem
))))
414 ;; Define a macro SYM using a symbolic regexp R.
415 ;; SYM can be called in three ways:
417 ;; expands to the regexp in R
420 ;; (match-beginning N)
421 ;; where N is the group index of G in R.
425 ;; where N is the group index of G in R.
426 (defmacro xmltok-defregexp
(sym r
)
429 `(macro lambda
(action &optional group-name
)
430 (cond ((eq action
'regexp
)
432 ((or (eq action
'start
) (eq action
'beginning
))
433 (list 'match-beginning
(xmltok-get-index group-name
436 (list 'match-end
(xmltok-get-index group-name
440 (xmltok-get-index group-name
',(cdr r
))))
441 ((eq action
'string-no-properties
)
442 (list 'match-string-no-properties
443 (xmltok-get-index group-name
',(cdr r
))))
444 (t (error "Invalid action: %s" action
))))))))
452 (name-start-char "[_[:alpha:]]")
453 (name-continue-not-start-char "[-.[:digit:]]")
454 (name-continue-char "[-._[:alnum:]]")
462 (ncname (concat name-start-char name-continue-char
*))
464 (xmltok+ (xmltok-g entity-name ncname
)
465 (xmltok-g entity-ref-close
";") opt
))
467 (xmltok+ (xmltok-g decimal
"[0-9]" +)
468 (xmltok-g decimal-ref-close
";") opt
))
471 (xmltok-g hex
"[0-9a-fA-F]" +)
472 (xmltok-g hex-ref-close
";") opt
475 (xmltok+ (xmltok-g number-sign
"#")
476 open decimal-ref or hex-ref close opt
))
478 (xmltok+ open
(xmltok-g start-tag-close s
* ">")
479 or open
(xmltok-g empty-tag-slash s
* "/")
480 (xmltok-g empty-tag-close
">") opt close
481 or
(xmltok-g start-tag-s s
+)
484 (xmltok+ (xmltok-g start-tag-name
485 ncname
(xmltok-g start-tag-colon
":" ncname
) opt
)
486 start-tag-close opt
))
488 (xmltok+ (xmltok-g end-tag-slash
"/")
489 open
(xmltok-g end-tag-name
491 (xmltok-g end-tag-colon
":" ncname
) opt
)
492 (xmltok-g end-tag-close s
* ">") opt
495 (xmltok+ (xmltok-g markup-declaration
"!")
496 (xmltok-g comment-first-dash
"-"
497 (xmltok-g comment-open
"-") opt
) opt
))
500 (xmltok-g marked-section-open
"\\[")
506 (xmltok-g cdata-section-open
"\\[" ) opt
512 (processing-instruction
513 (xmltok-g processing-instruction-question question
)))
515 (xmltok-defregexp xmltok-ncname
(xmltok+ open ncname close
))
517 (xmltok-defregexp xmltok-after-amp
518 (xmltok+ entity-ref or char-ref
))
519 (xmltok-defregexp xmltok-after-lt
522 ;; cdata-section must come before comment
523 ;; because we treat <! as a comment
524 ;; and Emacs doesn't do fully greedy matching
528 or processing-instruction
))
534 (xmltok-g complex1
"[&\r\n\t][^<']*") opt
536 (lit2 (cons (replace-regexp-in-string "'" "\"" (car lit1
))
538 (literal (xmltok-g literal lit1 or lit2
))
539 (name (xmltok+ open
(xmltok-g xmlns
"xmlns") or ncname close
540 (xmltok-g colon
":" ncname
) opt
)))
541 (xmltok+ (xmltok-g name name
)
543 ;; If the literal isn't followed by what it should be,
544 ;; then the closing delimiter is probably really the
545 ;; opening delimiter of another literal, so don't
546 ;; absorb the literal in this case.
547 open s
* literal start-tag-close close opt
)))
549 xmltok-xml-declaration
550 (let* ((literal-content "[-._:a-zA-Z0-9]+")
552 (concat open
"\"" literal-content
"\""
553 or
"'" literal-content
"'" close
))
556 s
+ (xmltok-g version-name
"version")
558 s
* (xmltok-g version-value literal
)
562 s
+ (xmltok-g encoding-name
"encoding")
564 s
* (xmltok-g encoding-value literal
)
567 (concat open
"yes" or
"no" close
))
570 s
+ (xmltok-g standalone-name
"standalone")
572 s
* (xmltok-g standalone-value
573 "\"" yes-no
"\"" or
"'" yes-no
"'")
575 (xmltok+ "<" question
"xml"
582 (let* ((single-char (xmltok-g single-char
"[[|,(\"'>]"))
583 (internal-subset-close (xmltok-g internal-subset-close
585 (starts-with-close-paren
586 (xmltok-g close-paren
589 (xmltok-g close-paren-occur
"[+?]")
591 (xmltok-g close-paren-star
"\\*"))
595 "%" (xmltok-g param-entity-ref
597 (xmltok-g param-entity-ref-close
599 (starts-with-nmtoken-not-name
601 (xmltok-p name-continue-not-start-char or
":")
602 (xmltok-p name-continue-char or
":") *))
605 (xmltok-p name-continue-not-start-char or
":")
606 (xmltok-p name-continue-char or
":") *
611 (xmltok-p name-continue-char or
":") *))
613 (xmltok+ (xmltok-g ncname-nmtoken
614 ":" (xmltok-p nmtoken-after-colon
))
615 or
(xmltok-p (xmltok-g colon
":" ncname
)
616 (xmltok-g colon-name-occur
"[?+*]") opt
)
617 or
(xmltok-g ncname-occur
"[?+*]")
618 or
(xmltok-g ncname-colon
":")))
620 (xmltok-g name ncname
(xmltok-p after-ncname
) opt
))
623 "#" (xmltok-g hash-name ncname
)))
625 (xmltok-g markup-declaration
626 "!" (xmltok-p (xmltok-g comment-first-dash
"-"
627 (xmltok-g comment-open
"-") opt
)
628 or
(xmltok-g named-markup-declaration
631 (xmltok+ markup-declaration
632 or
(xmltok-g processing-instruction-question
634 or
(xmltok-g instance-start
636 (starts-with-lt (xmltok-g less-than
"<" (xmltok-p after-lt
) opt
)))
637 (xmltok+ starts-with-lt
639 or starts-with-close-paren
640 or starts-with-percent
642 or starts-with-nmtoken-not-name
644 or internal-subset-close
)))))
646 (defconst xmltok-ncname-regexp
(xmltok-ncname regexp
))
648 (defun xmltok-scan-after-lt ()
649 (cond ((not (looking-at (xmltok-after-lt regexp
)))
650 (xmltok-add-error "`<' that is not markup must be entered as `<'")
651 (setq xmltok-type
'not-well-formed
))
653 (goto-char (match-end 0))
654 (cond ((xmltok-after-lt start start-tag-close
)
655 (setq xmltok-name-end
656 (xmltok-after-lt end start-tag-name
))
657 (setq xmltok-name-colon
658 (xmltok-after-lt start start-tag-colon
))
659 (setq xmltok-attributes nil
)
660 (setq xmltok-namespace-attributes nil
)
661 (setq xmltok-type
'start-tag
))
662 ((xmltok-after-lt start end-tag-close
)
663 (setq xmltok-name-end
664 (xmltok-after-lt end end-tag-name
))
665 (setq xmltok-name-colon
666 (xmltok-after-lt start end-tag-colon
))
667 (setq xmltok-type
'end-tag
))
668 ((xmltok-after-lt start start-tag-s
)
669 (setq xmltok-name-end
670 (xmltok-after-lt end start-tag-name
))
671 (setq xmltok-name-colon
672 (xmltok-after-lt start start-tag-colon
))
673 (setq xmltok-namespace-attributes nil
)
674 (setq xmltok-attributes nil
)
675 (xmltok-scan-attributes)
677 ((xmltok-after-lt start empty-tag-close
)
678 (setq xmltok-name-end
679 (xmltok-after-lt end start-tag-name
))
680 (setq xmltok-name-colon
681 (xmltok-after-lt start start-tag-colon
))
682 (setq xmltok-attributes nil
)
683 (setq xmltok-namespace-attributes nil
)
684 (setq xmltok-type
'empty-element
))
685 ((xmltok-after-lt start cdata-section-open
)
687 (if (search-forward "]]>" nil t
)
689 (xmltok-add-error "No closing ]]>")
690 (xmltok-add-dependent 'xmltok-unclosed-reparse-p
695 ((xmltok-after-lt start processing-instruction-question
)
696 (xmltok-scan-after-processing-instruction-open))
697 ((xmltok-after-lt start comment-open
)
698 (xmltok-scan-after-comment-open))
699 ((xmltok-after-lt start empty-tag-slash
)
700 (setq xmltok-name-end
701 (xmltok-after-lt end start-tag-name
))
702 (setq xmltok-name-colon
703 (xmltok-after-lt start start-tag-colon
))
704 (setq xmltok-attributes nil
)
705 (setq xmltok-namespace-attributes nil
)
706 (xmltok-add-error "Expected `/>'" (1- (point)))
707 (setq xmltok-type
'partial-empty-element
))
708 ((xmltok-after-lt start start-tag-name
)
709 (xmltok-add-error "Missing `>'"
712 (setq xmltok-name-end
713 (xmltok-after-lt end start-tag-name
))
714 (setq xmltok-name-colon
715 (xmltok-after-lt start start-tag-colon
))
716 (setq xmltok-namespace-attributes nil
)
717 (setq xmltok-attributes nil
)
718 (setq xmltok-type
'partial-start-tag
))
719 ((xmltok-after-lt start end-tag-name
)
720 (setq xmltok-name-end
(xmltok-after-lt end end-tag-name
))
721 (setq xmltok-name-colon
722 (xmltok-after-lt start end-tag-colon
))
723 (cond ((and (not xmltok-name-colon
)
724 (eq (char-after) ?
:))
725 (goto-char (1+ (point)))
726 (xmltok-add-error "Expected name following `:'"
729 (xmltok-add-error "Missing `>'"
732 (setq xmltok-type
'partial-end-tag
))
733 ((xmltok-after-lt start end-tag-slash
)
734 (xmltok-add-error "Expected name following `</'")
735 (setq xmltok-name-end nil
)
736 (setq xmltok-name-colon nil
)
737 (setq xmltok-type
'partial-end-tag
))
738 ((xmltok-after-lt start marked-section-open
)
739 (xmltok-add-error "Expected `CDATA[' after `<!['"
742 (setq xmltok-type
'not-well-formed
))
743 ((xmltok-after-lt start comment-first-dash
)
744 (xmltok-add-error "Expected `-' after `<!-'"
747 (setq xmltok-type
'not-well-formed
))
748 ((xmltok-after-lt start markup-declaration
)
749 (xmltok-add-error "Expected `[CDATA[' or `--' after `<!'"
752 (setq xmltok-type
'not-well-formed
))
754 (xmltok-add-error "Not well-formed")
755 (setq xmltok-type
'not-well-formed
))))))
757 ;; XXX This should be unified with
758 ;; xmltok-scan-prolog-after-processing-instruction-open
759 ;; XXX maybe should include rest of line (up to any <,>) in unclosed PI
760 (defun xmltok-scan-after-processing-instruction-open ()
761 (cond ((not (search-forward "?>" nil t
))
762 (xmltok-add-error "No closing ?>"
765 (xmltok-add-dependent 'xmltok-unclosed-reparse-p
769 (setq xmltok-type
'not-well-formed
))
771 (cond ((not (save-excursion
772 (goto-char (+ 2 xmltok-start
))
773 (and (looking-at (xmltok-ncname regexp
))
774 (setq xmltok-name-end
(match-end 0)))))
775 (setq xmltok-name-end
(+ xmltok-start
2))
776 (xmltok-add-error "<? not followed by name"
779 ((not (or (memq (char-after xmltok-name-end
)
781 (= xmltok-name-end
(- (point) 2))))
782 (xmltok-add-error "Target not followed by whitespace"
784 (1+ xmltok-name-end
)))
785 ((and (= xmltok-name-end
(+ xmltok-start
5))
787 (goto-char (+ xmltok-start
2))
788 (let ((case-fold-search t
))
789 (looking-at "xml"))))
790 (xmltok-add-error "Processing instruction target is xml"
792 (+ xmltok-start
5))))
793 (setq xmltok-type
'processing-instruction
))))
795 (defun xmltok-scan-after-comment-open ()
797 (cond ((not (search-forward "--" nil t
))
798 (xmltok-add-error "No closing -->")
799 (xmltok-add-dependent 'xmltok-unclosed-reparse-p
804 ;; in comments in XML
807 ((eq (char-after) ?
>)
808 (goto-char (1+ (point)))
811 (xmltok-add-dependent
812 'xmltok-semi-closed-reparse-p
817 ;; just include the <!-- in the token
818 (goto-char (+ xmltok-start
4))
819 ;; Need do this after the goto-char because
820 ;; marked error should just apply to <!--
821 (xmltok-add-error "First following `--' not followed by `>'")
824 (defun xmltok-scan-attributes ()
825 (let ((recovering nil
)
826 (atts-needing-normalization nil
))
827 (while (cond ((or (looking-at (xmltok-attribute regexp
))
828 ;; use non-greedy group
829 (when (looking-at (concat "[^<>\n]+?"
830 (xmltok-attribute regexp
)))
832 (xmltok-add-error "Malformed attribute"
835 (goto-char (xmltok-attribute start
837 (skip-chars-backward "\r\n\t ")
840 (setq recovering nil
)
841 (goto-char (match-end 0))
842 (let ((att (xmltok-add-attribute)))
844 (setq atts-needing-normalization
845 (cons att atts-needing-normalization
))))
846 (cond ((xmltok-attribute start start-tag-s
) t
)
847 ((xmltok-attribute start start-tag-close
)
848 (setq xmltok-type
'start-tag
)
850 ((xmltok-attribute start empty-tag-close
)
851 (setq xmltok-type
'empty-element
)
853 ((xmltok-attribute start empty-tag-slash
)
854 (setq xmltok-type
'partial-empty-element
)
855 (xmltok-add-error "Expected `/>'"
858 ((looking-at "[ \t\r\n]*[\"']")
859 (goto-char (match-end 0))
860 (xmltok-add-error "Missing closing delimiter"
864 ((looking-at "[ \t]*\\([^ \t\r\n\"'=<>/]+\\)[ \t\r\n/>]")
865 (goto-char (match-end 1))
866 (xmltok-add-error "Attribute value not quoted"
871 (xmltok-add-error "Missing attribute value"
875 ((looking-at "[^<>\n]*/>")
876 (let ((start (point)))
877 (goto-char (match-end 0))
879 (xmltok-add-error "Malformed empty-element"
882 (setq xmltok-type
'empty-element
)
884 ((looking-at "[^<>\n]*>")
885 (let ((start (point)))
886 (goto-char (match-end 0))
888 (xmltok-add-error "Malformed start-tag"
891 (setq xmltok-type
'start-tag
)
895 (skip-chars-forward "^<>\n"))
896 (xmltok-add-error "Missing `>'"
899 (setq xmltok-type
'partial-start-tag
)
901 (while atts-needing-normalization
902 (xmltok-normalize-attribute (car atts-needing-normalization
))
903 (setq atts-needing-normalization
(cdr atts-needing-normalization
))))
904 (setq xmltok-attributes
905 (nreverse xmltok-attributes
))
906 (setq xmltok-namespace-attributes
907 (nreverse xmltok-namespace-attributes
)))
909 (defun xmltok-add-attribute ()
910 "Return the attribute if it needs normalizing, otherwise nil."
911 (let* ((needs-normalizing nil
)
913 (if (xmltok-attribute start literal
)
915 (setq needs-normalizing
916 (or (xmltok-attribute start complex1
)
917 (xmltok-attribute start complex2
)))
918 (xmltok-make-attribute (xmltok-attribute start name
)
919 (xmltok-attribute start colon
)
920 (xmltok-attribute end name
)
921 (1+ (xmltok-attribute start literal
))
922 (1- (xmltok-attribute end literal
))
923 (not needs-normalizing
)))
924 (xmltok-make-attribute (xmltok-attribute start name
)
925 (xmltok-attribute start colon
)
926 (xmltok-attribute end name
)))))
927 (if (xmltok-attribute start xmlns
)
928 (setq xmltok-namespace-attributes
929 (cons att xmltok-namespace-attributes
))
930 (setq xmltok-attributes
931 (cons att xmltok-attributes
)))
932 (and needs-normalizing
935 (defun xmltok-normalize-attribute (att)
936 (let ((end (xmltok-attribute-value-end att
))
941 (goto-char (xmltok-attribute-value-start att
))
943 (let ((n (skip-chars-forward "^\r\t\n&" end
)))
946 (cons (buffer-substring-no-properties (- (point) n
)
949 (when (< (point) end
)
950 (goto-char (1+ (point)))
951 (cond ((eq (char-before) ?\
&)
952 (let ((xmltok-start (1- (point)))
953 xmltok-type xmltok-replacement
)
954 (xmltok-scan-after-amp
956 (xmltok-handle-entity start end t
)))
957 (cond ((or (eq xmltok-type
'char-ref
)
958 (eq xmltok-type
'entity-ref
))
960 (cons (vector xmltok-type
964 (if xmltok-replacement
966 (cons xmltok-replacement
968 (setq well-formed nil
)))
969 (t (setq well-formed nil
)))))
971 (cons " " value-parts
)))))
974 (aset att
5 (apply 'concat
(nreverse value-parts
))))
975 (aset att
6 (nreverse refs
))))
977 (defun xmltok-scan-after-amp (entity-handler)
978 (cond ((not (looking-at (xmltok-after-amp regexp
)))
979 (xmltok-add-error "`&' that is not markup must be entered as `&'")
980 (setq xmltok-type
'not-well-formed
))
982 (goto-char (match-end 0))
983 (cond ((xmltok-after-amp start entity-ref-close
)
984 (funcall entity-handler
985 (xmltok-after-amp start entity-name
)
986 (xmltok-after-amp end entity-name
))
987 (setq xmltok-type
'entity-ref
))
988 ((xmltok-after-amp start decimal-ref-close
)
989 (xmltok-scan-char-ref (xmltok-after-amp start decimal
)
990 (xmltok-after-amp end decimal
)
992 ((xmltok-after-amp start hex-ref-close
)
993 (xmltok-scan-char-ref (xmltok-after-amp start hex
)
994 (xmltok-after-amp end hex
)
996 ((xmltok-after-amp start number-sign
)
997 (xmltok-add-error "Missing character number")
998 (setq xmltok-type
'not-well-formed
))
1000 (xmltok-add-error "Missing closing `;'")
1001 (setq xmltok-type
'not-well-formed
))))))
1003 (defconst xmltok-entity-error-messages
1004 '((unparsed .
"Referenced entity is unparsed")
1005 (not-well-formed .
"Referenced entity is not well-formed")
1006 (external nil .
"Referenced entity is external")
1007 (element nil .
"Referenced entity contains <")))
1009 (defun xmltok-handle-entity (start end
&optional attributep
)
1010 (let* ((name (buffer-substring-no-properties start end
))
1011 (name-def (assoc name xmltok-dtd
))
1012 (def (cdr name-def
)))
1013 (cond ((setq xmltok-replacement
(and (consp def
)
1018 (unless (eq (car xmltok-dtd
) t
)
1019 (xmltok-add-error "Referenced entity has not been defined"
1022 ((and attributep
(consp def
))
1023 (xmltok-add-error "Referenced entity contains <"
1027 (let ((err (cdr (assq def xmltok-entity-error-messages
))))
1029 (setq err
(if attributep
(cdr err
) (car err
))))
1031 (xmltok-add-error err start end
)))))))
1033 (defun xmltok-scan-char-ref (start end base
)
1034 (setq xmltok-replacement
1035 (let ((n (string-to-number (buffer-substring-no-properties start end
)
1037 (cond ((and (integerp n
) (xmltok-valid-char-p n
))
1038 (setq n
(xmltok-unicode-to-char n
))
1041 (xmltok-add-error "Invalid character code" start end
)
1043 (setq xmltok-type
'char-ref
))
1045 (defun xmltok-char-number (start end
)
1046 (let* ((base (if (eq (char-after (+ start
2)) ?x
)
1049 (n (string-to-number
1050 (buffer-substring-no-properties (+ start
(if (= base
16) 3 2))
1054 (xmltok-valid-char-p n
)
1057 (defun xmltok-unclosed-reparse-p (change-start
1063 (let ((len-1 (1- (length delimiter
))))
1064 (goto-char (max start
(- change-start len-1
)))
1065 (search-forward delimiter
(min end
(+ change-end len-1
)) t
)))
1067 ;; Handles a <!-- with the next -- not followed by >
1069 (defun xmltok-semi-closed-reparse-p (change-start
1076 (or (<= (- end delimiter-length
) change-end
)
1077 (xmltok-unclosed-reparse-p change-start
1084 (defun xmltok-valid-char-p (n)
1085 "Return non-nil if N is the Unicode code of a valid XML character."
1086 (cond ((< n
#x20
) (memq n
'(#xA
#xD
#x9
)))
1090 (t (and (> n
#xFFFF
)
1093 (defun xmltok-unicode-to-char (n)
1094 "Return the character corresponding to Unicode scalar value N.
1095 Return nil if unsupported in Emacs."
1096 (decode-char 'ucs n
))
1100 (defvar xmltok-contains-doctype nil
)
1101 (defvar xmltok-doctype-external-subset-flag nil
)
1102 (defvar xmltok-internal-subset-start nil
)
1103 (defvar xmltok-had-param-entity-ref nil
)
1104 (defvar xmltok-prolog-regions nil
)
1105 (defvar xmltok-standalone nil
1106 "Non-nil if there was an XML declaration specifying standalone=\"yes\".")
1107 (defvar xmltok-markup-declaration-doctype-flag nil
)
1109 (defconst xmltok-predefined-entity-alist
1114 ("quot" "\"" .
"\"")))
1116 (defun xmltok-forward-prolog ()
1117 "Move forward to the end of the XML prolog.
1119 Returns a list of vectors [TYPE START END] where TYPE is a symbol and
1120 START and END are integers giving the start and end of the region of
1121 that type. TYPE can be one of xml-declaration,
1122 xml-declaration-attribute-name, xml-declaration-attribute-value,
1123 comment, processing-instruction-left, processing-instruction-right,
1124 markup-declaration-open, markup-declaration-close,
1125 internal-subset-open, internal-subset-close, hash-name, keyword,
1126 literal, encoding-name.
1127 Adds to `xmltok-errors' and `xmltok-dependent-regions' as appropriate."
1128 (let ((case-fold-search nil
)
1131 xmltok-prolog-regions
1132 xmltok-contains-doctype
1133 xmltok-internal-subset-start
1134 xmltok-had-param-entity-ref
1136 xmltok-doctype-external-subset-flag
1137 xmltok-markup-declaration-doctype-flag
)
1138 (setq xmltok-dtd xmltok-predefined-entity-alist
)
1139 (xmltok-scan-xml-declaration)
1140 (xmltok-next-prolog-token)
1141 (while (condition-case err
1142 (when (xmltok-parse-prolog-item)
1143 (xmltok-next-prolog-token))
1144 (xmltok-markup-declaration-parse-error
1145 (xmltok-skip-markup-declaration))))
1146 (when xmltok-internal-subset-start
1147 (xmltok-add-error "No closing ]"
1148 (1- xmltok-internal-subset-start
)
1149 xmltok-internal-subset-start
))
1150 (xmltok-parse-entities)
1151 ;; XXX prune dependent-regions for those entirely in prolog
1152 (nreverse xmltok-prolog-regions
)))
1154 (defconst xmltok-bad-xml-decl-regexp
1155 "[ \t\r\n]*<\\?xml\\(?:[ \t\r\n]\\|\\?>\\)")
1158 (defun xmltok-get-declared-encoding-position (&optional limit
)
1159 "Return the position of the encoding in the XML declaration at point.
1160 If there is a well-formed XML declaration starting at point and it
1161 contains an encoding declaration, then return (START . END)
1162 where START and END are the positions of the start and the end
1163 of the encoding name; if there is no encoding declaration return
1164 the position where and encoding declaration could be inserted.
1165 If there is XML that is not well-formed that looks like an XML
1166 declaration, return nil. Otherwise, return t.
1167 If LIMIT is non-nil, then do not consider characters beyond LIMIT."
1168 (cond ((let ((case-fold-search nil
))
1169 (and (looking-at (xmltok-xml-declaration regexp
))
1170 (or (not limit
) (<= (match-end 0) limit
))))
1171 (let ((end (xmltok-xml-declaration end encoding-value
)))
1173 (cons (1+ (xmltok-xml-declaration start encoding-value
))
1175 (or (xmltok-xml-declaration end version-value
)
1177 ((not (let ((case-fold-search t
))
1178 (looking-at xmltok-bad-xml-decl-regexp
))))))
1180 (defun xmltok-scan-xml-declaration ()
1181 (when (looking-at (xmltok-xml-declaration regexp
))
1182 (xmltok-add-prolog-region 'xml-declaration
(point) (match-end 0))
1183 (goto-char (match-end 0))
1184 (when (xmltok-xml-declaration start version-name
)
1185 (xmltok-add-prolog-region 'xml-declaration-attribute-name
1186 (xmltok-xml-declaration start version-name
)
1187 (xmltok-xml-declaration end version-name
))
1188 (let ((start (xmltok-xml-declaration start version-value
))
1189 (end (xmltok-xml-declaration end version-value
)))
1190 (xmltok-add-prolog-region 'xml-declaration-attribute-value
1193 ;; XXX need to check encoding name
1194 ;; Should start with letter, not contain colon
1195 (when (xmltok-xml-declaration start encoding-name
)
1196 (xmltok-add-prolog-region 'xml-declaration-attribute-name
1197 (xmltok-xml-declaration start encoding-name
)
1198 (xmltok-xml-declaration end encoding-name
))
1199 (let ((start (xmltok-xml-declaration start encoding-value
))
1200 (end (xmltok-xml-declaration end encoding-value
)))
1201 (xmltok-add-prolog-region 'encoding-name
1204 (xmltok-add-prolog-region 'xml-declaration-attribute-value
1207 (when (xmltok-xml-declaration start standalone-name
)
1208 (xmltok-add-prolog-region 'xml-declaration-attribute-name
1209 (xmltok-xml-declaration start standalone-name
)
1210 (xmltok-xml-declaration end standalone-name
))
1211 (let ((start (xmltok-xml-declaration start standalone-value
))
1212 (end (xmltok-xml-declaration end standalone-value
)))
1213 (xmltok-add-prolog-region 'xml-declaration-attribute-value
1216 (setq xmltok-standalone
1217 (string= (buffer-substring-no-properties (1+ start
) (1- end
))
1221 (defconst xmltok-markup-declaration-alist
1222 '(("ELEMENT" . xmltok-parse-element-declaration
)
1223 ("ATTLIST" . xmltok-parse-attlist-declaration
)
1224 ("ENTITY" . xmltok-parse-entity-declaration
)
1225 ("NOTATION" . xmltok-parse-notation-declaration
)))
1227 (defun xmltok-parse-prolog-item ()
1228 (cond ((eq xmltok-type
'comment
)
1229 (xmltok-add-prolog-region 'comment
1233 ((eq xmltok-type
'processing-instruction
))
1234 ((eq xmltok-type
'named-markup-declaration
)
1235 (setq xmltok-markup-declaration-doctype-flag nil
)
1236 (xmltok-add-prolog-region 'markup-declaration-open
1239 (let* ((name (buffer-substring-no-properties
1242 (fun (cdr (assoc name xmltok-markup-declaration-alist
))))
1244 (unless xmltok-internal-subset-start
1246 "Declaration allowed only in internal subset"))
1248 ((string= name
"DOCTYPE")
1249 (xmltok-parse-doctype))
1251 (xmltok-add-error "Unknown markup declaration"
1253 (xmltok-next-prolog-token)
1254 (xmltok-markup-declaration-parse-error))))
1256 ((or (eq xmltok-type
'end-prolog
)
1259 ((eq xmltok-type
'internal-subset-close
)
1260 (xmltok-add-prolog-region 'internal-subset-close
1263 (xmltok-add-prolog-region 'markup-declaration-close
1266 (if xmltok-internal-subset-start
1267 (setq xmltok-internal-subset-start nil
)
1268 (xmltok-add-error "]> outside internal subset"))
1270 ((eq xmltok-type
'param-entity-ref
)
1271 (if xmltok-internal-subset-start
1272 (setq xmltok-had-param-entity-ref t
)
1273 (xmltok-add-error "Parameter entity reference outside document type declaration"))
1275 ;; If we don't do this, we can get thousands of errors when
1276 ;; a plain text file is parsed.
1277 ((not xmltok-internal-subset-start
)
1278 (when (let ((err (car xmltok-errors
)))
1280 (<= (xmltok-error-end err
) xmltok-start
)))
1281 (goto-char xmltok-start
))
1283 ((eq xmltok-type
'not-well-formed
) t
)
1285 (xmltok-add-error "Token allowed only inside markup declaration")
1288 (defun xmltok-parse-doctype ()
1289 (setq xmltok-markup-declaration-doctype-flag t
)
1290 (xmltok-next-prolog-token)
1291 (when xmltok-internal-subset-start
1292 (xmltok-add-error "DOCTYPE declaration not allowed in internal subset")
1293 (xmltok-markup-declaration-parse-error))
1294 (when xmltok-contains-doctype
1295 (xmltok-add-error "Duplicate DOCTYPE declaration")
1296 (xmltok-markup-declaration-parse-error))
1297 (setq xmltok-contains-doctype t
)
1298 (xmltok-require-token 'name
'prefixed-name
)
1299 (xmltok-require-next-token "SYSTEM" "PUBLIC" ?\
[ ?
>)
1300 (cond ((eq xmltok-type ?\
[)
1301 (setq xmltok-internal-subset-start
(point)))
1302 ((eq xmltok-type ?
>))
1304 (setq xmltok-doctype-external-subset-flag t
)
1305 (xmltok-parse-external-id)
1306 (xmltok-require-token ?\
[ ?
>)
1307 (when (eq xmltok-type ?\
[)
1308 (setq xmltok-internal-subset-start
(point))))))
1310 (defun xmltok-parse-attlist-declaration ()
1311 (xmltok-require-next-token 'prefixed-name
'name
)
1313 (xmltok-require-next-token ?
> 'name
'prefixed-name
)
1314 (if (eq xmltok-type ?
>)
1316 (xmltok-require-next-token ?\
(
1326 (cond ((eq xmltok-type ?\
()
1327 (xmltok-parse-nmtoken-group))
1328 ((string= (xmltok-current-token-string)
1330 (xmltok-require-next-token ?\
()
1331 (xmltok-parse-nmtoken-group)))
1332 (xmltok-require-next-token "#IMPLIED"
1336 (when (string= (xmltok-current-token-string) "#FIXED")
1337 (xmltok-require-next-token 'literal
))
1340 (defun xmltok-parse-nmtoken-group ()
1342 (xmltok-require-next-token 'nmtoken
'prefixed-name
'name
)
1343 (xmltok-require-next-token ?| ?\
))
1344 (eq xmltok-type ?|
))))
1346 (defun xmltok-parse-element-declaration ()
1347 (xmltok-require-next-token 'name
'prefixed-name
)
1348 (xmltok-require-next-token "EMPTY" "ANY" ?\
()
1349 (when (eq xmltok-type ?\
()
1350 (xmltok-require-next-token "#PCDATA"
1355 (cond ((eq xmltok-type
'hash-name
)
1356 (xmltok-require-next-token ?| ?\
) 'close-paren-star
)
1357 (while (eq xmltok-type ?|
)
1358 (xmltok-require-next-token 'name
'prefixed-name
)
1359 (xmltok-require-next-token 'close-paren-star ?|
)))
1360 (t (xmltok-parse-model-group))))
1361 (xmltok-require-next-token ?
>))
1363 (defun xmltok-parse-model-group ()
1364 (xmltok-parse-model-group-member)
1365 (xmltok-require-next-token ?|
1370 (when (memq xmltok-type
'(?
, ?|
))
1371 (let ((connector xmltok-type
))
1373 (xmltok-next-prolog-token)
1374 (xmltok-parse-model-group-member)
1375 (xmltok-require-next-token connector
1379 (eq xmltok-type connector
))))))
1381 (defun xmltok-parse-model-group-member ()
1382 (xmltok-require-token 'name
1386 (when (eq xmltok-type ?\
()
1387 (xmltok-next-prolog-token)
1388 (xmltok-parse-model-group)))
1390 (defun xmltok-parse-entity-declaration ()
1392 (xmltok-require-next-token 'name ?%
)
1393 (when (eq xmltok-type ?%
)
1395 (xmltok-require-next-token 'name
))
1396 (setq name
(xmltok-current-token-string))
1397 (xmltok-require-next-token 'literal
"SYSTEM" "PUBLIC")
1398 (cond ((eq xmltok-type
'literal
)
1399 (let ((replacement (xmltok-parse-entity-value)))
1401 (xmltok-define-entity name replacement
)))
1402 (xmltok-require-next-token ?
>))
1404 (xmltok-parse-external-id)
1406 (xmltok-require-token ?
>)
1407 (xmltok-require-token ?
> "NDATA")
1408 (if (eq xmltok-type ?
>)
1409 (xmltok-define-entity name
'external
)
1410 (xmltok-require-next-token 'name
)
1411 (xmltok-require-next-token ?
>)
1412 (xmltok-define-entity name
'unparsed
)))))))
1414 (defun xmltok-define-entity (name value
)
1415 (when (and (or (not xmltok-had-param-entity-ref
)
1417 (not (assoc name xmltok-dtd
)))
1419 (cons (cons name value
) xmltok-dtd
))))
1421 (defun xmltok-parse-entity-value ()
1422 (let ((lim (1- (point)))
1427 (goto-char (1+ xmltok-start
))
1428 (setq start
(point))
1430 (skip-chars-forward "^%&" lim
)
1431 (when (< (point) lim
)
1432 (goto-char (1+ (point)))
1433 (cond ((eq (char-before) ?%
)
1434 (xmltok-add-error "Parameter entity references are not allowed in the internal subset"
1437 (setq well-formed nil
))
1439 (let ((xmltok-start (1- (point)))
1440 xmltok-type xmltok-replacement
)
1441 (xmltok-scan-after-amp (lambda (start end
)))
1442 (cond ((eq xmltok-type
'char-ref
)
1444 (cons (buffer-substring-no-properties
1449 (cons xmltok-replacement
1451 (setq start
(point)))
1452 ((eq xmltok-type
'not-well-formed
)
1453 (setq well-formed nil
))))))
1455 (if (not well-formed
)
1458 (nreverse (cons (buffer-substring-no-properties start lim
)
1461 (defun xmltok-parse-notation-declaration ()
1462 (xmltok-require-next-token 'name
)
1463 (xmltok-require-next-token "SYSTEM" "PUBLIC")
1464 (let ((publicp (string= (xmltok-current-token-string) "PUBLIC")))
1465 (xmltok-require-next-token 'literal
)
1467 (xmltok-require-next-token 'literal ?
>)
1468 (unless (eq xmltok-type ?
>)
1469 (xmltok-require-next-token ?
>)))
1470 (t (xmltok-require-next-token ?
>)))))
1472 (defun xmltok-parse-external-id ()
1473 (xmltok-require-token "SYSTEM" "PUBLIC")
1474 (let ((publicp (string= (xmltok-current-token-string) "PUBLIC")))
1475 (xmltok-require-next-token 'literal
)
1477 (xmltok-require-next-token 'literal
)))
1478 (xmltok-next-prolog-token))
1480 (defun xmltok-require-next-token (&rest types
)
1481 (xmltok-next-prolog-token)
1482 (apply 'xmltok-require-token types
))
1484 (defun xmltok-require-token (&rest types
)
1485 ;; XXX Generate a more helpful error message
1486 (while (and (not (let ((type (car types
)))
1487 (if (stringp (car types
))
1488 (string= (xmltok-current-token-string) type
)
1489 (eq type xmltok-type
))))
1490 (setq types
(cdr types
))))
1492 (when (and xmltok-type
1493 (not (eq xmltok-type
'not-well-formed
)))
1494 (xmltok-add-error "Unexpected token"))
1495 (xmltok-markup-declaration-parse-error))
1496 (let ((region-type (xmltok-prolog-region-type (car types
))))
1498 (xmltok-add-prolog-region region-type
1502 (defun xmltok-current-token-string ()
1503 (buffer-substring-no-properties xmltok-start
(point)))
1505 (put 'xmltok-markup-declaration-parse-error
1507 '(error xmltok-markup-declaration-parse-error
))
1509 (put 'xmltok-markup-declaration-parse-error
1511 "Syntax error in markup declaration")
1513 (defun xmltok-markup-declaration-parse-error ()
1514 (signal 'xmltok-markup-declaration-parse-error nil
))
1516 (defun xmltok-skip-markup-declaration ()
1517 (while (cond ((eq xmltok-type ?
>)
1518 (xmltok-next-prolog-token)
1520 ((and xmltok-markup-declaration-doctype-flag
1521 (eq xmltok-type ?\
[))
1522 (setq xmltok-internal-subset-start
(point))
1523 (xmltok-next-prolog-token)
1525 ((memq xmltok-type
'(nil
1527 named-markup-declaration
1529 processing-instruction
))
1531 ((and xmltok-internal-subset-start
1532 (eq xmltok-type
'internal-subset-close
))
1534 (t (xmltok-next-prolog-token) t
)))
1537 (defun xmltok-prolog-region-type (required)
1538 (cond ((cdr (assq xmltok-type
1539 '((literal . literal
)
1540 (?
> . markup-declaration-close
)
1541 (?\
[ . internal-subset-open
)
1542 (hash-name . hash-name
)))))
1543 ((and (stringp required
) (eq xmltok-type
'name
))
1546 ;; Return new token type.
1548 (defun xmltok-next-prolog-token ()
1549 (skip-chars-forward " \t\r\n")
1550 (setq xmltok-start
(point))
1551 (cond ((not (and (looking-at (xmltok-prolog regexp
))
1552 (goto-char (match-end 0))))
1553 (let ((ch (char-after)))
1555 (goto-char (1+ (point)))
1556 (xmltok-add-error "Illegal char in prolog")
1557 (setq xmltok-type
'not-well-formed
))
1558 (t (setq xmltok-type nil
)))))
1559 ((or (xmltok-prolog start ncname-occur
)
1560 (xmltok-prolog start colon-name-occur
))
1561 (setq xmltok-name-end
(1- (point)))
1562 (setq xmltok-name-colon
(xmltok-prolog start colon
))
1563 (setq xmltok-type
'name-occur
))
1564 ((xmltok-prolog start colon
)
1565 (setq xmltok-name-end
(point))
1566 (setq xmltok-name-colon
(xmltok-prolog start colon
))
1567 (unless (looking-at "[ \t\r\n>),|[%]")
1568 (xmltok-add-error "Missing space after name"))
1569 (setq xmltok-type
'prefixed-name
))
1570 ((or (xmltok-prolog start ncname-nmtoken
)
1571 (xmltok-prolog start ncname-colon
))
1572 (unless (looking-at "[ \t\r\n>),|[%]")
1573 (xmltok-add-error "Missing space after name token"))
1574 (setq xmltok-type
'nmtoken
))
1575 ((xmltok-prolog start name
)
1576 (setq xmltok-name-end
(point))
1577 (setq xmltok-name-colon nil
)
1578 (unless (looking-at "[ \t\r\n>),|[%]")
1579 (xmltok-add-error "Missing space after name"))
1580 (setq xmltok-type
'name
))
1581 ((xmltok-prolog start hash-name
)
1582 (setq xmltok-name-end
(point))
1583 (unless (looking-at "[ \t\r\n>)|%]")
1584 (xmltok-add-error "Missing space after name"))
1585 (setq xmltok-type
'hash-name
))
1586 ((xmltok-prolog start processing-instruction-question
)
1587 (xmltok-scan-prolog-after-processing-instruction-open))
1588 ((xmltok-prolog start comment-open
)
1589 ;; XXX if not-well-formed, ignore some stuff
1590 (xmltok-scan-after-comment-open))
1591 ((xmltok-prolog start named-markup-declaration
)
1592 (setq xmltok-type
'named-markup-declaration
))
1593 ((xmltok-prolog start instance-start
)
1594 (goto-char xmltok-start
)
1595 (setq xmltok-type
'end-prolog
))
1596 ((xmltok-prolog start close-paren-star
)
1597 (setq xmltok-type
'close-paren-star
))
1598 ((xmltok-prolog start close-paren-occur
)
1599 (setq xmltok-type
'close-paren-occur
))
1600 ((xmltok-prolog start close-paren
)
1601 (unless (looking-at "[ \t\r\n>,|)]")
1602 (xmltok-add-error "Missing space after )"))
1603 (setq xmltok-type ?\
)))
1604 ((xmltok-prolog start single-char
)
1605 (let ((ch (char-before)))
1606 (cond ((memq ch
'(?
\" ?
\'))
1607 (xmltok-scan-prolog-literal))
1608 (t (setq xmltok-type ch
)))))
1609 ((xmltok-prolog start percent
)
1610 (cond ((xmltok-prolog start param-entity-ref-close
)
1611 (setq xmltok-name-end
(1- (point)))
1612 (setq xmltok-type
'param-entity-ref
))
1613 ((xmltok-prolog start param-entity-ref
)
1614 (xmltok-add-error "Missing ;")
1615 (setq xmltok-name-end
(point))
1616 (setq xmltok-type
'param-entity-ref
))
1617 ((looking-at "[ \t\r\n%]")
1618 (setq xmltok-type ?%
))
1620 (xmltok-add-error "Expected name after %")
1621 (setq xmltok-type
'not-well-formed
))))
1622 ((xmltok-prolog start nmtoken
)
1623 (unless (looking-at "[ \t\r\n>),|[%]")
1624 (xmltok-add-error "Missing space after name token"))
1625 (setq xmltok-type
'nmtoken
))
1626 ((xmltok-prolog start internal-subset-close
)
1627 (setq xmltok-type
'internal-subset-close
))
1628 ((xmltok-prolog start pound
)
1629 (xmltok-add-error "Expected name after #")
1630 (setq xmltok-type
'not-well-formed
))
1631 ((xmltok-prolog start markup-declaration
)
1632 (xmltok-add-error "Expected name or -- after <!")
1633 (setq xmltok-type
'not-well-formed
))
1634 ((xmltok-prolog start comment-first-dash
)
1635 (xmltok-add-error "Expected <!--")
1636 (setq xmltok-type
'not-well-formed
))
1637 ((xmltok-prolog start less-than
)
1638 (xmltok-add-error "Incomplete markup")
1639 (setq xmltok-type
'not-well-formed
))
1640 (t (error "Unhandled token in prolog %s"
1641 (match-string-no-properties 0)))))
1643 (defun xmltok-scan-prolog-literal ()
1644 (let* ((delim (string (char-before)))
1645 (safe-end (save-excursion
1646 (skip-chars-forward (concat "^<>[]" delim
))
1648 (end (save-excursion
1649 (goto-char safe-end
)
1650 (search-forward delim nil t
))))
1651 (or (cond ((not end
)
1652 (xmltok-add-dependent 'xmltok-unclosed-reparse-p
1659 (looking-at "[ \t\r\n>%[]"))
1661 (setq xmltok-type
'literal
))
1662 ((eq (1+ safe-end
) end
)
1664 (xmltok-add-error (format "Missing space after %s" delim
)
1666 (setq xmltok-type
'literal
))
1668 (xmltok-add-dependent 'xmltok-semi-closed-reparse-p
1675 (xmltok-add-error (format "Missing closing %s" delim
))
1676 (goto-char safe-end
)
1677 (skip-chars-backward " \t\r\n")
1678 (setq xmltok-type
'not-well-formed
)))))
1680 (defun xmltok-scan-prolog-after-processing-instruction-open ()
1681 (cond ((not (search-forward "?>" nil t
))
1682 (xmltok-add-error "No closing ?>"
1685 (xmltok-add-dependent 'xmltok-unclosed-reparse-p
1689 (setq xmltok-type
'not-well-formed
))
1691 (let* ((end (point))
1694 (goto-char (+ xmltok-start
2))
1695 (and (looking-at (xmltok-ncname regexp
))
1696 (or (memq (char-after (match-end 0))
1698 (= (match-end 0) (- end
2)))
1699 (match-string-no-properties 0)))))
1701 (xmltok-add-error "\
1702 Processing instruction does not start with a name"
1704 (+ xmltok-start
3)))
1705 ((not (and (= (length target
) 3)
1706 (let ((case-fold-search t
))
1707 (string-match "xml" target
)))))
1709 (xmltok-add-error "Invalid XML declaration"
1713 (goto-char xmltok-start
)
1714 (looking-at (xmltok-xml-declaration regexp
)))
1715 (xmltok-add-error "XML declaration not at beginning of file"
1719 (xmltok-add-error "Processing instruction has target of xml"
1721 (+ xmltok-start
5))))
1722 (xmltok-add-prolog-region 'processing-instruction-left
1729 (xmltok-add-prolog-region 'processing-instruction-right
1732 (goto-char (+ xmltok-start
1735 (skip-chars-forward " \t\r\n")
1739 (setq xmltok-type
'processing-instruction
))))
1741 (defun xmltok-parse-entities ()
1742 (let ((todo xmltok-dtd
))
1743 (when (and (or xmltok-had-param-entity-ref
1744 xmltok-doctype-external-subset-flag
)
1745 (not xmltok-standalone
))
1746 (setq xmltok-dtd
(cons t xmltok-dtd
)))
1748 (xmltok-parse-entity (car todo
))
1749 (setq todo
(cdr todo
)))))
1751 (defun xmltok-parse-entity (name-def)
1752 (let ((def (cdr name-def
))
1753 ;; in case its value is buffer local
1754 (xmltok-dtd xmltok-dtd
)
1757 (if (string-match "\\`[^&<\t\r\n]*\\'" def
)
1758 (setcdr name-def
(cons def def
))
1759 (setcdr name-def
'not-well-formed
) ; avoid infinite expansion loops
1760 (setq buf
(get-buffer-create
1761 (format " *Entity %s*" (car name-def
))))
1762 (with-current-buffer buf
1765 (goto-char (point-min))
1767 (xmltok-parse-entity-replacement)))
1768 (kill-buffer buf
)))))
1770 (defun xmltok-parse-entity-replacement ()
1771 (let ((def (cons "" "")))
1772 (while (let* ((start (point))
1773 (found (re-search-forward "[<&\t\r\n]\\|]]>" nil t
))
1774 (ch (and found
(char-before)))
1775 (str (buffer-substring-no-properties
1781 (xmltok-append-entity-def def
1783 (cond ((not found
) nil
)
1785 (setq def
'not-well-formed
)
1789 (setq xmltok-start
(1- (point)))
1790 (xmltok-scan-after-lt)
1792 (xmltok-append-entity-def
1794 (cond ((memq xmltok-type
1801 processing-instruction
))
1805 (cons (buffer-substring-no-properties
1809 (t 'not-well-formed
)))))
1812 (let ((xmltok-start (1- (point)))
1816 (xmltok-scan-after-amp 'xmltok-handle-nested-entity
)
1817 (cond ((eq xmltok-type
'entity-ref
)
1819 (xmltok-append-entity-def
1821 xmltok-replacement
)))
1822 ((eq xmltok-type
'char-ref
)
1824 (xmltok-append-entity-def
1826 (if xmltok-replacement
1827 (cons xmltok-replacement
1829 (and xmltok-errors
'not-well-formed
)))))
1831 (setq def
'not-well-formed
))))
1835 (xmltok-append-entity-def
1837 (cons (match-string-no-properties 0)
1842 (defun xmltok-handle-nested-entity (start end
)
1843 (let* ((name-def (assoc (buffer-substring-no-properties start end
)
1845 (def (cdr name-def
)))
1847 (xmltok-parse-entity name-def
)
1848 (setq def
(cdr name-def
)))
1849 (setq xmltok-replacement
1850 (cond ((null name-def
)
1851 (if (eq (car xmltok-dtd
) t
)
1854 ((eq def
'unparsed
) 'not-well-formed
)
1857 (defun xmltok-append-entity-def (d1 d2
)
1860 (cons (concat (car d1
) (car d2
))
1863 (concat (cdr d1
) (cdr d2
))))
1867 (let ((defs '(not-well-formed external element
)))
1868 (while (not (or (eq (car defs
) d1
)
1869 (eq (car defs
) d2
)))
1870 (setq defs
(cdr defs
)))
1873 (defun xmltok-add-prolog-region (type start end
)
1874 (setq xmltok-prolog-regions
1875 (cons (vector type start end
)
1876 xmltok-prolog-regions
)))
1878 (defun xmltok-merge-attributes ()
1879 "Return a list merging `xmltok-attributes' and `xmltok-namespace-attributes'.
1880 The members of the merged list are in order of occurrence in the
1881 document. The list may share list structure with `xmltok-attributes'
1882 and `xmltok-namespace-attributes'."
1883 (cond ((not xmltok-namespace-attributes
)
1885 ((not xmltok-attributes
)
1886 xmltok-namespace-attributes
)
1888 (let ((atts1 xmltok-attributes
)
1889 (atts2 xmltok-namespace-attributes
)
1891 (while (and atts1 atts2
)
1892 (cond ((< (xmltok-attribute-name-start (car atts1
))
1893 (xmltok-attribute-name-start (car atts2
)))
1894 (setq merged
(cons (car atts1
) merged
))
1895 (setq atts1
(cdr atts1
)))
1897 (setq merged
(cons (car atts2
) merged
))
1898 (setq atts2
(cdr atts2
)))))
1899 (setq merged
(nreverse merged
))
1900 (cond (atts1 (setq merged
(nconc merged atts1
)))
1901 (atts2 (setq merged
(nconc merged atts2
))))
1906 (defun xmltok-forward-test ()
1908 (if (xmltok-forward)
1909 (message "Scanned %s" xmltok-type
)
1910 (message "Scanned nothing")))
1912 (defun xmltok-next-prolog-token-test ()
1914 (if (xmltok-next-prolog-token)
1915 (message "Scanned %s"
1916 (if (integerp xmltok-type
)
1917 (string xmltok-type
)
1919 (message "Scanned end of file")))
1923 ;; arch-tag: 747e5f3a-6fc3-4f8d-bd96-89f05aa99f5e
1924 ;;; xmltok.el ends here