Tweak previous vz-bzr change yet again
[emacs.git] / lisp / nxml / nxml-parse.el
blobdfe98acd4136074d3daea594d676e0307516cb98
1 ;;; nxml-parse.el --- XML parser, sharing infrastructure with nxml-mode
3 ;; Copyright (C) 2003, 2007-2012 Free Software Foundation, Inc.
5 ;; Author: James Clark
6 ;; Keywords: XML
8 ;; This file is part of GNU Emacs.
10 ;; GNU Emacs is free software: you can redistribute it and/or modify
11 ;; it under the terms of the GNU General Public License as published by
12 ;; the Free Software Foundation, either version 3 of the License, or
13 ;; (at your option) any later version.
15 ;; GNU Emacs is distributed in the hope that it will be useful,
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;; GNU General Public License for more details.
20 ;; You should have received a copy of the GNU General Public License
21 ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
23 ;;; Commentary:
25 ;; Entry point is `nxml-parse-file'.
27 ;;; Code:
29 (require 'nxml-util)
30 (require 'xmltok)
31 (require 'nxml-enc)
32 (require 'nxml-ns)
34 (defvar nxml-parse-file-name nil)
36 (defvar nxml-validate-function nil
37 "Either nil or a function called by `nxml-parse-file' to perform validation.
38 The function will be called once for each start-tag or end-tag. The
39 function is passed two arguments TEXT and START-TAG. For a start-tag,
40 START-TAG is a list (NAME ATTRIBUTES) where NAME and ATTRIBUTES are in
41 the same form as returned by `nxml-parse-file'. For an end-tag,
42 START-TAG is nil. TEXT is a string containing the text immediately
43 preceding the tag, or nil if there was no such text. An empty element
44 is treated as a start-tag followed by an end-tag.
46 For a start-tag, the namespace state will be the state after
47 processing the namespace declarations in the start-tag. For an
48 end-tag, the namespace state will be the state before popping the
49 namespace declarations for the corresponding start-tag.
51 The function must return nil if no error is detected or a
52 cons (MESSAGE . LOCATION) where MESSAGE is a string containing
53 an error message and LOCATION indicates what caused the error
54 as follows:
56 - nil indicates the tag as whole caused it; this is always allowed;
58 - text indicates the text caused it; this is allowed only if
59 TEXT is non-nil;
61 - tag-close indicates the close of the tag caused it; this is
62 allowed only if START-TAG is non-nil;
64 - (attribute-name . N) indicates that the name of the Nth attribute
65 caused it; N counts from 0; this is allowed only if START-TAG is non-nil
66 and N must be less than the number of attributes;
68 - (attribute-value . N) indicates that the value of the Nth attribute
69 caused it; N counts from 0; this is allowed only if START-TAG is non-nil
70 and N must be less than the number of attributes.")
72 (defun nxml-parse-file (file)
73 "Parse the XML document in FILE and return it as a list.
74 An XML element is represented as a list (NAME ATTRIBUTES . CHILDREN).
75 NAME is either a string, in the case where the name does not have a
76 namespace, or a cons (NAMESPACE . LOCAL-NAME), where NAMESPACE is a
77 symbol and LOCAL-NAME is a string, in the case where the name does
78 have a namespace. NAMESPACE is a keyword whose name is `:URI', where
79 URI is the namespace name. ATTRIBUTES is an alist of attributes where
80 each attribute has the form (NAME . VALUE), where NAME has the same
81 form as an element name, and VALUE is a string. A namespace
82 declaration is represented as an attribute whose name is
83 \(:http://www.w3.org/2000/xmlns/ . LOCAL-NAME). CHILDREN is a list
84 containing strings and child elements; CHILDREN never contains two
85 consecutive strings and never contains an empty string. Processing
86 instructions and comments are not represented. The return value is a
87 list representing the document element.
89 If the XML document is not well-formed, an error having the condition
90 `nxml-file-parse-error' will be signaled; the error data will be a
91 list of the form \(FILE POSITION MESSAGE), where POSITION is an
92 integer specifying the position where the error was detected, and
93 MESSAGE is a string describing the error.
95 The current contents of FILE will be parsed even if there is a
96 modified buffer currently visiting FILE.
98 If the variable `nxml-validate-function' is non-nil, it will be called
99 twice for each element, and any reported error will be signaled in the
100 same way as well-formedness error."
101 (with-current-buffer (nxml-parse-find-file file)
102 (unwind-protect
103 (let ((nxml-parse-file-name file))
104 (nxml-parse-instance))
105 (kill-buffer nil))))
107 (defun nxml-parse-find-file (file)
108 (with-current-buffer (get-buffer-create " *nXML Parse*")
109 (erase-buffer)
110 (let ((set-auto-coding-function 'nxml-set-xml-coding))
111 (insert-file-contents file))
112 (current-buffer)))
114 (defun nxml-parse-instance ()
115 (let (xmltok-dtd)
116 (xmltok-save
117 (xmltok-forward-prolog)
118 (nxml-check-xmltok-errors)
119 (nxml-ns-save
120 (nxml-parse-instance-1)))))
122 (defun nxml-parse-instance-1 ()
123 (let* ((top (cons nil nil))
124 ;; tail is a cons cell, whose cdr is nil
125 ;; additional elements will destructively appended to tail
126 (tail top)
127 ;; stack of tails one for each open element
128 tail-stack
129 ;; list of QNames of open elements
130 open-element-tags
131 ;; list of strings buffering a text node, in reverse order
132 text
133 ;; position of beginning of first (in buffer) string in text
134 text-pos)
135 (while (xmltok-forward)
136 (nxml-check-xmltok-errors)
137 (cond ((memq xmltok-type '(start-tag end-tag empty-element))
138 (when text
139 (setq text (apply 'concat (nreverse text)))
140 (setcdr tail (cons text nil))
141 (setq tail (cdr tail)))
142 (when (not (eq xmltok-type 'end-tag))
143 (when (and (not open-element-tags)
144 (not (eq tail top)))
145 (nxml-parse-error nil "Multiple top-level elements"))
146 (setq open-element-tags
147 (cons (xmltok-start-tag-qname)
148 open-element-tags))
149 (nxml-ns-push-state)
150 (let ((tag (nxml-parse-start-tag)))
151 (nxml-validate-tag text text-pos tag)
152 (setq text nil)
153 (setcdr tail (cons tag nil))
154 (setq tail (cdr tail))
155 (setq tail-stack (cons tail tail-stack))
156 (setq tail (last tag))))
157 (when (not (eq xmltok-type 'start-tag))
158 (or (eq xmltok-type 'empty-element)
159 (equal (car open-element-tags)
160 (xmltok-end-tag-qname))
161 (if open-element-tags
162 (nxml-parse-error nil
163 "Unbalanced end-tag; expected </%s>"
164 (car open-element-tags))
165 (nxml-parse-error nil "Extra end-tag")))
166 (nxml-validate-tag text text-pos nil)
167 (setq text nil)
168 (nxml-ns-pop-state)
169 (setq open-element-tags (cdr open-element-tags))
170 (setq tail (car tail-stack))
171 (setq tail-stack (cdr tail-stack)))
172 (setq text-pos nil))
173 ((memq xmltok-type '(space data entity-ref char-ref cdata-section))
174 (cond (open-element-tags
175 (unless text-pos
176 (setq text-pos xmltok-start))
177 (setq text
178 (cons (nxml-current-text-string) text)))
179 ((not (eq xmltok-type 'space))
180 (nxml-parse-error
182 "%s at top-level"
183 (cdr (assq xmltok-type
184 '((data . "Text characters")
185 (entity-ref . "Entity reference")
186 (char-ref . "Character reference")
187 (cdata-section . "CDATA section"))))))))))
188 (unless (cdr top)
189 (nxml-parse-error (point-max) "Missing document element"))
190 (cadr top)))
192 (defun nxml-parse-start-tag ()
193 (let (parsed-attributes
194 parsed-namespace-attributes
195 atts att prefixes prefix ns value name)
196 (setq atts xmltok-namespace-attributes)
197 (while atts
198 (setq att (car atts))
199 (setq value (or (xmltok-attribute-value att)
200 (nxml-parse-error nil "Invalid attribute value")))
201 (setq ns (nxml-make-namespace value))
202 (setq prefix (and (xmltok-attribute-prefix att)
203 (xmltok-attribute-local-name att)))
204 (cond ((member prefix prefixes)
205 (nxml-parse-error nil "Duplicate namespace declaration"))
206 ((not prefix)
207 (nxml-ns-set-default ns))
209 (nxml-ns-set-prefix prefix ns))
210 (t (nxml-parse-error nil "Cannot undeclare namespace prefix")))
211 (setq prefixes (cons prefix prefixes))
212 (setq parsed-namespace-attributes
213 (cons (cons (nxml-make-name nxml-xmlns-namespace-uri
214 (xmltok-attribute-local-name att))
215 value)
216 parsed-namespace-attributes))
217 (setq atts (cdr atts)))
218 (setq name
219 (nxml-make-name
220 (let ((prefix (xmltok-start-tag-prefix)))
221 (if prefix
222 (or (nxml-ns-get-prefix prefix)
223 (nxml-parse-error (1+ xmltok-start)
224 "Prefix `%s' undeclared"
225 prefix))
226 (nxml-ns-get-default)))
227 (xmltok-start-tag-local-name)))
228 (setq atts xmltok-attributes)
229 (while atts
230 (setq att (car atts))
231 (setq ns
232 (let ((prefix (xmltok-attribute-prefix att)))
233 (and prefix
234 (or (nxml-ns-get-prefix prefix)
235 (nxml-parse-error (xmltok-attribute-name-start att)
236 "Prefix `%s' undeclared"
237 prefix)))))
238 (setq parsed-attributes
239 (let ((nm (nxml-make-name ns
240 (xmltok-attribute-local-name att))))
241 (when (assoc nm parsed-attributes)
242 (nxml-parse-error (xmltok-attribute-name-start att)
243 "Duplicate attribute"))
244 (cons (cons nm (or (xmltok-attribute-value att)
245 (nxml-parse-error nil "Invalid attribute value")))
246 parsed-attributes)))
247 (setq atts (cdr atts)))
248 ;; We want to end up with the attributes followed by the
249 ;; the namespace attributes in the same order as
250 ;; xmltok-attributes and xmltok-namespace-attributes respectively.
251 (when parsed-namespace-attributes
252 (setq parsed-attributes
253 (nconc parsed-namespace-attributes parsed-attributes)))
254 (list name (nreverse parsed-attributes))))
256 (defun nxml-validate-tag (text text-pos tag)
257 (when nxml-validate-function
258 (let ((err (funcall nxml-validate-function text tag))
259 pos)
260 (when err
261 (setq pos (nxml-validate-error-position (cdr err)
262 (and text text-pos)
263 tag))
264 (or pos (error "Incorrect return value from %s"
265 nxml-validate-function))
266 (nxml-parse-error pos (car err))))))
268 (defun nxml-validate-error-position (location text-pos tag)
269 (cond ((null location) xmltok-start)
270 ((eq location 'text) text-pos)
271 ((eq location 'tag-close)
272 (and tag (- (point) (if (eq xmltok-type 'empty-element ) 2 1))))
273 ((consp location)
274 (let ((att (nth (cdr location) xmltok-attributes)))
275 (when (not att)
276 (setq att (nth (- (cdr location) (length xmltok-attributes))
277 xmltok-namespace-attributes)))
278 (cond ((not att))
279 ((eq (car location) 'attribute-name)
280 (xmltok-attribute-name-start att))
281 ((eq (car location) 'attribute-value)
282 (xmltok-attribute-value-start att)))))))
284 (defun nxml-make-name (ns local-name)
285 (if ns
286 (cons ns local-name)
287 local-name))
289 (defun nxml-current-text-string ()
290 (cond ((memq xmltok-type '(space data))
291 (buffer-substring-no-properties xmltok-start
292 (point)))
293 ((eq xmltok-type 'cdata-section)
294 (buffer-substring-no-properties (+ xmltok-start 9)
295 (- (point) 3)))
296 ((memq xmltok-type '(char-ref entity-ref))
297 (unless xmltok-replacement
298 (nxml-parse-error nil
299 (if (eq xmltok-type 'char-ref)
300 "Reference to unsupported Unicode character"
301 "Unresolvable entity reference")))
302 xmltok-replacement)))
304 (defun nxml-parse-error (position &rest args)
305 (nxml-signal-file-parse-error nxml-parse-file-name
306 (or position xmltok-start)
307 (apply 'format args)))
309 (defun nxml-check-xmltok-errors ()
310 (when xmltok-errors
311 (let ((err (car (last xmltok-errors))))
312 (nxml-signal-file-parse-error nxml-parse-file-name
313 (xmltok-error-start err)
314 (xmltok-error-message err)))))
316 (provide 'nxml-parse)
318 ;;; nxml-parse.el ends here