1 ;;; nxml-rap.el --- low-level support for random access parsing for nXML mode
3 ;; Copyright (C) 2003-2004, 2007-2015 Free Software Foundation, Inc.
6 ;; Keywords: wp, hypermedia, languages, XML
8 ;; This file is part of GNU Emacs.
10 ;; GNU Emacs is free software: you can redistribute it and/or modify
11 ;; it under the terms of the GNU General Public License as published by
12 ;; the Free Software Foundation, either version 3 of the License, or
13 ;; (at your option) any later version.
15 ;; GNU Emacs is distributed in the hope that it will be useful,
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;; GNU General Public License for more details.
20 ;; You should have received a copy of the GNU General Public License
21 ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
25 ;; This uses xmltok.el to do XML parsing. The fundamental problem is
26 ;; how to handle changes. We don't want to maintain a complete parse
27 ;; tree. We also don't want to reparse from the start of the document
28 ;; on every keystroke. However, it is not possible in general to
29 ;; parse an XML document correctly starting at a random point in the
30 ;; middle. The main problems are comments, CDATA sections and
31 ;; processing instructions: these can all contain things that are
32 ;; indistinguishable from elements. Literals in the prolog are also a
33 ;; problem. Attribute value literals are not a problem because
34 ;; attribute value literals cannot contain less-than signs.
36 ;; Our strategy is to keep track of just the problematic things.
37 ;; Specifically, we keep track of all comments, CDATA sections and
38 ;; processing instructions in the instance. We do this by marking all
39 ;; except the first character of these with a non-nil nxml-inside text
40 ;; property. The value of the nxml-inside property is comment,
41 ;; cdata-section or processing-instruction. The first character does
42 ;; not have the nxml-inside property so we can find the beginning of
43 ;; the construct by looking for a change in a text property value
44 ;; (Emacs provides primitives for this). We use text properties
45 ;; rather than overlays, since the implementation of overlays doesn't
46 ;; look like it scales to large numbers of overlays in a buffer.
48 ;; We don't in fact track all these constructs, but only track them in
49 ;; some initial part of the instance. The variable `nxml-scan-end'
50 ;; contains the limit of where we have scanned up to for them.
;; Thus to parse some random point in the file we first ensure that we
;; have scanned up to that point.  Then we search backwards for a
;; <.  Then we check whether the < has an nxml-inside property.  If it
;; does, we go backwards to the first character that does not have an
;; nxml-inside property (this character must be a <).  Then we start
;; parsing forward from the < we have found.
59 ;; The prolog has to be parsed specially, so we also keep track of the
60 ;; end of the prolog in `nxml-prolog-end'. The prolog is reparsed on
61 ;; every change to the prolog. This won't work well if people try to
62 ;; edit huge internal subsets. Hopefully that will be rare.
64 ;; We keep track of the changes by adding to the buffer's
65 ;; after-change-functions hook. Scanning is also done as a
66 ;; prerequisite to fontification by adding to fontification-functions
67 ;; (in the same way as jit-lock). This means that scanning for these
68 ;; constructs had better be quick. Fortunately it is. Firstly, the
69 ;; typical proportion of comments, CDATA sections and processing
70 ;; instructions is small relative to other things. Secondly, to scan
71 ;; we just search for the regexp <[!?].
;; `defvar-local' declares the variable and marks it automatically
;; buffer-local in one form, replacing the older two-step
;; `defvar' + `make-variable-buffer-local' idiom.
(defvar-local nxml-prolog-end nil
  "Integer giving position following end of the prolog.")
;; `defvar-local' declares the variable and marks it automatically
;; buffer-local in one form, replacing the older two-step
;; `defvar' + `make-variable-buffer-local' idiom.
(defvar-local nxml-scan-end nil
  "Marker giving position up to which we have scanned.
`nxml-scan-end' must be >= `nxml-prolog-end'.  Furthermore, `nxml-scan-end'
must not be an inside position in the following sense.  A position is
inside if the following character is a part of, but not the first
character of, a CDATA section, comment or processing instruction.
Furthermore all positions >= `nxml-prolog-end' and < `nxml-scan-end' that
are inside positions must have a non-nil `nxml-inside' property whose
value is a symbol specifying what it is inside.  Any characters with a
non-nil `fontified' property must have position < `nxml-scan-end' and
the correct face.  Dependent regions must also be established for any
unclosed constructs starting before `nxml-scan-end'.
There must be no `nxml-inside' properties after `nxml-scan-end'.")
(defsubst nxml-get-inside (pos)
  "Return the value of the `nxml-inside' text property at POS.
The result is nil when the character at POS carries no such property."
  (get-text-property pos 'nxml-inside))
(defsubst nxml-clear-inside (start end)
  "Remove the `nxml-inside' text property from the text between START and END.
The return value is that of `remove-text-properties'."
  ;; `nxml-debug-clear-inside' is defined outside this file; presumably
  ;; it is a debugging aid -- confirm against its definition.
  (nxml-debug-clear-inside start end)
  (remove-text-properties start end '(nxml-inside nil)))
(defsubst nxml-set-inside (start end type)
  "Set the `nxml-inside' text property to TYPE on the text from START to END."
  ;; `nxml-debug-set-inside' is defined outside this file; presumably
  ;; it is a debugging aid -- confirm against its definition.
  (nxml-debug-set-inside start end)
  (put-text-property start end 'nxml-inside type))
108 (defun nxml-inside-end (pos)
109 "Return the end of the inside region containing POS.
110 Return nil if the character at POS is not inside."
111 (if (nxml-get-inside pos
)
112 (or (next-single-property-change pos
'nxml-inside
)
116 (defun nxml-inside-start (pos)
117 "Return the start of the inside region containing POS.
118 Return nil if the character at POS is not inside."
119 (if (nxml-get-inside pos
)
120 (or (previous-single-property-change (1+ pos
) 'nxml-inside
)
124 ;;; Change management
126 (defun nxml-scan-after-change (start end
)
127 "Restore `nxml-scan-end' invariants after a change.
128 The change happened between START and END.
129 Return position after which lexical state is unchanged.
130 END must be > `nxml-prolog-end'. START must be outside
131 any 'inside' regions and at the beginning of a token."
132 (if (>= start nxml-scan-end
)
134 (let ((inside-remove-start start
)
136 (while (or (when (xmltok-forward-special (min end nxml-scan-end
))
137 (when (memq xmltok-type
140 processing-instruction
))
141 (nxml-clear-inside inside-remove-start
143 (nxml-set-inside (1+ xmltok-start
)
146 (setq inside-remove-start
(point)))
147 (if (< (point) (min end nxml-scan-end
))
151 ;; The end of the change was inside but is now outside.
152 ;; Imagine something really weird like
153 ;; <![CDATA[foo <!-- bar ]]> <![CDATA[ stuff --> <!-- ]]> -->
154 ;; and suppose we deleted "<![CDATA[f"
155 (let ((inside-end (nxml-inside-end end
)))
157 (setq end inside-end
)
159 (nxml-clear-inside inside-remove-start end
))
160 (when (> end nxml-scan-end
)
161 (set-marker nxml-scan-end end
))
;; Forward declaration only: `nxml-scan-prolog' is called solely from
;; nxml-mode.el, which is where this variable is actually defined.
(defvar nxml-prolog-regions)
167 (defun nxml-scan-prolog ()
168 (goto-char (point-min))
171 (setq nxml-prolog-regions
(xmltok-forward-prolog))
172 (setq nxml-prolog-end
(point))
173 (nxml-clear-inside (point-min) nxml-prolog-end
))
174 (when (< nxml-scan-end nxml-prolog-end
)
175 (set-marker nxml-scan-end nxml-prolog-end
)))
178 ;;; Random access parsing
180 (defun nxml-token-after ()
181 "Return the position after the token containing the char after point.
182 Sets up the variables `xmltok-type', `xmltok-start',
183 `xmltok-name-end', `xmltok-name-colon', `xmltok-attributes',
`xmltok-namespace-attributes' in the same way as does
185 `xmltok-forward'. The prolog will be treated as a single token with
188 (if (< pos nxml-prolog-end
)
190 (setq xmltok-type
'prolog
191 xmltok-start
(point-min))
192 (min nxml-prolog-end
(point-max)))
193 (nxml-ensure-scan-up-to-date)
194 (if (nxml-get-inside pos
)
196 (nxml-move-outside-backwards)
200 (if (or (eq (char-after) ?
<)
202 (max (point-min) nxml-prolog-end
)
204 (nxml-move-outside-backwards)
205 (goto-char (if (<= (point-min) nxml-prolog-end
)
207 (or (nxml-inside-end (point-min))
209 (while (and (nxml-tokenize-forward)
213 (defun nxml-token-before ()
214 "Return the position after the token containing the char before point.
215 Sets variables like `nxml-token-after'."
216 (if (/= (point-min) (point))
218 (goto-char (1- (point)))
220 (setq xmltok-start
(point))
221 (setq xmltok-type nil
)
224 (defun nxml-tokenize-forward ()
226 (when (and (xmltok-forward)
227 (> (point) nxml-scan-end
))
228 (cond ((memq xmltok-type
'(comment
230 processing-instruction
))
231 (with-silent-modifications
232 (nxml-set-inside (1+ xmltok-start
) (point) xmltok-type
))))
233 (set-marker nxml-scan-end
(point)))
236 (defun nxml-move-tag-backwards (bound)
237 "Move point backwards outside any 'inside' regions or tags.
238 Point will not move past `nxml-prolog-end'.
239 Point will either be at BOUND or a '<' character starting a tag
240 outside any 'inside' regions.
241 As a precondition, point must be >= BOUND."
242 (nxml-move-outside-backwards)
243 (when (not (equal (char-after) ?
<))
244 (if (search-backward "<" bound t
)
246 (nxml-move-outside-backwards)
247 (when (not (equal (char-after) ?
<))
248 (search-backward "<" bound t
)))
251 (defun nxml-move-outside-backwards ()
252 "Move point to first character of the containing special thing.
253 Leave point unmoved if it is not inside anything special."
254 (let ((start (nxml-inside-start (point))))
256 (goto-char (1- start
))
257 (when (nxml-get-inside (point))
258 (error "Char before inside-start at %s had nxml-inside property %s"
260 (nxml-get-inside (point)))))))
262 (defun nxml-ensure-scan-up-to-date ()
264 (when (< nxml-scan-end pos
)
266 (goto-char nxml-scan-end
)
268 (while (when (xmltok-forward-special pos
)
269 (when (memq xmltok-type
271 processing-instruction
273 (with-silent-modifications
274 (nxml-set-inside (1+ xmltok-start
)
281 (set-marker nxml-scan-end pos
))))))
285 (defun nxml-scan-element-forward (from &optional up
)
286 "Scan forward from FROM over a single balanced element.
287 Point must be between tokens. Return the position of the end of
288 the tag that ends the element. `xmltok-start' will contain the
289 position of the start of the tag. If UP is non-nil, then scan
290 past end-tag of element containing point. If no element is
291 found, return nil. If a well-formedness error prevents scanning,
292 signal an `nxml-scan-error'. Point is not moved."
293 (let ((open-tags (and up t
))
297 (while (cond ((not (nxml-tokenize-forward))
298 (when (consp open-tags
)
299 (nxml-scan-error (cadr open-tags
)
300 "Start-tag has no end-tag"))
302 ((eq xmltok-type
'start-tag
)
304 (cons (xmltok-start-tag-qname)
308 ((eq xmltok-type
'end-tag
)
309 (cond ((not open-tags
) nil
)
310 ((not (consp open-tags
)) (setq found
(point)) nil
)
311 ((not (string= (car open-tags
)
312 (xmltok-end-tag-qname)))
313 (nxml-scan-error (+ 2 xmltok-start
)
314 "Mismatched end-tag; \
317 ((setq open-tags
(cddr open-tags
)) t
)
318 (t (setq found
(point)) nil
)))
319 ((memq xmltok-type
'(empty-element
320 partial-empty-element
))
325 ((eq xmltok-type
'partial-end-tag
)
326 (cond ((not open-tags
) nil
)
327 ((not (consp open-tags
)) (setq found
(point)) nil
)
328 ((setq open-tags
(cddr open-tags
)) t
)
329 (t (setq found
(point)) nil
)))
330 ((eq xmltok-type
'partial-start-tag
)
331 (nxml-scan-error xmltok-start
336 (defun nxml-scan-element-backward (from &optional up bound
)
337 "Scan backward from FROM over a single balanced element.
338 Point must be between tokens. Return the position of the end of
339 the tag that starts the element. `xmltok-start' will contain the
340 position of the start of the tag. If UP is non-nil, then scan
341 past start-tag of element containing point. If BOUND is non-nil,
342 then don't scan back past BOUND. If no element is found, return
343 nil. If a well-formedness error prevents scanning, signal an
344 `nxml-scan-error'. Point is not moved."
345 (let ((open-tags (and up t
))
349 (while (cond ((or (< (point) nxml-prolog-end
)
350 (not (search-backward "<"
354 (when (and (consp open-tags
) (not bound
))
355 (nxml-scan-error (cadr open-tags
)
356 "End-tag has no start-tag"))
359 (nxml-move-outside-backwards)
361 (nxml-tokenize-forward)
362 (setq token-end
(point)))
363 (eq xmltok-type
'end-tag
))
365 (cons (xmltok-end-tag-qname)
366 (cons xmltok-start open-tags
)))
368 ((eq xmltok-type
'start-tag
)
369 (cond ((not open-tags
) nil
)
370 ((not (consp open-tags
))
371 (setq found token-end
)
373 ((and (car open-tags
)
374 (not (string= (car open-tags
)
375 (xmltok-start-tag-qname))))
376 (nxml-scan-error (1+ xmltok-start
)
377 "Mismatched start-tag; \
380 ((setq open-tags
(cddr open-tags
)) t
)
381 (t (setq found token-end
) nil
)))
382 ((memq xmltok-type
'(empty-element
383 partial-empty-element
))
386 (setq found token-end
)
388 ((eq xmltok-type
'partial-end-tag
)
390 (cons nil
(cons xmltok-start open-tags
)))
392 ((eq xmltok-type
'partial-start-tag
)
393 ;; if we have only a partial-start-tag
394 ;; then it's unlikely that there's a matching
395 ;; end-tag, so it's probably not helpful
396 ;; to treat it as a complete start-tag
397 (nxml-scan-error xmltok-start
(defun nxml-scan-error (&rest args)
  "Signal an `nxml-scan-error' condition whose data is the list ARGS."
  (signal 'nxml-scan-error args))
;; Condition signaled by the function `nxml-scan-error'; its parent
;; condition is `nxml-error'.
(define-error 'nxml-scan-error
  "Scan over element that is not well-formed"
  'nxml-error)
410 ;;; nxml-rap.el ends here