1 ;;; nxml-rap.el --- low-level support for random access parsing for nXML mode
3 ;; Copyright (C) 2003, 2004, 2007, 2008 Free Software Foundation, Inc.
8 ;; This file is part of GNU Emacs.
10 ;; GNU Emacs is free software; you can redistribute it and/or modify
11 ;; it under the terms of the GNU General Public License as published by
12 ;; the Free Software Foundation; either version 3, or (at your option)
15 ;; GNU Emacs is distributed in the hope that it will be useful,
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;; GNU General Public License for more details.
20 ;; You should have received a copy of the GNU General Public License
21 ;; along with GNU Emacs; see the file COPYING. If not, write to the
22 ;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
23 ;; Boston, MA 02110-1301, USA.
27 ;; This uses xmltok.el to do XML parsing. The fundamental problem is
28 ;; how to handle changes. We don't want to maintain a complete parse
29 ;; tree. We also don't want to reparse from the start of the document
30 ;; on every keystroke. However, it is not possible in general to
31 ;; parse an XML document correctly starting at a random point in the
32 ;; middle. The main problems are comments, CDATA sections and
33 ;; processing instructions: these can all contain things that are
34 ;; indistinguishable from elements. Literals in the prolog are also a
35 ;; problem. Attribute value literals are not a problem because
36 ;; attribute value literals cannot contain less-than signs.
38 ;; Our strategy is to keep track of just the problematic things.
39 ;; Specifically, we keep track of all comments, CDATA sections and
40 ;; processing instructions in the instance. We do this by marking all
41 ;; except the first character of these with a non-nil nxml-inside text
42 ;; property. The value of the nxml-inside property is comment,
43 ;; cdata-section or processing-instruction. The first character does
44 ;; not have the nxml-inside property so we can find the beginning of
45 ;; the construct by looking for a change in a text property value
46 ;; (Emacs provides primitives for this). We use text properties
47 ;; rather than overlays, since the implementation of overlays doesn't
48 ;; look like it scales to large numbers of overlays in a buffer.
50 ;; We don't in fact track all these constructs, but only track them in
51 ;; some initial part of the instance. The variable `nxml-scan-end'
52 ;; contains the limit of where we have scanned up to for them.
54 ;; Thus to parse some random point in the file we first ensure that we
55 ;; have scanned up to that point. Then we search backwards for a
56 ;; <. Then we check whether the < has an nxml-inside property. If it
57 ;; does we go backwards to the first character that does not have an
58 ;; nxml-inside property (this character must be a <). Then we start
59 ;; parsing forward from the < we have found.
61 ;; The prolog has to be parsed specially, so we also keep track of the
62 ;; end of the prolog in `nxml-prolog-end'. The prolog is reparsed on
63 ;; every change to the prolog. This won't work well if people try to
64 ;; edit huge internal subsets. Hopefully that will be rare.
66 ;; We keep track of the changes by adding to the buffer's
67 ;; after-change-functions hook. Scanning is also done as a
68 ;; prerequisite to fontification by adding to fontification-functions
69 ;; (in the same way as jit-lock). This means that scanning for these
70 ;; constructs had better be quick. Fortunately it is. Firstly, the
71 ;; typical proportion of comments, CDATA sections and processing
72 ;; instructions is small relative to other things. Secondly, to scan
73 ;; we just search for the regexp <[!?].
75 ;; One problem is unclosed comments, processing instructions and CDATA
76 ;; sections. Suppose, for example, we encounter a <!-- but there's no
77 ;; matching -->. This is not an unexpected situation if the user is
78 ;; creating a comment. It is not helpful to treat the whole of the
79 ;; file starting from the <!-- onwards as a single unclosed comment
80 ;; token. Instead we treat just the <!-- as a piece of not well-formed
81 ;; markup and continue. The problem is that if at some later stage a
82 ;; --> gets added to the buffer after the unclosed <!--, we will need
83 ;; to reparse the buffer starting from the <!--. We need to keep
84 ;; track of these reparse dependencies; they are called dependent
85 ;; regions in the code.
(defvar nxml-prolog-end nil
  "Buffer position (an integer) immediately following the end of the prolog.")
;; Every buffer keeps its own value once the variable is set there.
(make-variable-buffer-local 'nxml-prolog-end)
(defvar nxml-scan-end nil
  "Marker giving the position up to which the buffer has been scanned.
The following invariants must hold:

- nxml-scan-end must be >= nxml-prolog-end.
- nxml-scan-end must not be an inside position.  A position is
  inside if the following character is a part of, but not the
  first character of, a CDATA section, comment or processing
  instruction.
- All positions >= nxml-prolog-end and < nxml-scan-end that are
  inside positions must have a non-nil nxml-inside property whose
  value is a symbol specifying what it is inside.
- Any characters with a non-nil fontified property must have
  position < nxml-scan-end and the correct face.
- Dependent regions must be established for any unclosed
  constructs starting before nxml-scan-end.
- There must be no nxml-inside properties after nxml-scan-end.")
;; Every buffer keeps its own value once the variable is set there.
(make-variable-buffer-local 'nxml-scan-end)
(defsubst nxml-get-inside (pos)
  "Return the value of the nxml-inside text property at POS.
The result is nil when the character at POS has no such property."
  (get-text-property pos 'nxml-inside))
(defsubst nxml-clear-inside (start end)
  "Remove the nxml-inside text property between START and END.
Return the value of `remove-text-properties': non-nil if a property
was actually removed, nil otherwise."
  (remove-text-properties start end '(nxml-inside nil)))
(defsubst nxml-set-inside (start end type)
  "Set the nxml-inside text property to TYPE between START and END."
  (put-text-property start end 'nxml-inside type))
120 (defun nxml-inside-end (pos)
121 "Return the end of the inside region containing POS.
122 Return nil if the character at POS is not inside."
123 (if (nxml-get-inside pos
)
124 (or (next-single-property-change pos
'nxml-inside
)
128 (defun nxml-inside-start (pos)
129 "Return the start of the inside region containing POS.
130 Return nil if the character at POS is not inside."
131 (if (nxml-get-inside pos
)
132 (or (previous-single-property-change (1+ pos
) 'nxml-inside
)
136 ;;; Change management
138 (defun nxml-scan-after-change (start end
)
139 "Restore `nxml-scan-end' invariants after a change.
140 The change happened between START and END.
141 Return position after which lexical state is unchanged.
142 END must be > nxml-prolog-end."
143 (if (>= start nxml-scan-end
)
146 (nxml-move-outside-backwards)
148 (let ((inside-remove-start start
)
150 xmltok-dependent-regions
)
151 (while (or (when (xmltok-forward-special (min end nxml-scan-end
))
152 (when (memq xmltok-type
155 processing-instruction
))
156 (nxml-clear-inside inside-remove-start
158 (nxml-set-inside (1+ xmltok-start
)
161 (setq inside-remove-start
(point)))
162 (if (< (point) (min end nxml-scan-end
))
166 ;; The end of the change was inside but is now outside.
167 ;; Imagine something really weird like
168 ;; <![CDATA[foo <!-- bar ]]> <![CDATA[ stuff --> <!-- ]]> -->
169 ;; and suppose we deleted "<![CDATA[f"
170 (let ((inside-end (nxml-inside-end end
)))
172 (setq end inside-end
)
174 (nxml-clear-inside inside-remove-start end
)
175 (nxml-clear-dependent-regions start end
)
176 (nxml-mark-parse-dependent-regions))
177 (when (> end nxml-scan-end
)
178 (set-marker nxml-scan-end end
))
;; `nxml-scan-prolog' is only called from nxml-mode.el, which is where
;; this variable is actually defined.  This is a value-less declaration:
;; it gives the variable no value here.
(defvar nxml-prolog-regions)
184 (defun nxml-scan-prolog ()
185 (goto-char (point-min))
188 xmltok-dependent-regions
)
189 (setq nxml-prolog-regions
(xmltok-forward-prolog))
190 (setq nxml-prolog-end
(point))
191 (nxml-clear-inside (point-min) nxml-prolog-end
)
192 (nxml-clear-dependent-regions (point-min) nxml-prolog-end
)
193 (nxml-mark-parse-dependent-regions))
194 (when (< nxml-scan-end nxml-prolog-end
)
195 (set-marker nxml-scan-end nxml-prolog-end
)))
198 ;;; Dependent regions
200 (defun nxml-adjust-start-for-dependent-regions (start end pre-change-length
)
201 (let ((overlays (overlays-in (1- start
) start
))
202 (adjusted-start start
))
204 (let* ((overlay (car overlays
))
205 (ostart (overlay-start overlay
)))
206 (when (and (eq (overlay-get overlay
'category
) 'nxml-dependent
)
207 (< ostart adjusted-start
))
208 (let ((funargs (overlay-get overlay
'nxml-funargs
)))
209 (when (apply (car funargs
)
214 (overlay-end overlay
))
216 (setq adjusted-start ostart
)))))
217 (setq overlays
(cdr overlays
)))
(defun nxml-mark-parse-dependent-regions ()
  "Mark a dependent region for each entry in `xmltok-dependent-regions'.
Each element of that list is passed as the argument list of
`nxml-mark-parse-dependent-region'.  An element is dropped from the
list once it has been processed, so `xmltok-dependent-regions' is
nil when this function returns.  Return nil."
  (while xmltok-dependent-regions
    (apply #'nxml-mark-parse-dependent-region
           (car xmltok-dependent-regions))
    ;; Drop the entry only after it has been marked successfully.
    (pop xmltok-dependent-regions)))
(defun nxml-mark-parse-dependent-region (fun start end &rest args)
  "Record a dependent region from START to END as an overlay.
The overlay is given the category `nxml-dependent', and its
nxml-funargs property is set to the list (FUN . ARGS).
Return that list."
  ;; The trailing `t t' are the FRONT-ADVANCE and REAR-ADVANCE arguments
  ;; of `make-overlay'.
  (let ((region (make-overlay start end nil t t))
        (funargs (cons fun args)))
    (overlay-put region 'category 'nxml-dependent)
    (overlay-put region 'nxml-funargs funargs)))
;; Overlays take default properties from their category symbol, so every
;; overlay whose category is `nxml-dependent' gets `evaporate' set and
;; is deleted automatically once it becomes empty.
(put 'nxml-dependent 'evaporate t)
234 (defun nxml-clear-dependent-regions (start end
)
235 (let ((overlays (overlays-in start end
)))
237 (let* ((overlay (car overlays
))
238 (category (overlay-get overlay
'category
)))
239 (when (and (eq category
'nxml-dependent
)
240 (<= start
(overlay-start overlay
)))
241 (delete-overlay overlay
)))
242 (setq overlays
(cdr overlays
)))))
244 ;;; Random access parsing
246 (defun nxml-token-after ()
247 "Return the position after the token containing the char after point.
248 Sets up the variables `xmltok-type', `xmltok-start',
249 `xmltok-name-end', `xmltok-name-colon', `xmltok-attributes',
250 `xmltok-namespace-attributes' in the same was as does
251 `xmltok-forward'. The prolog will be treated as a single token with
254 (if (< pos nxml-prolog-end
)
256 (setq xmltok-type
'prolog
257 xmltok-start
(point-min))
258 (min nxml-prolog-end
(point-max)))
259 (nxml-ensure-scan-up-to-date)
260 (if (nxml-get-inside pos
)
262 (nxml-move-outside-backwards)
266 (if (or (eq (char-after) ?
<)
268 (max (point-min) nxml-prolog-end
)
270 (nxml-move-outside-backwards)
271 (goto-char (if (<= (point-min) nxml-prolog-end
)
273 (or (nxml-inside-end (point-min))
275 (while (and (nxml-tokenize-forward)
279 (defun nxml-token-before ()
280 "Return the position after the token containing the char before point.
281 Sets variables like `nxml-token-after'."
282 (if (/= (point-min) (point))
284 (goto-char (1- (point)))
286 (setq xmltok-start
(point))
287 (setq xmltok-type nil
)
290 (defun nxml-tokenize-forward ()
291 (let (xmltok-dependent-regions
293 (when (and (xmltok-forward)
294 (> (point) nxml-scan-end
))
295 (cond ((memq xmltok-type
'(comment
297 processing-instruction
))
298 (nxml-with-unmodifying-text-property-changes
299 (nxml-set-inside (1+ xmltok-start
) (point) xmltok-type
)))
300 (xmltok-dependent-regions
301 (nxml-mark-parse-dependent-regions)))
302 (set-marker nxml-scan-end
(point)))
305 (defun nxml-move-outside-backwards ()
306 "Move point to first character of the containing special thing.
307 Leave point unmoved if it is not inside anything special."
308 (let ((start (nxml-inside-start (point))))
310 (goto-char (1- start
))
311 (when (nxml-get-inside (point))
312 (error "Char before inside-start at %s had nxml-inside property %s"
314 (nxml-get-inside (point)))))))
316 (defun nxml-ensure-scan-up-to-date ()
318 (when (< nxml-scan-end pos
)
320 (goto-char nxml-scan-end
)
322 xmltok-dependent-regions
)
323 (while (when (xmltok-forward-special pos
)
324 (when (memq xmltok-type
326 processing-instruction
328 (nxml-with-unmodifying-text-property-changes
329 (nxml-set-inside (1+ xmltok-start
)
336 (nxml-clear-dependent-regions nxml-scan-end pos
)
337 (nxml-mark-parse-dependent-regions)
338 (set-marker nxml-scan-end pos
))))))
342 (defun nxml-scan-element-forward (from &optional up
)
343 "Scan forward from FROM over a single balanced element.
344 Point must between tokens. Return the position of the end of the tag
345 that ends the element. `xmltok-start' will contain the position of the
346 start of the tag. If UP is non-nil, then scan past end-tag of element
347 containing point. If no element is found, return nil. If a
348 well-formedness error prevents scanning, signal an nxml-scan-error.
350 (let ((open-tags (and up t
))
354 (while (cond ((not (nxml-tokenize-forward))
355 (when (consp open-tags
)
356 (nxml-scan-error (cadr open-tags
)
357 "Start-tag has no end-tag"))
359 ((eq xmltok-type
'start-tag
)
361 (cons (xmltok-start-tag-qname)
365 ((eq xmltok-type
'end-tag
)
366 (cond ((not open-tags
) nil
)
367 ((not (consp open-tags
)) (setq found
(point)) nil
)
368 ((not (string= (car open-tags
)
369 (xmltok-end-tag-qname)))
370 (nxml-scan-error (+ 2 xmltok-start
)
371 "Mismatched end-tag; \
374 ((setq open-tags
(cddr open-tags
)) t
)
375 (t (setq found
(point)) nil
)))
376 ((memq xmltok-type
'(empty-element
377 partial-empty-element
))
382 ((eq xmltok-type
'partial-end-tag
)
383 (cond ((not open-tags
) nil
)
384 ((not (consp open-tags
)) (setq found
(point)) nil
)
385 ((setq open-tags
(cddr open-tags
)) t
)
386 (t (setq found
(point)) nil
)))
387 ((eq xmltok-type
'partial-start-tag
)
388 (nxml-scan-error xmltok-start
393 (defun nxml-scan-element-backward (from &optional up bound
)
394 "Scan backward from FROM over a single balanced element.
395 Point must between tokens. Return the position of the end of the tag
396 that starts the element. `xmltok-start' will contain the position of
397 the start of the tag. If UP is non-nil, then scan past start-tag of
398 element containing point. If BOUND is non-nil, then don't scan back
399 past BOUND. If no element is found, return nil. If a well-formedness
400 error prevents scanning, signal an nxml-scan-error. Point is not
402 (let ((open-tags (and up t
))
406 (while (cond ((or (< (point) nxml-prolog-end
)
407 (not (search-backward "<"
411 (when (and (consp open-tags
) (not bound
))
412 (nxml-scan-error (cadr open-tags
)
413 "End-tag has no start-tag"))
416 (nxml-move-outside-backwards)
418 (nxml-tokenize-forward)
419 (setq token-end
(point)))
420 (eq xmltok-type
'end-tag
))
422 (cons (xmltok-end-tag-qname)
423 (cons xmltok-start open-tags
)))
425 ((eq xmltok-type
'start-tag
)
426 (cond ((not open-tags
) nil
)
427 ((not (consp open-tags
))
428 (setq found token-end
)
430 ((and (car open-tags
)
431 (not (string= (car open-tags
)
432 (xmltok-start-tag-qname))))
433 (nxml-scan-error (1+ xmltok-start
)
434 "Mismatched start-tag; \
437 ((setq open-tags
(cddr open-tags
)) t
)
438 (t (setq found token-end
) nil
)))
439 ((memq xmltok-type
'(empty-element
440 partial-empty-element
))
443 (setq found token-end
)
445 ((eq xmltok-type
'partial-end-tag
)
447 (cons nil
(cons xmltok-start open-tags
)))
449 ((eq xmltok-type
'partial-start-tag
)
450 ;; if we have only a partial-start-tag
451 ;; then it's unlikely that there's a matching
452 ;; end-tag, so it's probably not helpful
453 ;; to treat it as a complete start-tag
454 (nxml-scan-error xmltok-start
(defun nxml-scan-error (&rest args)
  "Signal an `nxml-scan-error' with ARGS as the error data.
Callers in this file pass a buffer position followed by a message
string.  This function does not return."
  (signal 'nxml-scan-error args))
462 (put 'nxml-scan-error
464 '(error nxml-error nxml-scan-error
))
466 (put 'nxml-scan-error
468 "Scan over element that is not well-formed")
472 ;; arch-tag: cba241ec-4c59-4ef3-aa51-2cf92b3dd24f
473 ;;; nxml-rap.el ends here