1 ;;; nxml-rap.el --- low-level support for random access parsing for nXML mode -*- lexical-binding:t -*-
3 ;; Copyright (C) 2003-2004, 2007-2017 Free Software Foundation, Inc.
6 ;; Keywords: wp, hypermedia, languages, XML
8 ;; This file is part of GNU Emacs.
10 ;; GNU Emacs is free software: you can redistribute it and/or modify
11 ;; it under the terms of the GNU General Public License as published by
12 ;; the Free Software Foundation, either version 3 of the License, or
13 ;; (at your option) any later version.
15 ;; GNU Emacs is distributed in the hope that it will be useful,
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;; GNU General Public License for more details.
20 ;; You should have received a copy of the GNU General Public License
21 ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
25 ;; This uses xmltok.el to do XML parsing. The fundamental problem is
26 ;; how to handle changes. We don't want to maintain a complete parse
27 ;; tree. We also don't want to reparse from the start of the document
28 ;; on every keystroke. However, it is not possible in general to
29 ;; parse an XML document correctly starting at a random point in the
30 ;; middle. The main problems are comments, CDATA sections and
31 ;; processing instructions: these can all contain things that are
32 ;; indistinguishable from elements. Literals in the prolog are also a
33 ;; problem. Attribute value literals are not a problem because
34 ;; attribute value literals cannot contain less-than signs.
36 ;; Our strategy is to keep track of just the problematic things.
37 ;; Specifically, we keep track of all comments, CDATA sections and
38 ;; processing instructions in the instance. We do this by marking all
39 ;; except the first character of these with a non-nil nxml-inside text
40 ;; property. The value of the nxml-inside property is comment,
41 ;; cdata-section or processing-instruction. The first character does
42 ;; not have the nxml-inside property so we can find the beginning of
43 ;; the construct by looking for a change in a text property value
44 ;; (Emacs provides primitives for this). We use text properties
45 ;; rather than overlays, since the implementation of overlays doesn't
46 ;; look like it scales to large numbers of overlays in a buffer.
48 ;; We don't in fact track all these constructs, but only track them in
49 ;; some initial part of the instance.
51 ;; Thus to parse some random point in the file we first ensure that we
52 ;; have scanned up to that point. Then we search backwards for a
53 ;; <. Then we check whether the < has an nxml-inside property. If it
54 ;; does we go backwards to first character that does not have an
55 ;; nxml-inside property (this character must be a <). Then we start
56 ;; parsing forward from the < we have found.
58 ;; The prolog has to be parsed specially, so we also keep track of the
59 ;; end of the prolog in `nxml-prolog-end'. The prolog is reparsed on
60 ;; every change to the prolog. This won't work well if people try to
61 ;; edit huge internal subsets. Hopefully that will be rare.
63 ;; We keep track of the changes by adding to the buffer's
64 ;; after-change-functions hook. Scanning is also done as a
65 ;; prerequisite to fontification by adding to fontification-functions
66 ;; (in the same way as jit-lock). This means that scanning for these
67 ;; constructs had better be quick. Fortunately it is. Firstly, the
68 ;; typical proportion of comments, CDATA sections and processing
69 ;; instructions is small relative to other things. Secondly, to scan
70 ;; we just search for the regexp <[!?].
78 (defvar-local nxml-prolog-end nil
79 "Integer giving position following end of the prolog.")
81 (defsubst nxml-get-inside
(pos)
82 (save-excursion (nth 8 (syntax-ppss pos
))))
84 (defun nxml-inside-end (pos)
85 "Return the end of the inside region containing POS.
86 Return nil if the character at POS is not inside."
88 (let ((ppss (syntax-ppss pos
)))
90 (goto-char (nth 8 ppss
))
91 (with-syntax-table sgml-tag-syntax-table
93 (progn (forward-comment 1) (point))
94 (or (scan-sexps (point) 1) (point-max))))))))
96 (defun nxml-inside-start (pos)
97 "Return the start of the inside region containing POS.
98 Return nil if the character at POS is not inside."
99 (save-excursion (nth 8 (syntax-ppss pos
))))
101 ;;; Change management
103 ;; n-s-p only called from nxml-mode.el, where this variable is defined.
104 (defvar nxml-prolog-regions
)
106 (defun nxml-scan-prolog ()
107 (goto-char (point-min))
110 (setq nxml-prolog-regions
(xmltok-forward-prolog))
111 (setq nxml-prolog-end
(point))))
114 ;;; Random access parsing
116 (defun nxml-token-after ()
117 "Return the position after the token containing the char after point.
118 Sets up the variables `xmltok-type', `xmltok-start',
119 `xmltok-name-end', `xmltok-name-colon', `xmltok-attributes',
120 `xmltok-namespace-attributes' in the same was as does
121 `xmltok-forward'. The prolog will be treated as a single token with
124 (if (< pos nxml-prolog-end
)
126 (setq xmltok-type
'prolog
127 xmltok-start
(point-min))
128 (min nxml-prolog-end
(point-max)))
129 (nxml-ensure-scan-up-to-date)
130 (if (nxml-get-inside pos
)
132 (nxml-move-outside-backwards)
136 (if (or (eq (char-after) ?
<)
138 (max (point-min) nxml-prolog-end
)
140 (nxml-move-outside-backwards)
141 (goto-char (if (<= (point-min) nxml-prolog-end
)
143 (or (nxml-inside-end (point-min))
145 (while (and (nxml-tokenize-forward)
149 (defun nxml-token-before ()
150 "Return the position after the token containing the char before point.
151 Sets variables like `nxml-token-after'."
152 (if (/= (point-min) (point))
154 (goto-char (1- (point)))
156 (setq xmltok-start
(point))
157 (setq xmltok-type nil
)
160 (defun nxml-tokenize-forward ()
165 (defun nxml-move-tag-backwards (bound)
166 "Move point backwards outside any “inside” regions or tags.
167 Point will not move past `nxml-prolog-end'.
168 Point will either be at BOUND or a `<' character starting a tag
169 outside any “inside” regions.
170 As a precondition, point must be >= BOUND."
171 (nxml-move-outside-backwards)
172 (when (not (equal (char-after) ?
<))
173 (if (search-backward "<" bound t
)
175 (nxml-move-outside-backwards)
176 (when (not (equal (char-after) ?
<))
177 (search-backward "<" bound t
)))
180 (defun nxml-move-outside-backwards ()
181 "Move point to first character of the containing special thing.
182 Leave point unmoved if it is not inside anything special."
183 (let ((start (nxml-inside-start (point))))
186 (when (nxml-get-inside (point))
187 (error "Char before inside-start at %s is still \"inside\"" (point))))))
189 (defun nxml-ensure-scan-up-to-date ()
190 (syntax-propertize (point)))
194 (defun nxml-scan-element-forward (from &optional up
)
195 "Scan forward from FROM over a single balanced element.
196 Point must be between tokens. Return the position of the end of
197 the tag that ends the element. `xmltok-start' will contain the
198 position of the start of the tag. If UP is non-nil, then scan
199 past end-tag of element containing point. If no element is
200 found, return nil. If a well-formedness error prevents scanning,
201 signal an `nxml-scan-error'. Point is not moved."
202 (let ((open-tags (and up t
))
206 (while (cond ((not (nxml-tokenize-forward))
207 (when (consp open-tags
)
208 (nxml-scan-error (cadr open-tags
)
209 "Start-tag has no end-tag"))
211 ((eq xmltok-type
'start-tag
)
213 (cons (xmltok-start-tag-qname)
217 ((eq xmltok-type
'end-tag
)
218 (cond ((not open-tags
) nil
)
219 ((not (consp open-tags
)) (setq found
(point)) nil
)
220 ((not (string= (car open-tags
)
221 (xmltok-end-tag-qname)))
222 (nxml-scan-error (+ 2 xmltok-start
)
223 "Mismatched end-tag; \
226 ((setq open-tags
(cddr open-tags
)) t
)
227 (t (setq found
(point)) nil
)))
228 ((memq xmltok-type
'(empty-element
229 partial-empty-element
))
234 ((eq xmltok-type
'partial-end-tag
)
235 (cond ((not open-tags
) nil
)
236 ((not (consp open-tags
)) (setq found
(point)) nil
)
237 ((setq open-tags
(cddr open-tags
)) t
)
238 (t (setq found
(point)) nil
)))
239 ((eq xmltok-type
'partial-start-tag
)
240 (nxml-scan-error xmltok-start
245 (defun nxml-scan-element-backward (from &optional up bound
)
246 "Scan backward from FROM over a single balanced element.
247 Point must be between tokens. Return the position of the end of
248 the tag that starts the element. `xmltok-start' will contain the
249 position of the start of the tag. If UP is non-nil, then scan
250 past start-tag of element containing point. If BOUND is non-nil,
251 then don't scan back past BOUND. If no element is found, return
252 nil. If a well-formedness error prevents scanning, signal an
253 `nxml-scan-error'. Point is not moved."
254 (let ((open-tags (and up t
))
258 (while (cond ((or (< (point) nxml-prolog-end
)
259 (not (search-backward "<"
263 (when (and (consp open-tags
) (not bound
))
264 (nxml-scan-error (cadr open-tags
)
265 "End-tag has no start-tag"))
268 (nxml-move-outside-backwards)
270 (nxml-tokenize-forward)
271 (setq token-end
(point)))
272 (eq xmltok-type
'end-tag
))
274 (cons (xmltok-end-tag-qname)
275 (cons xmltok-start open-tags
)))
277 ((eq xmltok-type
'start-tag
)
278 (cond ((not open-tags
) nil
)
279 ((not (consp open-tags
))
280 (setq found token-end
)
282 ((and (car open-tags
)
283 (not (string= (car open-tags
)
284 (xmltok-start-tag-qname))))
285 (nxml-scan-error (1+ xmltok-start
)
286 "Mismatched start-tag; \
289 ((setq open-tags
(cddr open-tags
)) t
)
290 (t (setq found token-end
) nil
)))
291 ((memq xmltok-type
'(empty-element
292 partial-empty-element
))
295 (setq found token-end
)
297 ((eq xmltok-type
'partial-end-tag
)
299 (cons nil
(cons xmltok-start open-tags
)))
301 ((eq xmltok-type
'partial-start-tag
)
302 ;; if we have only a partial-start-tag
303 ;; then it's unlikely that there's a matching
304 ;; end-tag, so it's probably not helpful
305 ;; to treat it as a complete start-tag
306 (nxml-scan-error xmltok-start
311 (defun nxml-scan-error (&rest args
)
312 (signal 'nxml-scan-error args
))
314 (define-error 'nxml-scan-error
315 "Scan over element that is not well-formed" 'nxml-error
)
319 ;;; nxml-rap.el ends here