Merge branch 'master' into comment-cache
[emacs.git] / lisp / nxml / nxml-rap.el
blob0132a2b9234a6cf6b65c89f70c6afa0194d9912c
1 ;;; nxml-rap.el --- low-level support for random access parsing for nXML mode -*- lexical-binding:t -*-
3 ;; Copyright (C) 2003-2004, 2007-2017 Free Software Foundation, Inc.
5 ;; Author: James Clark
6 ;; Keywords: wp, hypermedia, languages, XML
8 ;; This file is part of GNU Emacs.
10 ;; GNU Emacs is free software: you can redistribute it and/or modify
11 ;; it under the terms of the GNU General Public License as published by
12 ;; the Free Software Foundation, either version 3 of the License, or
13 ;; (at your option) any later version.
15 ;; GNU Emacs is distributed in the hope that it will be useful,
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;; GNU General Public License for more details.
20 ;; You should have received a copy of the GNU General Public License
21 ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
23 ;;; Commentary:
25 ;; This uses xmltok.el to do XML parsing. The fundamental problem is
26 ;; how to handle changes. We don't want to maintain a complete parse
27 ;; tree. We also don't want to reparse from the start of the document
28 ;; on every keystroke. However, it is not possible in general to
29 ;; parse an XML document correctly starting at a random point in the
30 ;; middle. The main problems are comments, CDATA sections and
31 ;; processing instructions: these can all contain things that are
32 ;; indistinguishable from elements. Literals in the prolog are also a
33 ;; problem. Attribute value literals are not a problem because
34 ;; attribute value literals cannot contain less-than signs.
36 ;; Our strategy is to keep track of just the problematic things.
37 ;; Specifically, we keep track of all comments, CDATA sections and
38 ;; processing instructions in the instance. We do this by marking all
39 ;; except the first character of these with a non-nil nxml-inside text
40 ;; property. The value of the nxml-inside property is comment,
41 ;; cdata-section or processing-instruction. The first character does
42 ;; not have the nxml-inside property so we can find the beginning of
43 ;; the construct by looking for a change in a text property value
44 ;; (Emacs provides primitives for this). We use text properties
45 ;; rather than overlays, since the implementation of overlays doesn't
46 ;; look like it scales to large numbers of overlays in a buffer.
48 ;; We don't in fact track all these constructs, but only track them in
49 ;; some initial part of the instance.
51 ;; Thus to parse some random point in the file we first ensure that we
52 ;; have scanned up to that point. Then we search backwards for a
53 ;; <. Then we check whether the < has an nxml-inside property. If it
54 ;; does we go backwards to first character that does not have an
55 ;; nxml-inside property (this character must be a <). Then we start
56 ;; parsing forward from the < we have found.
58 ;; The prolog has to be parsed specially, so we also keep track of the
59 ;; end of the prolog in `nxml-prolog-end'. The prolog is reparsed on
60 ;; every change to the prolog. This won't work well if people try to
61 ;; edit huge internal subsets. Hopefully that will be rare.
63 ;; We keep track of the changes by adding to the buffer's
64 ;; after-change-functions hook. Scanning is also done as a
65 ;; prerequisite to fontification by adding to fontification-functions
66 ;; (in the same way as jit-lock). This means that scanning for these
67 ;; constructs had better be quick. Fortunately it is. Firstly, the
68 ;; typical proportion of comments, CDATA sections and processing
69 ;; instructions is small relative to other things. Secondly, to scan
70 ;; we just search for the regexp <[!?].
72 ;;; Code:
74 (require 'xmltok)
75 (require 'nxml-util)
76 (require 'sgml-mode)
78 (defvar-local nxml-prolog-end nil
79 "Integer giving position following end of the prolog.")
81 (defsubst nxml-get-inside (pos)
82 (save-excursion (nth 8 (syntax-ppss pos))))
84 (defun nxml-inside-end (pos)
85 "Return the end of the inside region containing POS.
86 Return nil if the character at POS is not inside."
87 (save-excursion
88 (let ((ppss (syntax-ppss pos)))
89 (when (nth 8 ppss)
90 (goto-char (nth 8 ppss))
91 (with-syntax-table sgml-tag-syntax-table
92 (if (nth 3 ppss)
93 (progn (forward-comment 1) (point))
94 (or (scan-sexps (point) 1) (point-max))))))))
96 (defun nxml-inside-start (pos)
97 "Return the start of the inside region containing POS.
98 Return nil if the character at POS is not inside."
99 (save-excursion (nth 8 (syntax-ppss pos))))
101 ;;; Change management
103 ;; n-s-p only called from nxml-mode.el, where this variable is defined.
104 (defvar nxml-prolog-regions)
106 (defun nxml-scan-prolog ()
107 (goto-char (point-min))
108 (let (xmltok-dtd
109 xmltok-errors)
110 (setq nxml-prolog-regions (xmltok-forward-prolog))
111 (setq nxml-prolog-end (point))))
114 ;;; Random access parsing
116 (defun nxml-token-after ()
117 "Return the position after the token containing the char after point.
118 Sets up the variables `xmltok-type', `xmltok-start',
119 `xmltok-name-end', `xmltok-name-colon', `xmltok-attributes',
120 `xmltok-namespace-attributes' in the same was as does
121 `xmltok-forward'. The prolog will be treated as a single token with
122 type `prolog'."
123 (let ((pos (point)))
124 (if (< pos nxml-prolog-end)
125 (progn
126 (setq xmltok-type 'prolog
127 xmltok-start (point-min))
128 (min nxml-prolog-end (point-max)))
129 (nxml-ensure-scan-up-to-date)
130 (if (nxml-get-inside pos)
131 (save-excursion
132 (nxml-move-outside-backwards)
133 (xmltok-forward)
134 (point))
135 (save-excursion
136 (if (or (eq (char-after) ?<)
137 (search-backward "<"
138 (max (point-min) nxml-prolog-end)
140 (nxml-move-outside-backwards)
141 (goto-char (if (<= (point-min) nxml-prolog-end)
142 nxml-prolog-end
143 (or (nxml-inside-end (point-min))
144 (point-min)))))
145 (while (and (nxml-tokenize-forward)
146 (<= (point) pos)))
147 (point))))))
149 (defun nxml-token-before ()
150 "Return the position after the token containing the char before point.
151 Sets variables like `nxml-token-after'."
152 (if (/= (point-min) (point))
153 (save-excursion
154 (goto-char (1- (point)))
155 (nxml-token-after))
156 (setq xmltok-start (point))
157 (setq xmltok-type nil)
158 (point)))
160 (defun nxml-tokenize-forward ()
161 (let (xmltok-errors)
162 (xmltok-forward)
163 xmltok-type))
165 (defun nxml-move-tag-backwards (bound)
166 "Move point backwards outside any “inside” regions or tags.
167 Point will not move past `nxml-prolog-end'.
168 Point will either be at BOUND or a `<' character starting a tag
169 outside any “inside” regions.
170 As a precondition, point must be >= BOUND."
171 (nxml-move-outside-backwards)
172 (when (not (equal (char-after) ?<))
173 (if (search-backward "<" bound t)
174 (progn
175 (nxml-move-outside-backwards)
176 (when (not (equal (char-after) ?<))
177 (search-backward "<" bound t)))
178 (goto-char bound))))
180 (defun nxml-move-outside-backwards ()
181 "Move point to first character of the containing special thing.
182 Leave point unmoved if it is not inside anything special."
183 (let ((start (nxml-inside-start (point))))
184 (when start
185 (goto-char start)
186 (when (nxml-get-inside (point))
187 (error "Char before inside-start at %s is still \"inside\"" (point))))))
189 (defun nxml-ensure-scan-up-to-date ()
190 (syntax-propertize (point)))
192 ;;; Element scanning
194 (defun nxml-scan-element-forward (from &optional up)
195 "Scan forward from FROM over a single balanced element.
196 Point must be between tokens. Return the position of the end of
197 the tag that ends the element. `xmltok-start' will contain the
198 position of the start of the tag. If UP is non-nil, then scan
199 past end-tag of element containing point. If no element is
200 found, return nil. If a well-formedness error prevents scanning,
201 signal an `nxml-scan-error'. Point is not moved."
202 (let ((open-tags (and up t))
203 found)
204 (save-excursion
205 (goto-char from)
206 (while (cond ((not (nxml-tokenize-forward))
207 (when (consp open-tags)
208 (nxml-scan-error (cadr open-tags)
209 "Start-tag has no end-tag"))
210 nil)
211 ((eq xmltok-type 'start-tag)
212 (setq open-tags
213 (cons (xmltok-start-tag-qname)
214 (cons xmltok-start
215 open-tags)))
217 ((eq xmltok-type 'end-tag)
218 (cond ((not open-tags) nil)
219 ((not (consp open-tags)) (setq found (point)) nil)
220 ((not (string= (car open-tags)
221 (xmltok-end-tag-qname)))
222 (nxml-scan-error (+ 2 xmltok-start)
223 "Mismatched end-tag; \
224 expected `%s'"
225 (car open-tags)))
226 ((setq open-tags (cddr open-tags)) t)
227 (t (setq found (point)) nil)))
228 ((memq xmltok-type '(empty-element
229 partial-empty-element))
230 (if open-tags
232 (setq found (point))
233 nil))
234 ((eq xmltok-type 'partial-end-tag)
235 (cond ((not open-tags) nil)
236 ((not (consp open-tags)) (setq found (point)) nil)
237 ((setq open-tags (cddr open-tags)) t)
238 (t (setq found (point)) nil)))
239 ((eq xmltok-type 'partial-start-tag)
240 (nxml-scan-error xmltok-start
241 "Missing `>'"))
242 (t t))))
243 found))
245 (defun nxml-scan-element-backward (from &optional up bound)
246 "Scan backward from FROM over a single balanced element.
247 Point must be between tokens. Return the position of the end of
248 the tag that starts the element. `xmltok-start' will contain the
249 position of the start of the tag. If UP is non-nil, then scan
250 past start-tag of element containing point. If BOUND is non-nil,
251 then don't scan back past BOUND. If no element is found, return
252 nil. If a well-formedness error prevents scanning, signal an
253 `nxml-scan-error'. Point is not moved."
254 (let ((open-tags (and up t))
255 token-end found)
256 (save-excursion
257 (goto-char from)
258 (while (cond ((or (< (point) nxml-prolog-end)
259 (not (search-backward "<"
260 (max (or bound 0)
261 nxml-prolog-end)
262 t)))
263 (when (and (consp open-tags) (not bound))
264 (nxml-scan-error (cadr open-tags)
265 "End-tag has no start-tag"))
266 nil)
267 ((progn
268 (nxml-move-outside-backwards)
269 (save-excursion
270 (nxml-tokenize-forward)
271 (setq token-end (point)))
272 (eq xmltok-type 'end-tag))
273 (setq open-tags
274 (cons (xmltok-end-tag-qname)
275 (cons xmltok-start open-tags)))
277 ((eq xmltok-type 'start-tag)
278 (cond ((not open-tags) nil)
279 ((not (consp open-tags))
280 (setq found token-end)
281 nil)
282 ((and (car open-tags)
283 (not (string= (car open-tags)
284 (xmltok-start-tag-qname))))
285 (nxml-scan-error (1+ xmltok-start)
286 "Mismatched start-tag; \
287 expected `%s'"
288 (car open-tags)))
289 ((setq open-tags (cddr open-tags)) t)
290 (t (setq found token-end) nil)))
291 ((memq xmltok-type '(empty-element
292 partial-empty-element))
293 (if open-tags
295 (setq found token-end)
296 nil))
297 ((eq xmltok-type 'partial-end-tag)
298 (setq open-tags
299 (cons nil (cons xmltok-start open-tags)))
301 ((eq xmltok-type 'partial-start-tag)
302 ;; if we have only a partial-start-tag
303 ;; then it's unlikely that there's a matching
304 ;; end-tag, so it's probably not helpful
305 ;; to treat it as a complete start-tag
306 (nxml-scan-error xmltok-start
307 "Missing `>'"))
308 (t t))))
309 found))
311 (defun nxml-scan-error (&rest args)
312 (signal 'nxml-scan-error args))
314 (define-error 'nxml-scan-error
315 "Scan over element that is not well-formed" 'nxml-error)
317 (provide 'nxml-rap)
319 ;;; nxml-rap.el ends here