[lice @ add data-types.lisp]
[lice.git] / search.lisp
blobd1215c864481a90c199eff02b81983fec8f78069
1 (in-package "LICE")
3 ;; because gnu emacs' match-data is not reentrant we create this
4 ;; structure that is returned for all searching functions. It is
5 ;; passed into the match-data related functions.
6 (defstruct match-data
7 obj start end reg-starts reg-ends)
9 (defvar *match-data* nil
10 "store the match data for searches.")
12 (defvar *with-match-data* nil
13 "Set to true when inside a match-data block. If this is NIL
14 during one of the searches, a warning is signaled because it's
15 not thread safe. But, lots of code uses the search functions so
16 it's useful, at least now to be compatible with gnu emacs, even
17 if it's not thread safe. Never set this variable directly.")
19 (defmacro with-match-data (&body body)
20 `(let ((*with-match-data* t)
21 (*match-data* nil))
22 ,@body))
24 (defun match-end (idx &optional (data *match-data*))
25 "Return position of start of text matched by last search.
26 SUBEXP, a number, specifies which parenthesized expression in the last
27 regexp.
28 Value is nil if SUBEXPth pair didn't match, or there were less than
29 SUBEXP pairs.
30 Zero means the entire text matched by the whole regexp or whole string."
31 (if (zerop idx)
32 (match-data-end data)
33 (aref (match-data-reg-ends data) (1- idx))))
35 (defun match-beginning (idx &optional (data *match-data*))
36 "Return position of start of text matched by last search.
37 SUBEXP, a number, specifies which parenthesized expression in the last
38 regexp.
39 Value is nil if SUBEXPth pair didn't match, or there were less than
40 SUBEXP pairs.
41 Zero means the entire text matched by the whole regexp or whole string."
42 (if (zerop idx)
43 (match-data-start data)
44 (aref (match-data-reg-starts data) (1- idx))))
46 ;; FIXME: needs a formatter and the search string
47 (define-condition search-failed (lice-condition)
48 () (:documentation "raised when a search failed to match"))
50 (define-condition thread-unsafe (style-warning)
51 () (:documentation "Raised when a search is not threadsafe. See also `*with-match-data*'"))
53 (defun check-search-thread-safe ()
54 "Report a warning if the search is unsafe for threads."
55 (unless *with-match-data*
56 (signal 'thread-unsafe)))
58 (defun string-search-command (string bound error count direction)
59 (check-search-thread-safe)
60 (gap-move-to (current-buffer) (buffer-point-aref (current-buffer)))
61 ;; normalize vars
62 (setf count (* count direction)
63 bound (if (minusp count)
64 (if bound (max bound (begv)) (begv))
65 (if bound (min bound (zv)) (zv))))
66 (let* ((buffer (current-buffer))
67 pos
68 (start-aref (buffer-point-aref buffer))
69 (bound-aref (buffer-char-to-aref buffer bound))
70 (n (if (minusp count)
71 (loop for i from 0 below (- count)
72 do (setf pos (search string (buffer-data buffer) :from-end t :end2 start-aref :start2 bound-aref))
73 while pos
74 count i)
75 (loop for i from 0 below count
76 do (setf pos (search string (buffer-data buffer) :start2 start-aref :end2 bound-aref))
77 while pos
78 count i))))
79 (if (/= n (abs count))
80 (cond
81 ((eq error t)
82 (signal 'search-failed))
83 ((null error)
84 nil)
85 (bound
86 (set-point bound buffer)
87 nil)
88 (t nil))
89 (progn
90 (if (minusp count)
91 (set-point (+ (buffer-aref-to-char buffer pos) (length string)))
92 (set-point (buffer-aref-to-char buffer pos)))
93 (values (pt)
94 (setf *match-data*
95 (make-match-data :obj buffer
96 :start (buffer-aref-to-char buffer pos)
97 :end (+ (buffer-aref-to-char buffer pos) (length string))
98 :reg-starts #()
99 :reg-ends #())))))))
101 (defun search-forward (string &key bound (error t) (count 1))
102 "Search forward from point for string.
103 Set point to the end of the occurrence found, and return point.
104 An optional second argument bounds the search; it is a buffer position.
105 The match found must not extend after that position. nil is equivalent
106 to (point-max).
107 Optional third argument, if t, means if fail just return nil (no error).
108 If not nil and not t, move to limit of search and return nil.
109 Optional fourth argument is repeat count--search for successive occurrences.
111 Search case-sensitivity is determined by the value of the variable
112 `case-fold-search', which see.
114 See also the functions `match-beginning', `match-end' and `replace-match'."
115 (string-search-command string bound error count 1))
117 (defun search-backward (string &key bound (error t) (count 1))
118 "Search backward from point for STRING.
119 Set point to the beginning of the occurrence found, and return point.
120 An optional second argument bounds the search; it is a buffer position.
121 The match found must not extend before that position.
122 Optional third argument, if t, means if fail just return nil (no error).
123 If not nil and not t, position at limit of search and return nil.
124 Optional fourth argument is repeat count--search for successive occurrences.
126 Search case-sensitivity is determined by the value of the variable
127 `case-fold-search', which see.
129 See also the functions `match-beginning', `match-end' and `replace-match'."
130 (string-search-command string bound error count -1))
132 (defvar *regexp-cache* (make-memoize-state :test 'string=))
134 ;; TODO: create compiler-macros for regex functions so the regexps can
135 ;; be compiled at compile time.
137 (defun looking-at (regexp &optional (buffer (current-buffer)))
138 "Return the match-data if text after point matches regular expression regexp."
139 (check-type regexp string)
140 (check-search-thread-safe)
141 ;; get the gap outta the way. It sucks we have to do this. Really we
142 ;; should modify ppcre to generate scanner functions that hop the
143 ;; gap. Meantime...
144 (when (< (buffer-char-to-aref buffer (pt buffer))
145 (buffer-gap-start buffer))
146 (gap-move-to-point buffer))
147 (multiple-value-bind (start end reg-starts reg-ends)
148 (ppcre:scan (memoize *regexp-cache* regexp (ppcre:create-scanner regexp :multi-line-mode t)) (buffer-data buffer)
149 :start (buffer-char-to-aref buffer (pt buffer))
150 :real-start-pos 0)
151 (when (and start
152 (= start (buffer-char-to-aref buffer (pt buffer))))
153 (values t
154 (setf *match-data*
155 (make-match-data :obj buffer
156 :start (buffer-aref-to-char buffer start)
157 :end (buffer-aref-to-char buffer end)
158 :reg-starts (map 'vector (lambda (n)
159 (buffer-aref-to-char buffer n))
160 reg-starts)
161 :reg-ends (map 'vector (lambda (n)
162 (buffer-aref-to-char buffer n))
163 reg-ends)))))))
165 (defun re-search-forward (regexp &key (bound (zv)) (error t) count &aux (buffer (current-buffer)))
166 "Search forward from point for regular expression regexp.
167 Set point to the end of the occurrence found, and return match-data structure.
168 BOUND bounds the search; it is a buffer position.
169 The match found must not extend after that position.
170 ERROR, if nil, means if fail just return nil (no error).
171 If not nil and not t, move to limit of search and return nil.
172 COUNT is repeat count--search for successive occurrences.
173 See also the functions `match-beginning', `match-end', `match-string',
174 and `replace-match'."
175 (declare (ignore count))
176 (check-search-thread-safe)
177 (when (< (buffer-char-to-aref buffer (pt buffer))
178 (buffer-gap-start buffer))
179 (gap-move-to-point buffer))
180 (multiple-value-bind (start end reg-starts reg-ends)
181 (ppcre:scan (memoize *regexp-cache* regexp (ppcre:create-scanner regexp :multi-line-mode t)) (buffer-data buffer)
182 :start (buffer-char-to-aref buffer (pt buffer))
183 :end (buffer-char-to-aref buffer bound)
184 :real-start-pos 0)
185 (cond (start
186 (set-point (buffer-aref-to-char buffer end) buffer)
187 (values (pt)
188 (setf *match-data*
189 (make-match-data :obj buffer
190 :start (buffer-aref-to-char buffer start)
191 :end (buffer-aref-to-char buffer end)
192 :reg-starts (map 'vector (lambda (n)
193 (buffer-aref-to-char buffer n))
194 reg-starts)
195 :reg-ends (map 'vector (lambda (n)
196 (buffer-aref-to-char buffer n))
197 reg-ends)))))
198 ((eq error t)
199 (signal 'search-failed))
200 ((null error)
201 nil)
202 (bound
203 (set-point bound buffer)
204 nil)
205 (t nil))))
207 (defun re-search-backward (regexp &key (bound (begv)) (error t) count &aux (buffer (current-buffer)))
208 "Search backward from point for match for regular expression regexp.
209 Set point to the beginning of the match, and return match-data.
210 The match found is the one starting last in the buffer
211 and yet ending before the origin of the search.
212 BOUND bounds the search; it is a buffer position.
213 The match found must start at or after that position.
214 ERROR, if nil, means if fail just return nil (no error).
215 If not nil and not t, move to limit of search and return nil.
216 COUNT is repeat count--search for successive occurrences.
217 See also the functions `match-beginning', `match-end', `match-string',
218 and `replace-match'."
219 (declare (ignore count))
220 (check-search-thread-safe)
221 ;;(message "re-search-backward ~s ~d" regexp (point))
222 (when (> (buffer-gap-start buffer)
223 (buffer-char-to-aref buffer (pt buffer)))
224 (gap-move-to buffer (buffer-char-to-aref buffer (1+ (pt buffer)))))
225 ;; start search from point and keep walking back til we match something
226 (let* ((start-aref (buffer-char-to-aref buffer (pt buffer)))
227 (pt-aref start-aref)
228 (stop (buffer-char-to-aref buffer bound))
229 (scanner (memoize *regexp-cache* regexp (ppcre:create-scanner regexp :multi-line-mode t))))
230 (loop
231 (multiple-value-bind (start end reg-starts reg-ends)
232 (ppcre:scan scanner (buffer-data buffer) :start start-aref :end pt-aref :real-start-pos 0)
233 (when start
234 (set-point (buffer-aref-to-char buffer start) buffer)
235 (return (values (pt)
236 (setf *match-data*
237 (make-match-data :obj buffer
238 :start (buffer-aref-to-char buffer start)
239 :end (buffer-aref-to-char buffer end)
240 :reg-starts (map 'vector (lambda (n)
241 (buffer-aref-to-char buffer n))
242 reg-starts)
243 :reg-ends (map 'vector (lambda (n)
244 (buffer-aref-to-char buffer n))
245 reg-ends))))))
246 (dec-aref start-aref buffer)
247 (when (< start-aref stop)
248 (cond ((eq error t)
249 ;; FIXME: we need a search condition
250 (signal 'search-failed))
251 ((null error)
252 (return nil))
254 (when bound
255 (set-point bound buffer))
256 (return nil))))))))
258 (defun string-match (regexp string &key (start 0) (end (length string)))
259 "Return index of start of first match for regexp in string and match-data, or nil.
260 Matching ignores case if `case-fold-search' is non-nil.
261 START, start search at that index in string.
262 END, end search at that index in string.
263 **For index of first char beyond the match, do (match-end 0).
264 **`match-end' and `match-beginning' also give indices of substrings
265 **matched by parenthesis constructs in the pattern.
267 You can use the function `match-string' to extract the substrings
268 matched by the parenthesis constructions in regexp."
269 (check-search-thread-safe)
270 (multiple-value-bind (start end reg-starts reg-ends)
271 (ppcre:scan (memoize *regexp-cache* regexp (ppcre:create-scanner regexp :multi-line-mode t))
272 string :start start :end end)
273 (when start
274 (values start
275 (setf *match-data*
276 (make-match-data :obj string
277 :start start
278 :end end
279 :reg-starts reg-starts
280 :reg-ends reg-ends))))))
282 (defun regexp-quote (string)
283 "Return a regexp string which matches exactly STRING and nothing else."
284 (check-type string string)
285 (coerce
286 (loop for c across string
287 when (find c "[*.\\?+^$" :test 'char=)
288 collect #\\
289 collect c)
290 'string))
292 (defun scan-buffer (buffer target start end count)
293 "Search for COUNT instances of the character TARGET between START and END.
295 If COUNT is positive, search forwards; END must be >= START.
296 If COUNT is negative, search backwards for the -COUNTth instance;
297 END must be <= START.
298 If COUNT is zero, do anything you please; run rogue, for all I care.
300 If END is NIL, use BEGV or ZV instead, as appropriate for the
301 direction indicated by COUNT.
303 If we find COUNT instances, return the
304 position past the COUNTth match and 0. Note that for reverse motion
305 this is not the same as the usual convention for Emacs motion commands.
307 If we don't find COUNT instances before reaching END, return END
308 and the number of TARGETs left unfound."
309 (let ((shortage (abs count))
310 last)
311 (if (> count 0)
312 (setf end (or end (zv buffer)))
313 (setf end (or end (begv buffer))))
314 (setf start (buffer-char-to-aref buffer start)
315 end (buffer-char-to-aref buffer end))
316 (loop while (and (> count 0)
317 (/= start end)) do
318 (setf start
319 (if (< start (buffer-gap-start buffer))
320 (or (position target (buffer-data buffer) :start start :end (min end (buffer-gap-start buffer)))
321 (and (> end (gap-end buffer))
322 (position target (buffer-data buffer) :start (gap-end buffer) :end end)))
323 (position target (buffer-data buffer) :start start :end end)))
324 (if start
325 (setf start (1+ start)
326 last start
327 count (1- count)
328 shortage (1- shortage))
329 (setf start end)))
330 (loop while (and (< count 0)
331 (/= start end)) do
332 (setf start
333 (if (> start (buffer-gap-start buffer))
334 (or (position target (buffer-data buffer) :start (max end (gap-end buffer)) :end start :from-end t)
335 (and (< end (buffer-gap-start buffer))
336 (position target (buffer-data buffer) :start end :end (buffer-gap-start buffer) :from-end t)))
337 (position target (buffer-data buffer) :start end :end start :from-end t)))
338 (if start
339 (setf last (+ start 1) ; match emacs functionality
340 count (1+ count)
341 shortage (1- shortage))
342 (setf start end)))
343 (if (zerop count)
344 (values (and last (buffer-aref-to-char buffer last)) 0)
345 (values (buffer-aref-to-char buffer end) shortage))))
347 (defun find-before-next-newline (from to cnt)
348 "Like find_next_newline, but returns position before the newline,
349 not after, and only search up to TO. This isn't just
350 find_next_newline (...)-1, because you might hit TO."
351 (multiple-value-bind (pos shortage) (scan-buffer (current-buffer) #\Newline from to cnt)
352 (when (zerop shortage)
353 (decf pos))
354 pos))
356 (defun buffer-scan-newline (buf start limit count)
357 "Search BUF for COUNT newlines with a limiting point at LIMIT,
358 starting at START. Returns the point of the last newline or limit and
359 number of newlines found. START and LIMIT are inclusive."
360 (declare (type buffer buf)
361 (type integer start limit count))
362 (labels ((buffer-scan-bk (buf start limit count)
363 "count is always >=0. start >= limit."
364 (let* ((start-aref (buffer-char-to-aref buf start))
365 (limit-aref (buffer-char-to-aref buf limit))
366 (ceiling (if (>= start-aref (gap-end buf))
367 (max limit-aref (gap-end buf))
368 limit-aref))
369 (i 0)
370 ;; :END is not inclusive but START is.
371 (start (1+ start-aref))
373 (loop
374 ;; Always search at least once
375 (setf p (position #\Newline (buffer-data buf)
376 :start ceiling :end start :from-end t))
377 (if p
378 (progn
379 ;; Move start. Note that start isn't set to (1+ p)
380 ;; because we don't want to search p again.
381 (setf start p)
382 ;; Count the newline
383 (incf i)
384 ;; Have we found enough newlines?
385 (when (>= i count)
386 (return-from buffer-scan-bk (values (buffer-aref-to-char buf p)
387 i))))
388 ;; Check if we've searched up to the limit
389 (if (= ceiling limit-aref)
390 (return-from buffer-scan-bk (values limit i))
391 ;; if not, skip past the gap
392 (progn
393 (setf ceiling limit-aref)
394 (setf start (buffer-gap-start buf))))))))
395 (buffer-scan-fw (buf start limit count)
396 "count is always >=0. start >= limit."
397 (let* ((start-aref (buffer-char-to-aref buf start))
398 (limit-aref (1+ (buffer-char-to-aref buf limit)))
399 (ceiling (if (< start (buffer-gap-start buf))
400 (min limit-aref (buffer-gap-start buf))
401 limit-aref))
402 (i 0)
403 (start start-aref)
405 (loop
406 ;; Always search at least once
407 (setf p (position #\Newline (buffer-data buf) :start start :end ceiling))
408 (if p
409 (progn
410 ;; Move start. We don't want to search p again, thus the 1+.
411 (setf start (1+ p))
412 ;; Count the newline
413 (incf i)
414 ;; Have we found enough newlines?
415 (when (>= i count)
416 (return-from buffer-scan-fw (values (buffer-aref-to-char buf p)
417 i))))
418 ;; Check if we've searched up to the limit
419 (if (= ceiling limit-aref)
420 (return-from buffer-scan-fw (values limit i))
421 ;; if not, skip past the gap
422 (progn
423 (setf ceiling limit-aref)
424 (setf start (gap-end buf)))))))))
425 ;; make sure start and limit are within the bounds
426 (setf start (max 0 (min start (1- (buffer-size buf))))
427 limit (max 0 (min limit (1- (buffer-size buf)))))
428 ;; the search always fails on an empty buffer
429 (when (= (buffer-size buf) 0)
430 (return-from buffer-scan-newline (values limit 0)))
431 (cond ((> count 0)
432 (dformat +debug-vv+ "scan-fw ~a ~a ~a~%" start limit count)
433 (buffer-scan-fw buf start limit count))
434 ((< count 0)
435 (dformat +debug-vv+ "scan-bk ~a ~a ~a~%" start limit count)
436 (buffer-scan-bk buf start limit (abs count)))
437 ;; 0 means the newline before the beginning of the current
438 ;; line. We need to handle the case where we are on a newline.
440 (dformat +debug-vv+ "scan-0 ~a ~a ~a~%" start limit count)
441 (if (char= (buffer-char-after buf start) #\Newline)
442 (buffer-scan-bk buf start limit 2)
443 (buffer-scan-bk buf start limit 1))))))