[lice @ shit loads of stuff]
[lice.git] / src / search.lisp
blob01736caddf1161cf02d076b4b5b34f18ae08ce30
1 (in-package "LICE")
3 ;; because gnu emacs' match-data is not reentrant we create this
4 ;; structure that is returned for all searching functions. It is
5 ;; passed into the match-data related functions.
6 (defstruct match-data
7 obj start end reg-starts reg-ends)
9 (defvar *match-data* nil
10 "store the match data for searches.")
12 (defvar *with-match-data* nil
13 "Set to true when inside a match-data block. If this is NIL
14 during one of the searches, a warning is signaled because it's
15 not thread safe. But, lots of code uses the search functions so
16 it's useful, at least now to be compatible with gnu emacs, even
17 if it's not thread safe. Never set this variable directly.")
19 (defmacro with-match-data (&body body)
20 `(let ((*with-match-data* t)
21 (*match-data* nil))
22 ,@body))
24 (defun match-end (idx &optional (data *match-data*))
25 "Return position of start of text matched by last search.
26 SUBEXP, a number, specifies which parenthesized expression in the last
27 regexp.
28 Value is nil if SUBEXPth pair didn't match, or there were less than
29 SUBEXP pairs.
30 Zero means the entire text matched by the whole regexp or whole string."
31 (if (zerop idx)
32 (match-data-end data)
33 (aref (match-data-reg-ends data) (1- idx))))
35 (defun match-beginning (idx &optional (data *match-data*))
36 "Return position of start of text matched by last search.
37 SUBEXP, a number, specifies which parenthesized expression in the last
38 regexp.
39 Value is nil if SUBEXPth pair didn't match, or there were less than
40 SUBEXP pairs.
41 Zero means the entire text matched by the whole regexp or whole string."
42 (if (zerop idx)
43 (match-data-start data)
44 (aref (match-data-reg-starts data) (1- idx))))
46 (defun match-string (num &optional string)
47 "Return string of text matched by last search.
48 NUM specifies which parenthesized expression in the last regexp.
49 Value is nil if NUMth pair didn't match, or there were less than NUM pairs.
50 Zero means the entire text matched by the whole regexp or whole string.
51 STRING should be given if the last search was by `string-match' on STRING."
52 (if (match-beginning num)
53 (if string
54 (substring string (match-beginning num) (match-end num))
55 (buffer-substring (match-beginning num) (match-end num)))))
57 (defun replace-match (newtext &optional fixedcase literal string subexp)
58 "Replace text matched by last search with NEWTEXT.
59 Leave point at the end of the replacement text.
61 If second arg FIXEDCASE is non-nil, do not alter case of replacement text.
62 Otherwise maybe capitalize the whole text, or maybe just word initials,
63 based on the replaced text.
64 If the replaced text has only capital letters
65 and has at least one multiletter word, convert NEWTEXT to all caps.
66 Otherwise if all words are capitalized in the replaced text,
67 capitalize each word in NEWTEXT.
69 If third arg LITERAL is non-nil, insert NEWTEXT literally.
70 Otherwise treat `\\' as special:
71 `\\&' in NEWTEXT means substitute original matched text.
72 `\\N' means substitute what matched the Nth `\\(...\\)'.
73 If Nth parens didn't match, substitute nothing.
74 `\\\\' means insert one `\\'.
75 Case conversion does not apply to these substitutions.
77 FIXEDCASE and LITERAL are optional arguments.
79 The optional fourth argument STRING can be a string to modify.
80 This is meaningful when the previous match was done against STRING,
81 using `string-match'. When used this way, `replace-match'
82 creates and returns a new string made by copying STRING and replacing
83 the part of STRING that was matched.
85 The optional fifth argument SUBEXP specifies a subexpression;
86 it says to replace just that subexpression with NEWTEXT,
87 rather than replacing the entire matched text.
88 This is, in a vague sense, the inverse of using `\\N' in NEWTEXT;
89 `\\N' copies subexp N into NEWTEXT, but using N as SUBEXP puts
90 NEWTEXT in place of subexp N.
91 This is useful only after a regular expression search or match,
92 since only regular expressions have distinguished subexpressions."
93 (declare (ignore newtext fixedcase literal string subexp))
94 (error "unimplemented replace-match"))
97 (defun match-string-no-properties (num &optional string)
98 "Return string of text matched by last search, without text properties.
99 NUM specifies which parenthesized expression in the last regexp.
100 Value is nil if NUMth pair didn't match, or there were less than NUM pairs.
101 Zero means the entire text matched by the whole regexp or whole string.
102 STRING should be given if the last search was by `string-match' on STRING."
103 (if (match-beginning num)
104 (if string
105 (substring-no-properties string (match-beginning num)
106 (match-end num))
107 (buffer-substring-no-properties (match-beginning num)
108 (match-end num)))))
110 ;; FIXME: needs a formatter and the search string
111 (define-condition search-failed (lice-condition)
112 () (:documentation "raised when a search failed to match"))
114 (define-condition thread-unsafe (style-warning)
115 () (:documentation "Raised when a search is not threadsafe. See also `*with-match-data*'"))
117 (defun check-search-thread-safe ()
118 "Report a warning if the search is unsafe for threads."
119 (unless *with-match-data*
120 (signal 'thread-unsafe)))
122 (defun string-search-command (string bound error count direction)
123 (check-search-thread-safe)
124 (gap-move-to (current-buffer) (buffer-point-aref (current-buffer)))
125 ;; normalize vars
126 (setf count (* count direction)
127 bound (if (minusp count)
128 (if bound (max bound (begv)) (begv))
129 (if bound (min bound (zv)) (zv))))
130 (let* ((buffer (current-buffer))
132 (start-aref (buffer-point-aref buffer))
133 (bound-aref (buffer-char-to-aref buffer bound))
134 (n (if (minusp count)
135 (loop for i from 0 below (- count)
136 do (setf pos (search string (buffer-data buffer) :from-end t :end2 start-aref :start2 bound-aref))
137 while pos
138 count i)
139 (loop for i from 0 below count
140 do (setf pos (search string (buffer-data buffer) :start2 start-aref :end2 bound-aref))
141 while pos
142 count i))))
143 (if (/= n (abs count))
144 (cond
145 ((eq error t)
146 (signal 'search-failed))
147 ((null error)
148 nil)
149 (bound
150 (set-point bound buffer)
151 nil)
152 (t nil))
153 (progn
154 (if (minusp count)
155 (set-point (+ (buffer-aref-to-char buffer pos) (length string)))
156 (set-point (buffer-aref-to-char buffer pos)))
157 (values (pt)
158 (setf *match-data*
159 (make-match-data :obj buffer
160 :start (buffer-aref-to-char buffer pos)
161 :end (+ (buffer-aref-to-char buffer pos) (length string))
162 :reg-starts #()
163 :reg-ends #())))))))
165 (defun search-forward (string &key bound (error t) (count 1))
166 "Search forward from point for string.
167 Set point to the end of the occurrence found, and return point.
168 An optional second argument bounds the search; it is a buffer position.
169 The match found must not extend after that position. nil is equivalent
170 to (point-max).
171 Optional third argument, if t, means if fail just return nil (no error).
172 If not nil and not t, move to limit of search and return nil.
173 Optional fourth argument is repeat count--search for successive occurrences.
175 Search case-sensitivity is determined by the value of the variable
176 `case-fold-search', which see.
178 See also the functions `match-beginning', `match-end' and `replace-match'."
179 (string-search-command string bound error count 1))
181 (defun search-backward (string &key bound (error t) (count 1))
182 "Search backward from point for STRING.
183 Set point to the beginning of the occurrence found, and return point.
184 An optional second argument bounds the search; it is a buffer position.
185 The match found must not extend before that position.
186 Optional third argument, if t, means if fail just return nil (no error).
187 If not nil and not t, position at limit of search and return nil.
188 Optional fourth argument is repeat count--search for successive occurrences.
190 Search case-sensitivity is determined by the value of the variable
191 `case-fold-search', which see.
193 See also the functions `match-beginning', `match-end' and `replace-match'."
194 (string-search-command string bound error count -1))
196 (defvar *regexp-cache* (make-memoize-state :test 'string=))
198 ;; TODO: create compiler-macros for regex functions so the regexps can
199 ;; be compiled at compile time.
201 (defun looking-at (regexp &optional (buffer (current-buffer)))
202 "Return the match-data if text after point matches regular expression regexp."
203 (check-type regexp string)
204 (check-search-thread-safe)
205 ;; get the gap outta the way. It sucks we have to do this. Really we
206 ;; should modify ppcre to generate scanner functions that hop the
207 ;; gap. Meantime...
208 (when (< (buffer-char-to-aref buffer (pt buffer))
209 (buffer-gap-start buffer))
210 (gap-move-to-point buffer))
211 (multiple-value-bind (start end reg-starts reg-ends)
212 (ppcre:scan (memoize *regexp-cache* regexp (ppcre:create-scanner regexp :multi-line-mode t)) (buffer-data buffer)
213 :start (buffer-char-to-aref buffer (pt buffer))
214 :real-start-pos 0)
215 (when (and start
216 (= start (buffer-char-to-aref buffer (pt buffer))))
217 (values t
218 (setf *match-data*
219 (make-match-data :obj buffer
220 :start (buffer-aref-to-char buffer start)
221 :end (buffer-aref-to-char buffer end)
222 :reg-starts (map 'vector (lambda (n)
223 (buffer-aref-to-char buffer n))
224 reg-starts)
225 :reg-ends (map 'vector (lambda (n)
226 (buffer-aref-to-char buffer n))
227 reg-ends)))))))
229 (defun re-search-forward (regexp &key (bound (zv)) (error t) count &aux (buffer (current-buffer)))
230 "Search forward from point for regular expression regexp.
231 Set point to the end of the occurrence found, and return match-data structure.
232 BOUND bounds the search; it is a buffer position.
233 The match found must not extend after that position.
234 ERROR, if nil, means if fail just return nil (no error).
235 If not nil and not t, move to limit of search and return nil.
236 COUNT is repeat count--search for successive occurrences.
237 See also the functions `match-beginning', `match-end', `match-string',
238 and `replace-match'."
239 (declare (ignore count))
240 (check-search-thread-safe)
241 (when (< (buffer-char-to-aref buffer (pt buffer))
242 (buffer-gap-start buffer))
243 (gap-move-to-point buffer))
244 (multiple-value-bind (start end reg-starts reg-ends)
245 (ppcre:scan (memoize *regexp-cache* regexp (ppcre:create-scanner regexp :multi-line-mode t)) (buffer-data buffer)
246 :start (buffer-char-to-aref buffer (pt buffer))
247 :end (buffer-char-to-aref buffer bound)
248 :real-start-pos 0)
249 (cond (start
250 (set-point (buffer-aref-to-char buffer end) buffer)
251 (values (pt)
252 (setf *match-data*
253 (make-match-data :obj buffer
254 :start (buffer-aref-to-char buffer start)
255 :end (buffer-aref-to-char buffer end)
256 :reg-starts (map 'vector (lambda (n)
257 (buffer-aref-to-char buffer n))
258 reg-starts)
259 :reg-ends (map 'vector (lambda (n)
260 (buffer-aref-to-char buffer n))
261 reg-ends)))))
262 ((eq error t)
263 (signal 'search-failed))
264 ((null error)
265 nil)
266 (bound
267 (set-point bound buffer)
268 nil)
269 (t nil))))
271 (defun re-search-backward (regexp &key (bound (begv)) (error t) count &aux (buffer (current-buffer)))
272 "Search backward from point for match for regular expression regexp.
273 Set point to the beginning of the match, and return match-data.
274 The match found is the one starting last in the buffer
275 and yet ending before the origin of the search.
276 BOUND bounds the search; it is a buffer position.
277 The match found must start at or after that position.
278 ERROR, if nil, means if fail just return nil (no error).
279 If not nil and not t, move to limit of search and return nil.
280 COUNT is repeat count--search for successive occurrences.
281 See also the functions `match-beginning', `match-end', `match-string',
282 and `replace-match'."
283 (declare (ignore count))
284 (check-search-thread-safe)
285 ;;(message "re-search-backward ~s ~d" regexp (point))
286 (when (> (buffer-gap-start buffer)
287 (buffer-char-to-aref buffer (pt buffer)))
288 (gap-move-to buffer (buffer-char-to-aref buffer (1+ (pt buffer)))))
289 ;; start search from point and keep walking back til we match something
290 (let* ((start-aref (buffer-char-to-aref buffer (pt buffer)))
291 (pt-aref start-aref)
292 (stop (buffer-char-to-aref buffer bound))
293 (scanner (memoize *regexp-cache* regexp (ppcre:create-scanner regexp :multi-line-mode t))))
294 (loop
295 (multiple-value-bind (start end reg-starts reg-ends)
296 (ppcre:scan scanner (buffer-data buffer) :start start-aref :end pt-aref :real-start-pos 0)
297 (when start
298 (set-point (buffer-aref-to-char buffer start) buffer)
299 (return (values (pt)
300 (setf *match-data*
301 (make-match-data :obj buffer
302 :start (buffer-aref-to-char buffer start)
303 :end (buffer-aref-to-char buffer end)
304 :reg-starts (map 'vector (lambda (n)
305 (buffer-aref-to-char buffer n))
306 reg-starts)
307 :reg-ends (map 'vector (lambda (n)
308 (buffer-aref-to-char buffer n))
309 reg-ends))))))
310 (dec-aref start-aref buffer)
311 (when (< start-aref stop)
312 (cond ((eq error t)
313 ;; FIXME: we need a search condition
314 (signal 'search-failed))
315 ((null error)
316 (return nil))
318 (when bound
319 (set-point bound buffer))
320 (return nil))))))))
322 (defun string-match (regexp string &key (start 0) (end (length string)))
323 "Return index of start of first match for regexp in string and match-data, or nil.
324 Matching ignores case if `case-fold-search' is non-nil.
325 START, start search at that index in string.
326 END, end search at that index in string.
327 **For index of first char beyond the match, do (match-end 0).
328 **`match-end' and `match-beginning' also give indices of substrings
329 **matched by parenthesis constructs in the pattern.
331 You can use the function `match-string' to extract the substrings
332 matched by the parenthesis constructions in regexp."
333 (check-search-thread-safe)
334 (multiple-value-bind (start end reg-starts reg-ends)
335 (ppcre:scan (memoize *regexp-cache* regexp (ppcre:create-scanner regexp :multi-line-mode t))
336 string :start start :end end)
337 (when start
338 (values start
339 (setf *match-data*
340 (make-match-data :obj string
341 :start start
342 :end end
343 :reg-starts reg-starts
344 :reg-ends reg-ends))))))
346 (defun regexp-quote (string)
347 "Return a regexp string which matches exactly STRING and nothing else."
348 (check-type string string)
349 (coerce
350 (loop for c across string
351 when (find c "[*.\\?+^$" :test 'char=)
352 collect #\\
353 collect c)
354 'string))
356 (defun wordify (string)
357 "Given a string of words separated by word delimiters,
358 compute a regexp that matches those exact words
359 separated by arbitrary punctuation."
360 (error "unimplemented wordify"))
362 (defun word-search-forward (string &key (bound (begv)) (error t) count &aux (buffer (current-buffer)))
363 (error "unimplemented word-search-forward"))
365 (defun scan-buffer (buffer target start end count)
366 "Search for COUNT instances of the character TARGET between START and END.
368 If COUNT is positive, search forwards; END must be >= START.
369 If COUNT is negative, search backwards for the -COUNTth instance;
370 END must be <= START.
371 If COUNT is zero, do anything you please; run rogue, for all I care.
373 If END is NIL, use BEGV or ZV instead, as appropriate for the
374 direction indicated by COUNT.
376 If we find COUNT instances, return the
377 position past the COUNTth match and 0. Note that for reverse motion
378 this is not the same as the usual convention for Emacs motion commands.
380 If we don't find COUNT instances before reaching END, return END
381 and the number of TARGETs left unfound."
382 (let ((shortage (abs count))
383 last)
384 (if (> count 0)
385 (setf end (or end (zv buffer)))
386 (setf end (or end (begv buffer))))
387 (setf start (buffer-char-to-aref buffer start)
388 end (buffer-char-to-aref buffer end))
389 (loop while (and (> count 0)
390 (/= start end)) do
391 (setf start
392 (if (< start (buffer-gap-start buffer))
393 (or (position target (buffer-data buffer) :start start :end (min end (buffer-gap-start buffer)))
394 (and (> end (gap-end buffer))
395 (position target (buffer-data buffer) :start (gap-end buffer) :end end)))
396 (position target (buffer-data buffer) :start start :end end)))
397 (if start
398 (setf start (1+ start)
399 last start
400 count (1- count)
401 shortage (1- shortage))
402 (setf start end)))
403 (loop while (and (< count 0)
404 (/= start end)) do
405 (setf start
406 (if (> start (buffer-gap-start buffer))
407 (or (position target (buffer-data buffer) :start (max end (gap-end buffer)) :end start :from-end t)
408 (and (< end (buffer-gap-start buffer))
409 (position target (buffer-data buffer) :start end :end (buffer-gap-start buffer) :from-end t)))
410 (position target (buffer-data buffer) :start end :end start :from-end t)))
411 (if start
412 (setf last (+ start 1) ; match emacs functionality
413 count (1+ count)
414 shortage (1- shortage))
415 (setf start end)))
416 (if (zerop count)
417 (values (and last (buffer-aref-to-char buffer last)) 0)
418 (values (buffer-aref-to-char buffer end) shortage))))
420 (defun find-before-next-newline (from to cnt)
421 "Like find_next_newline, but returns position before the newline,
422 not after, and only search up to TO. This isn't just
423 find_next_newline (...)-1, because you might hit TO."
424 (multiple-value-bind (pos shortage) (scan-buffer (current-buffer) #\Newline from to cnt)
425 (when (zerop shortage)
426 (decf pos))
427 pos))
429 (defun buffer-scan-newline (buf start limit count)
430 "Search BUF for COUNT newlines with a limiting point at LIMIT,
431 starting at START. Returns the point of the last newline or limit and
432 number of newlines found. START and LIMIT are inclusive."
433 (declare (type buffer buf)
434 (type integer start limit count))
435 (labels ((buffer-scan-bk (buf start limit count)
436 "count is always >=0. start >= limit."
437 (let* ((start-aref (buffer-char-to-aref buf start))
438 (limit-aref (buffer-char-to-aref buf limit))
439 (ceiling (if (>= start-aref (gap-end buf))
440 (max limit-aref (gap-end buf))
441 limit-aref))
442 (i 0)
443 ;; :END is not inclusive but START is.
444 (start (1+ start-aref))
446 (loop
447 ;; Always search at least once
448 (setf p (position #\Newline (buffer-data buf)
449 :start ceiling :end start :from-end t))
450 (if p
451 (progn
452 ;; Move start. Note that start isn't set to (1+ p)
453 ;; because we don't want to search p again.
454 (setf start p)
455 ;; Count the newline
456 (incf i)
457 ;; Have we found enough newlines?
458 (when (>= i count)
459 (return-from buffer-scan-bk (values (buffer-aref-to-char buf p)
460 i))))
461 ;; Check if we've searched up to the limit
462 (if (= ceiling limit-aref)
463 (return-from buffer-scan-bk (values limit i))
464 ;; if not, skip past the gap
465 (progn
466 (setf ceiling limit-aref)
467 (setf start (buffer-gap-start buf))))))))
468 (buffer-scan-fw (buf start limit count)
469 "count is always >=0. start >= limit."
470 (let* ((start-aref (buffer-char-to-aref buf start))
471 (limit-aref (1+ (buffer-char-to-aref buf limit)))
472 (ceiling (if (< start (buffer-gap-start buf))
473 (min limit-aref (buffer-gap-start buf))
474 limit-aref))
475 (i 0)
476 (start start-aref)
478 (loop
479 ;; Always search at least once
480 (setf p (position #\Newline (buffer-data buf) :start start :end ceiling))
481 (if p
482 (progn
483 ;; Move start. We don't want to search p again, thus the 1+.
484 (setf start (1+ p))
485 ;; Count the newline
486 (incf i)
487 ;; Have we found enough newlines?
488 (when (>= i count)
489 (return-from buffer-scan-fw (values (buffer-aref-to-char buf p)
490 i))))
491 ;; Check if we've searched up to the limit
492 (if (= ceiling limit-aref)
493 (return-from buffer-scan-fw (values limit i))
494 ;; if not, skip past the gap
495 (progn
496 (setf ceiling limit-aref)
497 (setf start (gap-end buf)))))))))
498 ;; make sure start and limit are within the bounds
499 (setf start (max 0 (min start (1- (buffer-size buf))))
500 limit (max 0 (min limit (1- (buffer-size buf)))))
501 ;; the search always fails on an empty buffer
502 (when (= (buffer-size buf) 0)
503 (return-from buffer-scan-newline (values limit 0)))
504 (cond ((> count 0)
505 (dformat +debug-vv+ "scan-fw ~a ~a ~a~%" start limit count)
506 (buffer-scan-fw buf start limit count))
507 ((< count 0)
508 (dformat +debug-vv+ "scan-bk ~a ~a ~a~%" start limit count)
509 (buffer-scan-bk buf start limit (abs count)))
510 ;; 0 means the newline before the beginning of the current
511 ;; line. We need to handle the case where we are on a newline.
513 (dformat +debug-vv+ "scan-0 ~a ~a ~a~%" start limit count)
514 (if (char= (buffer-char-after buf start) #\Newline)
515 (buffer-scan-bk buf start limit 2)
516 (buffer-scan-bk buf start limit 1))))))