1 (***********************************************************************)
5 (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *)
7 (* Copyright 1996 Institut National de Recherche en Informatique et *)
8 (* en Automatique. All rights reserved. This file is distributed *)
9 (* under the terms of the GNU Library General Public License, with *)
10 (* the special exception on linking described in file ../../LICENSE. *)
12 (***********************************************************************)
16 (** String utilities *)
18 let string_before s n
= String.sub s
0 n
20 let string_after s n
= String.sub s n
(String.length s
- n
)
22 let first_chars s n
= String.sub s
0 n
24 let last_chars s n
= String.sub s
(String.length s
- n
) n
26 (** Representation of character sets **)
30 type t
= string (* of length 32 *)
32 let empty = String.make
32 '
\000'
33 let full = String.make
32 '
\255'
35 let make_empty () = String.make
32 '
\000'
38 let i = Char.code c
in
39 s
.[i lsr 3] <- Char.chr
(Char.code s
.[i lsr 3] lor (1 lsl (i land 7)))
41 let add_range s c1 c2
=
42 for i = Char.code c1
to Char.code c2
do add s
(Char.chr
i) done
45 let s = make_empty () in add s c
; s
48 let s = make_empty () in add_range s c1 c2
; s
51 let r = String.create
32 in
53 r.[i] <- Char.chr
(Char.code
s.[i] lxor 0xFF)
58 let r = String.create
32 in
60 r.[i] <- Char.chr
(Char.code s1
.[i] lor Char.code s2
.[i])
67 if Char.code s1
.[i] land Char.code s2
.[i] <> 0 then raise Exit
75 let c = Char.code
s.[i] in
78 if c land (1 lsl j
) <> 0 then fn
(Char.chr
((i lsl 3) + j
))
83 let r = String.make
256 '
\000'
in
84 iter (fun c -> r.[Char.code
c] <- '
\001'
) s;
88 let r = make_empty() in
89 iter (fun c -> add r (Char.lowercase
c); add r (Char.uppercase
c)) s;
94 (** Abstract syntax tree for regular expressions *)
99 | CharClass
of Charset.t
100 | Seq
of re_syntax list
101 | Alt
of re_syntax
* re_syntax
104 | Option
of re_syntax
105 | Group
of int * re_syntax
111 (** Representation of compiled regular expressions *)
114 prog
: int array
; (* bytecode instructions *)
115 cpool
: string array
; (* constant pool (string literals) *)
116 normtable
: string; (* case folding table (if any) *)
117 numgroups
: int; (* number of \(...\) groups *)
118 numregisters
: int; (* number of nullable Star or Plus *)
119 startchars
: int (* index of set of starting chars, or -1 if none *)
122 (** Opcodes for bytecode instructions; see strstubs.c for description *)
127 let op_STRINGNORM = 3
131 let op_WORDBOUNDARY = 7
136 let op_SIMPLEOPT = 12
137 let op_SIMPLESTAR = 13
138 let op_SIMPLEPLUS = 14
142 let op_CHECKPROGRESS = 18
144 (* Encoding of bytecode instructions *)
146 let instr opc arg
= opc
lor (arg
lsl 8)
148 (* Computing relative displacements for GOTO and PUSHBACK instructions *)
150 let displ dest from
= dest
- from
- 1
152 (** Compilation of a regular expression *)
154 (* Determine if a regexp can match the empty string *)
156 let rec is_nullable = function
159 | CharClass cl
-> false
160 | Seq rl
-> List.for_all
is_nullable rl
161 | Alt
(r1
, r2
) -> is_nullable r1
|| is_nullable r2
163 | Plus
r -> is_nullable r
165 | Group
(n
, r) -> is_nullable r
169 | Wordboundary
-> true
171 (* first r returns a set of characters C such that:
172 for all string s, s matches r => the first character of s is in C.
173 For convenience, return Charset.full if r is nullable. *)
175 let rec first = function
176 Char
c -> Charset.singleton c
177 | String
s -> if s = "" then Charset.full else Charset.singleton s.[0]
179 | Seq rl
-> first_seq rl
180 | Alt
(r1
, r2
) -> Charset.union (first r1
) (first r2
)
181 | Star
r -> Charset.full
183 | Option
r -> Charset.full
184 | Group
(n
, r) -> first r
185 | Refgroup n
-> Charset.full
186 | Bol
-> Charset.full
187 | Eol
-> Charset.full
188 | Wordboundary
-> Charset.full
190 and first_seq
= function
192 | (Bol
| Eol
| Wordboundary
) :: rl
-> first_seq rl
193 | Star
r :: rl
-> Charset.union (first r) (first_seq rl
)
194 | Option
r :: rl
-> Charset.union (first r) (first_seq rl
)
197 (* Transform a Char or CharClass regexp into a character class *)
199 let charclass_of_regexp fold_case re
=
202 Char
c -> Charset.singleton c
204 | _
-> assert false in
205 if fold_case then Charset.fold_case cl else cl
207 (* The case fold table: maps characters to their lowercase equivalent *)
209 let fold_case_table =
210 let t = String.create
256 in
211 for i = 0 to 255 do t.[i] <- Char.lowercase
(Char.chr
i) done;
214 module StringMap
= Map.Make
(struct type t = string let compare = compare end)
216 (* Compilation of a regular expression *)
218 let compile fold_case re
=
220 (* Instruction buffering *)
221 let prog = ref (Array.make
32 0)
223 and cpool
= ref StringMap.empty
225 and numgroups
= ref 1
226 and numregs
= ref 0 in
227 (* Add a new instruction *)
228 let emit_instr opc arg
=
229 if !progpos
>= Array.length
!prog then begin
230 let newlen = ref (Array.length
!prog) in
231 while !progpos
>= !newlen do newlen := !newlen * 2 done;
232 let nprog = Array.make
!newlen 0 in
233 Array.blit
!prog 0 nprog 0 (Array.length
!prog);
236 (!prog).(!progpos
) <- (instr opc arg
);
238 (* Reserve an instruction slot and return its position *)
240 let p = !progpos
in incr progpos
; p in
241 (* Fill a reserved instruction slot with a GOTO or PUSHBACK instruction *)
242 let patch_instr pos opc dest
=
243 (!prog).(pos
) <- (instr opc
(displ dest pos
)) in
244 (* Return the cpool index for the given string, adding it if not
248 StringMap.find
s !cpool
251 cpool
:= StringMap.add s p !cpool
;
254 (* Allocate fresh register if regexp is nullable *)
255 let allocate_register_if_nullable r =
256 if is_nullable r then begin
258 if n >= 64 then failwith
"too many r* or r+ where r is nullable";
263 (* Main recursive compilation function *)
264 let rec emit_code = function
267 emit_instr op_CHARNORM (Char.code
(Char.lowercase
c))
269 emit_instr op_CHAR (Char.code
c)
271 begin match String.length
s with
275 emit_instr op_CHARNORM (Char.code
(Char.lowercase
s.[0]))
277 emit_instr op_CHAR (Char.code
s.[0])
280 (* null characters are not accepted by the STRING* instructions;
281 if one is found, split string at null character *)
282 let i = String.index
s '
\000'
in
283 emit_code (String
(string_before s i));
284 emit_instr op_CHAR 0;
285 emit_code (String
(string_after s (i+1)))
288 emit_instr op_STRINGNORM (cpool_index (String.lowercase
s))
290 emit_instr op_STRING (cpool_index s)
293 let cl'
= if fold_case then Charset.fold_case cl else cl in
294 emit_instr op_CHARCLASS (cpool_index cl'
)
303 let pos_pushback = emit_hole() in
305 let pos_goto_end = emit_hole() in
306 let lbl1 = !progpos
in
308 let lbl2 = !progpos
in
309 patch_instr pos_pushback op_PUSHBACK lbl1;
310 patch_instr pos_goto_end op_GOTO lbl2
312 (* Implement longest match semantics for compatibility with old Str *)
313 (* General translation:
320 If r cannot match the empty string, code can be simplified:
326 let regno = allocate_register_if_nullable r in
327 let lbl1 = emit_hole() in
328 if regno >= 0 then emit_instr op_SETMARK regno;
330 if regno >= 0 then emit_instr op_CHECKPROGRESS regno;
331 emit_instr op_GOTO (displ lbl1 !progpos
);
332 let lbl2 = !progpos
in
333 patch_instr lbl1 op_PUSHBACK lbl2
335 (* Implement longest match semantics for compatibility with old Str *)
336 (* General translation:
343 If r cannot match the empty string, code can be simplified:
349 let regno = allocate_register_if_nullable r in
350 let lbl1 = !progpos
in
352 if regno >= 0 then emit_instr op_CHECKPROGRESS regno;
353 let pos_pushback = emit_hole() in
354 if regno >= 0 then emit_instr op_SETMARK regno;
355 emit_instr op_GOTO (displ lbl1 !progpos
);
356 let lbl2 = !progpos
in
357 patch_instr pos_pushback op_PUSHBACK lbl2
359 (* Implement longest match semantics for compatibility with old Str *)
364 let pos_pushback = emit_hole() in
366 let lbl = !progpos
in
367 patch_instr pos_pushback op_PUSHBACK lbl
369 if n >= 32 then failwith
"too many \\(...\\) groups";
370 emit_instr op_BEGGROUP n;
372 emit_instr op_ENDGROUP n;
373 numgroups
:= max
!numgroups
(n+1)
375 emit_instr op_REFGROUP n
381 emit_instr op_WORDBOUNDARY 0
383 and emit_seq_code
= function
385 | Star
(Char _
| CharClass _
as r) :: rl
386 when disjoint_modulo_case
(first r) (first_seq rl
) ->
387 emit_instr op_SIMPLESTAR (cpool_index (charclass_of_regexp fold_case r));
389 | Plus
(Char _
| CharClass _
as r) :: rl
390 when disjoint_modulo_case
(first r) (first_seq rl
) ->
391 emit_instr op_SIMPLEPLUS (cpool_index (charclass_of_regexp fold_case r));
393 | Option
(Char _
| CharClass _
as r) :: rl
394 when disjoint_modulo_case
(first r) (first_seq rl
) ->
395 emit_instr op_SIMPLEOPT (cpool_index (charclass_of_regexp fold_case r));
401 and disjoint_modulo_case c1 c2
=
403 then Charset.disjoint (Charset.fold_case c1
) (Charset.fold_case c2
)
404 else Charset.disjoint c1 c2
408 emit_instr op_ACCEPT 0;
409 let start = first re
in
410 let start'
= if fold_case then Charset.fold_case start else start in
412 if start = Charset.full
414 else cpool_index (Charset.expand start'
) in
415 let constantpool = Array.make
!cpoolpos
"" in
416 StringMap.iter (fun str idx
-> constantpool.(idx
) <- str
) !cpool
;
417 { prog = Array.sub
!prog 0 !progpos
;
418 cpool
= constantpool;
419 normtable
= if fold_case then fold_case_table else "";
420 numgroups
= !numgroups
;
421 numregisters
= !numregs
;
422 startchars
= start_pos }
424 (** Parsing of a regular expression *)
426 (* Efficient buffering of sequences *)
428 module SeqBuffer
= struct
430 type t = { sb_chars
: Buffer.t; mutable sb_next
: re_syntax list
}
432 let create() = { sb_chars
= Buffer.create 16; sb_next
= [] }
435 let s = Buffer.contents buf
.sb_chars
in
436 Buffer.clear buf
.sb_chars
;
437 match String.length
s with
439 | 1 -> buf
.sb_next
<- Char
s.[0] :: buf
.sb_next
440 | _
-> buf
.sb_next
<- String
s :: buf
.sb_next
444 Char
c -> Buffer.add_char buf
.sb_chars
c
445 | _
-> flush buf
; buf
.sb_next
<- re
:: buf
.sb_next
448 flush buf
; Seq
(List.rev buf
.sb_next
)
452 (* The character class corresponding to `.' *)
454 let dotclass = Charset.complement (Charset.singleton '
\n'
)
456 (* Parse a regular expression *)
459 let len = String.length
s in
460 let group_counter = ref 1 in
463 let (r, j
) = regexp1
i in
465 and regexp0cont r1
i =
466 if i + 2 <= len && s.[i] = '
\\'
&& s.[i+1] = '
|'
then
467 let (r2
, j
) = regexp1
(i+2) in
468 regexp0cont
(Alt
(r1
, r2
)) j
472 regexp1cont
(SeqBuffer.create()) i
473 and regexp1cont sb
i =
475 || i + 2 <= len && s.[i] = '
\\'
&& (let c = s.[i+1] in c = '
|'
|| c = '
)'
)
477 (SeqBuffer.extract sb
, i)
479 let (r, j
) = regexp2
i in
483 let (r, j
) = regexp3
i in
485 and regexp2cont
r i =
486 if i >= len then (r, i) else
488 '?'
-> regexp2cont
(Option
r) (i+1)
489 | '
*'
-> regexp2cont
(Star
r) (i+1)
490 | '
+'
-> regexp2cont
(Plus
r) (i+1)
494 '
\\'
-> regexpbackslash
(i+1)
495 | '
['
-> let (c, j
) = regexpclass0
(i+1) in (CharClass
c, j
)
498 | '
.'
-> (CharClass
dotclass, i+1)
500 and regexpbackslash
i =
501 if i >= len then (Char '
\\'
, i) else
506 let group_no = !group_counter in
507 if group_no < 32 then incr
group_counter;
508 let (r, j
) = regexp0 (i+1) in
509 if j
+ 1 < len && s.[j
] = '
\\'
&& s.[j
+1] = '
)'
then
511 then (Group
(group_no, r), j
+ 2)
514 failwith
"\\( group not closed by \\)"
516 (Refgroup
(Char.code
c - 48), i + 1)
518 (Wordboundary
, i + 1)
522 if i < len && s.[i] = '^'
523 then let (c, j
) = regexpclass1
(i+1) in (Charset.complement c, j
)
526 let c = Charset.make_empty() in
527 let j = regexpclass2
c i i in
529 and regexpclass2
c start i =
530 if i >= len then failwith
"[ class not closed by ]";
531 if s.[i] = '
]'
&& i > start then i+1 else begin
533 if i+2 < len && s.[i+1] = '
-'
&& s.[i+2] <> '
]'
then begin
535 Charset.add_range c c1 c2;
536 regexpclass2
c start (i+3)
539 regexpclass2
c start (i+1)
543 let (r, j) = regexp0 0 in
544 if j = len then r else failwith
"spurious \\) in regular expression"
546 (** Parsing and compilation *)
548 let regexp e
= compile false (parse e
)
550 let regexp_case_fold e
= compile true (parse e
)
553 let len = String.length
s in
554 let buf = String.create (2 * len) in
556 for i = 0 to len - 1 do
558 '
['
| '
]'
| '
*'
| '
.'
| '
\\'
| '?'
| '
+'
| '^'
| '$'
as c ->
559 buf.[!pos] <- '
\\'
; buf.[!pos + 1] <- c; pos := !pos + 2
561 buf.[!pos] <- c; pos := !pos + 1
563 String.sub
buf 0 !pos
565 let regexp_string s = compile false (String
s)
567 let regexp_string_case_fold s = compile true (String
s)
569 (** Matching functions **)
571 external re_string_match
: regexp -> string -> int -> int array
573 external re_partial_match
: regexp -> string -> int -> int array
575 external re_search_forward
: regexp -> string -> int -> int array
576 = "re_search_forward"
577 external re_search_backward
: regexp -> string -> int -> int array
578 = "re_search_backward"
580 let last_search_result = ref [||]
582 let string_match re
s pos =
583 let res = re_string_match re
s pos in
584 last_search_result := res;
587 let string_partial_match re
s pos =
588 let res = re_partial_match re
s pos in
589 last_search_result := res;
592 let search_forward re
s pos =
593 let res = re_search_forward re
s pos in
594 last_search_result := res;
595 if Array.length
res = 0 then raise Not_found
else res.(0)
597 let search_backward re
s pos =
598 let res = re_search_backward re
s pos in
599 last_search_result := res;
600 if Array.length
res = 0 then raise Not_found
else res.(0)
602 let group_beginning n =
604 if n < 0 || n2 >= Array.length
!last_search_result then
605 invalid_arg
"Str.group_beginning"
607 let pos = !last_search_result.(n2) in
608 if pos = -1 then raise Not_found
else pos
612 if n < 0 || n2 >= Array.length
!last_search_result then
613 invalid_arg
"Str.group_end"
615 let pos = !last_search_result.(n2 + 1) in
616 if pos = -1 then raise Not_found
else pos
618 let matched_group n txt
=
620 if n < 0 || n2 >= Array.length
!last_search_result then
621 invalid_arg
"Str.matched_group"
623 let b = !last_search_result.(n2)
624 and e
= !last_search_result.(n2 + 1) in
625 if b = -1 then raise Not_found
else String.sub txt
b (e
- b)
627 let match_beginning () = group_beginning 0
628 and match_end
() = group_end 0
629 and matched_string txt
= matched_group 0 txt
633 external re_replacement_text
: string -> int array
-> string -> string
634 = "re_replacement_text"
636 let replace_matched repl matched
=
637 re_replacement_text repl
!last_search_result matched
639 let substitute_first expr repl_fun text
=
641 let pos = search_forward expr text
0 in
642 String.concat
"" [string_before text
pos;
644 string_after text
(match_end
())]
648 let global_substitute expr repl_fun text
=
649 let rec replace start last_was_empty
=
651 let startpos = if last_was_empty
then start + 1 else start in
652 if startpos > String.length text
then raise Not_found
;
653 let pos = search_forward expr text
startpos in
654 let end_pos = match_end
() in
655 let repl_text = repl_fun text
in
656 String.sub text
start (pos-start) ::
658 replace end_pos (end_pos = pos)
660 [string_after text
start] in
661 String.concat
"" (replace 0 false)
663 let global_replace expr repl text
=
664 global_substitute expr
(replace_matched repl
) text
665 and replace_first expr repl text
=
666 substitute_first expr
(replace_matched repl
) text
670 let search_forward_progress expr text
start =
671 let pos = search_forward expr text
start in
672 if match_end
() > start then pos
673 else if start < String.length text
then search_forward expr text
(start + 1)
676 let bounded_split expr text num
=
678 if string_match expr text
0 then match_end
() else 0 in
679 let rec split start n =
680 if start >= String.length text
then [] else
681 if n = 1 then [string_after text
start] else
683 let pos = search_forward_progress expr text
start in
684 String.sub text
start (pos-start) :: split (match_end
()) (n-1)
686 [string_after text
start] in
689 let split expr text
= bounded_split expr text
0
691 let bounded_split_delim expr text num
=
692 let rec split start n =
693 if start > String.length text
then [] else
694 if n = 1 then [string_after text
start] else
696 let pos = search_forward_progress expr text
start in
697 String.sub text
start (pos-start) :: split (match_end
()) (n-1)
699 [string_after text
start] in
700 if text
= "" then [] else split 0 num
702 let split_delim expr text
= bounded_split_delim expr text
0
704 type split_result
= Text
of string | Delim
of string
706 let bounded_full_split expr text num
=
707 let rec split start n =
708 if start >= String.length text
then [] else
709 if n = 1 then [Text
(string_after text
start)] else
711 let pos = search_forward_progress expr text
start in
712 let s = matched_string text
in
714 Text
(String.sub text
start (pos-start)) ::
716 split (match_end
()) (n-1)
719 split (match_end
()) (n-1)
721 [Text
(string_after text
start)] in
724 let full_split expr text
= bounded_full_split expr text
0