Sys.Signals module for a Variant type of signals (and a set_signal function that...
[ocaml.git] / otherlibs / str / str.ml
blob080efae9ffe8d0db9399001e18336d796579e286
1 (***********************************************************************)
2 (* *)
3 (* Objective Caml *)
4 (* *)
5 (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *)
6 (* *)
7 (* Copyright 1996 Institut National de Recherche en Informatique et *)
8 (* en Automatique. All rights reserved. This file is distributed *)
9 (* under the terms of the GNU Library General Public License, with *)
10 (* the special exception on linking described in file ../../LICENSE. *)
11 (* *)
12 (***********************************************************************)
14 (* $Id$ *)
16 (** String utilities *)
18 let string_before s n = String.sub s 0 n
20 let string_after s n = String.sub s n (String.length s - n)
22 let first_chars s n = String.sub s 0 n
24 let last_chars s n = String.sub s (String.length s - n) n
26 (** Representation of character sets **)
28 module Charset =
29 struct
30 type t = string (* of length 32 *)
32 let empty = String.make 32 '\000'
33 let full = String.make 32 '\255'
35 let make_empty () = String.make 32 '\000'
37 let add s c =
38 let i = Char.code c in
39 s.[i lsr 3] <- Char.chr(Char.code s.[i lsr 3] lor (1 lsl (i land 7)))
41 let add_range s c1 c2 =
42 for i = Char.code c1 to Char.code c2 do add s (Char.chr i) done
44 let singleton c =
45 let s = make_empty () in add s c; s
47 let range c1 c2 =
48 let s = make_empty () in add_range s c1 c2; s
50 let complement s =
51 let r = String.create 32 in
52 for i = 0 to 31 do
53 r.[i] <- Char.chr(Char.code s.[i] lxor 0xFF)
54 done;
57 let union s1 s2 =
58 let r = String.create 32 in
59 for i = 0 to 31 do
60 r.[i] <- Char.chr(Char.code s1.[i] lor Char.code s2.[i])
61 done;
64 let disjoint s1 s2 =
65 try
66 for i = 0 to 31 do
67 if Char.code s1.[i] land Char.code s2.[i] <> 0 then raise Exit
68 done;
69 true
70 with Exit ->
71 false
73 let iter fn s =
74 for i = 0 to 31 do
75 let c = Char.code s.[i] in
76 if c <> 0 then
77 for j = 0 to 7 do
78 if c land (1 lsl j) <> 0 then fn (Char.chr ((i lsl 3) + j))
79 done
80 done
82 let expand s =
83 let r = String.make 256 '\000' in
84 iter (fun c -> r.[Char.code c] <- '\001') s;
87 let fold_case s =
88 let r = make_empty() in
89 iter (fun c -> add r (Char.lowercase c); add r (Char.uppercase c)) s;
92 end
94 (** Abstract syntax tree for regular expressions *)
96 type re_syntax =
97 Char of char
98 | String of string
99 | CharClass of Charset.t
100 | Seq of re_syntax list
101 | Alt of re_syntax * re_syntax
102 | Star of re_syntax
103 | Plus of re_syntax
104 | Option of re_syntax
105 | Group of int * re_syntax
106 | Refgroup of int
107 | Bol
108 | Eol
109 | Wordboundary
111 (** Representation of compiled regular expressions *)
113 type regexp = {
114 prog: int array; (* bytecode instructions *)
115 cpool: string array; (* constant pool (string literals) *)
116 normtable: string; (* case folding table (if any) *)
117 numgroups: int; (* number of \(...\) groups *)
118 numregisters: int; (* number of nullable Star or Plus *)
119 startchars: int (* index of set of starting chars, or -1 if none *)
122 (** Opcodes for bytecode instructions; see strstubs.c for description *)
124 let op_CHAR = 0
125 let op_CHARNORM = 1
126 let op_STRING = 2
127 let op_STRINGNORM = 3
128 let op_CHARCLASS = 4
129 let op_BOL = 5
130 let op_EOL = 6
131 let op_WORDBOUNDARY = 7
132 let op_BEGGROUP = 8
133 let op_ENDGROUP = 9
134 let op_REFGROUP = 10
135 let op_ACCEPT = 11
136 let op_SIMPLEOPT = 12
137 let op_SIMPLESTAR = 13
138 let op_SIMPLEPLUS = 14
139 let op_GOTO = 15
140 let op_PUSHBACK = 16
141 let op_SETMARK = 17
142 let op_CHECKPROGRESS = 18
144 (* Encoding of bytecode instructions *)
146 let instr opc arg = opc lor (arg lsl 8)
148 (* Computing relative displacements for GOTO and PUSHBACK instructions *)
150 let displ dest from = dest - from - 1
152 (** Compilation of a regular expression *)
154 (* Determine if a regexp can match the empty string *)
156 let rec is_nullable = function
157 Char c -> false
158 | String s -> s = ""
159 | CharClass cl -> false
160 | Seq rl -> List.for_all is_nullable rl
161 | Alt (r1, r2) -> is_nullable r1 || is_nullable r2
162 | Star r -> true
163 | Plus r -> is_nullable r
164 | Option r -> true
165 | Group(n, r) -> is_nullable r
166 | Refgroup n -> true
167 | Bol -> true
168 | Eol -> true
169 | Wordboundary -> true
171 (* first r returns a set of characters C such that:
172 for all string s, s matches r => the first character of s is in C.
173 For convenience, return Charset.full if r is nullable. *)
175 let rec first = function
176 Char c -> Charset.singleton c
177 | String s -> if s = "" then Charset.full else Charset.singleton s.[0]
178 | CharClass cl -> cl
179 | Seq rl -> first_seq rl
180 | Alt (r1, r2) -> Charset.union (first r1) (first r2)
181 | Star r -> Charset.full
182 | Plus r -> first r
183 | Option r -> Charset.full
184 | Group(n, r) -> first r
185 | Refgroup n -> Charset.full
186 | Bol -> Charset.full
187 | Eol -> Charset.full
188 | Wordboundary -> Charset.full
190 and first_seq = function
191 [] -> Charset.full
192 | (Bol | Eol | Wordboundary) :: rl -> first_seq rl
193 | Star r :: rl -> Charset.union (first r) (first_seq rl)
194 | Option r :: rl -> Charset.union (first r) (first_seq rl)
195 | r :: rl -> first r
197 (* Transform a Char or CharClass regexp into a character class *)
199 let charclass_of_regexp fold_case re =
200 let cl =
201 match re with
202 Char c -> Charset.singleton c
203 | CharClass cl -> cl
204 | _ -> assert false in
205 if fold_case then Charset.fold_case cl else cl
207 (* The case fold table: maps characters to their lowercase equivalent *)
209 let fold_case_table =
210 let t = String.create 256 in
211 for i = 0 to 255 do t.[i] <- Char.lowercase(Char.chr i) done;
214 module StringMap = Map.Make(struct type t = string let compare = compare end)
216 (* Compilation of a regular expression *)
218 let compile fold_case re =
220 (* Instruction buffering *)
221 let prog = ref (Array.make 32 0)
222 and progpos = ref 0
223 and cpool = ref StringMap.empty
224 and cpoolpos = ref 0
225 and numgroups = ref 1
226 and numregs = ref 0 in
227 (* Add a new instruction *)
228 let emit_instr opc arg =
229 if !progpos >= Array.length !prog then begin
230 let newlen = ref (Array.length !prog) in
231 while !progpos >= !newlen do newlen := !newlen * 2 done;
232 let nprog = Array.make !newlen 0 in
233 Array.blit !prog 0 nprog 0 (Array.length !prog);
234 prog := nprog
235 end;
236 (!prog).(!progpos) <- (instr opc arg);
237 incr progpos in
238 (* Reserve an instruction slot and return its position *)
239 let emit_hole () =
240 let p = !progpos in incr progpos; p in
241 (* Fill a reserved instruction slot with a GOTO or PUSHBACK instruction *)
242 let patch_instr pos opc dest =
243 (!prog).(pos) <- (instr opc (displ dest pos)) in
244 (* Return the cpool index for the given string, adding it if not
245 already there *)
246 let cpool_index s =
248 StringMap.find s !cpool
249 with Not_found ->
250 let p = !cpoolpos in
251 cpool := StringMap.add s p !cpool;
252 incr cpoolpos;
253 p in
254 (* Allocate fresh register if regexp is nullable *)
255 let allocate_register_if_nullable r =
256 if is_nullable r then begin
257 let n = !numregs in
258 if n >= 64 then failwith "too many r* or r+ where r is nullable";
259 incr numregs;
261 end else
262 -1 in
263 (* Main recursive compilation function *)
264 let rec emit_code = function
265 Char c ->
266 if fold_case then
267 emit_instr op_CHARNORM (Char.code (Char.lowercase c))
268 else
269 emit_instr op_CHAR (Char.code c)
270 | String s ->
271 begin match String.length s with
272 0 -> ()
273 | 1 ->
274 if fold_case then
275 emit_instr op_CHARNORM (Char.code (Char.lowercase s.[0]))
276 else
277 emit_instr op_CHAR (Char.code s.[0])
278 | _ ->
280 (* null characters are not accepted by the STRING* instructions;
281 if one is found, split string at null character *)
282 let i = String.index s '\000' in
283 emit_code (String (string_before s i));
284 emit_instr op_CHAR 0;
285 emit_code (String (string_after s (i+1)))
286 with Not_found ->
287 if fold_case then
288 emit_instr op_STRINGNORM (cpool_index (String.lowercase s))
289 else
290 emit_instr op_STRING (cpool_index s)
292 | CharClass cl ->
293 let cl' = if fold_case then Charset.fold_case cl else cl in
294 emit_instr op_CHARCLASS (cpool_index cl')
295 | Seq rl ->
296 emit_seq_code rl
297 | Alt(r1, r2) ->
298 (* PUSHBACK lbl1
299 <match r1>
300 GOTO lbl2
301 lbl1: <match r2>
302 lbl2: ... *)
303 let pos_pushback = emit_hole() in
304 emit_code r1;
305 let pos_goto_end = emit_hole() in
306 let lbl1 = !progpos in
307 emit_code r2;
308 let lbl2 = !progpos in
309 patch_instr pos_pushback op_PUSHBACK lbl1;
310 patch_instr pos_goto_end op_GOTO lbl2
311 | Star r ->
312 (* Implement longest match semantics for compatibility with old Str *)
313 (* General translation:
314 lbl1: PUSHBACK lbl2
315 SETMARK regno
316 <match r>
317 CHECKPROGRESS regno
318 GOTO lbl1
319 lbl2:
320 If r cannot match the empty string, code can be simplified:
321 lbl1: PUSHBACK lbl2
322 <match r>
323 GOTO lbl1
324 lbl2:
326 let regno = allocate_register_if_nullable r in
327 let lbl1 = emit_hole() in
328 if regno >= 0 then emit_instr op_SETMARK regno;
329 emit_code r;
330 if regno >= 0 then emit_instr op_CHECKPROGRESS regno;
331 emit_instr op_GOTO (displ lbl1 !progpos);
332 let lbl2 = !progpos in
333 patch_instr lbl1 op_PUSHBACK lbl2
334 | Plus r ->
335 (* Implement longest match semantics for compatibility with old Str *)
336 (* General translation:
337 lbl1: <match r>
338 CHECKPROGRESS regno
339 PUSHBACK lbl2
340 SETMARK regno
341 GOTO lbl1
342 lbl2:
343 If r cannot match the empty string, code can be simplified:
344 lbl1: <match r>
345 PUSHBACK lbl2
346 GOTO_PLUS lbl1
347 lbl2:
349 let regno = allocate_register_if_nullable r in
350 let lbl1 = !progpos in
351 emit_code r;
352 if regno >= 0 then emit_instr op_CHECKPROGRESS regno;
353 let pos_pushback = emit_hole() in
354 if regno >= 0 then emit_instr op_SETMARK regno;
355 emit_instr op_GOTO (displ lbl1 !progpos);
356 let lbl2 = !progpos in
357 patch_instr pos_pushback op_PUSHBACK lbl2
358 | Option r ->
359 (* Implement longest match semantics for compatibility with old Str *)
360 (* PUSHBACK lbl
361 <match r>
362 lbl:
364 let pos_pushback = emit_hole() in
365 emit_code r;
366 let lbl = !progpos in
367 patch_instr pos_pushback op_PUSHBACK lbl
368 | Group(n, r) ->
369 if n >= 32 then failwith "too many \\(...\\) groups";
370 emit_instr op_BEGGROUP n;
371 emit_code r;
372 emit_instr op_ENDGROUP n;
373 numgroups := max !numgroups (n+1)
374 | Refgroup n ->
375 emit_instr op_REFGROUP n
376 | Bol ->
377 emit_instr op_BOL 0
378 | Eol ->
379 emit_instr op_EOL 0
380 | Wordboundary ->
381 emit_instr op_WORDBOUNDARY 0
383 and emit_seq_code = function
384 [] -> ()
385 | Star(Char _ | CharClass _ as r) :: rl
386 when disjoint_modulo_case (first r) (first_seq rl) ->
387 emit_instr op_SIMPLESTAR (cpool_index (charclass_of_regexp fold_case r));
388 emit_seq_code rl
389 | Plus(Char _ | CharClass _ as r) :: rl
390 when disjoint_modulo_case (first r) (first_seq rl) ->
391 emit_instr op_SIMPLEPLUS (cpool_index (charclass_of_regexp fold_case r));
392 emit_seq_code rl
393 | Option(Char _ | CharClass _ as r) :: rl
394 when disjoint_modulo_case (first r) (first_seq rl) ->
395 emit_instr op_SIMPLEOPT (cpool_index (charclass_of_regexp fold_case r));
396 emit_seq_code rl
397 | r :: rl ->
398 emit_code r;
399 emit_seq_code rl
401 and disjoint_modulo_case c1 c2 =
402 if fold_case
403 then Charset.disjoint (Charset.fold_case c1) (Charset.fold_case c2)
404 else Charset.disjoint c1 c2
407 emit_code re;
408 emit_instr op_ACCEPT 0;
409 let start = first re in
410 let start' = if fold_case then Charset.fold_case start else start in
411 let start_pos =
412 if start = Charset.full
413 then -1
414 else cpool_index (Charset.expand start') in
415 let constantpool = Array.make !cpoolpos "" in
416 StringMap.iter (fun str idx -> constantpool.(idx) <- str) !cpool;
417 { prog = Array.sub !prog 0 !progpos;
418 cpool = constantpool;
419 normtable = if fold_case then fold_case_table else "";
420 numgroups = !numgroups;
421 numregisters = !numregs;
422 startchars = start_pos }
424 (** Parsing of a regular expression *)
426 (* Efficient buffering of sequences *)
428 module SeqBuffer = struct
430 type t = { sb_chars: Buffer.t; mutable sb_next: re_syntax list }
432 let create() = { sb_chars = Buffer.create 16; sb_next = [] }
434 let flush buf =
435 let s = Buffer.contents buf.sb_chars in
436 Buffer.clear buf.sb_chars;
437 match String.length s with
438 0 -> ()
439 | 1 -> buf.sb_next <- Char s.[0] :: buf.sb_next
440 | _ -> buf.sb_next <- String s :: buf.sb_next
442 let add buf re =
443 match re with
444 Char c -> Buffer.add_char buf.sb_chars c
445 | _ -> flush buf; buf.sb_next <- re :: buf.sb_next
447 let extract buf =
448 flush buf; Seq(List.rev buf.sb_next)
452 (* The character class corresponding to `.' *)
454 let dotclass = Charset.complement (Charset.singleton '\n')
456 (* Parse a regular expression *)
458 let parse s =
459 let len = String.length s in
460 let group_counter = ref 1 in
462 let rec regexp0 i =
463 let (r, j) = regexp1 i in
464 regexp0cont r j
465 and regexp0cont r1 i =
466 if i + 2 <= len && s.[i] = '\\' && s.[i+1] = '|' then
467 let (r2, j) = regexp1 (i+2) in
468 regexp0cont (Alt(r1, r2)) j
469 else
470 (r1, i)
471 and regexp1 i =
472 regexp1cont (SeqBuffer.create()) i
473 and regexp1cont sb i =
474 if i >= len
475 || i + 2 <= len && s.[i] = '\\' && (let c = s.[i+1] in c = '|' || c = ')')
476 then
477 (SeqBuffer.extract sb, i)
478 else
479 let (r, j) = regexp2 i in
480 SeqBuffer.add sb r;
481 regexp1cont sb j
482 and regexp2 i =
483 let (r, j) = regexp3 i in
484 regexp2cont r j
485 and regexp2cont r i =
486 if i >= len then (r, i) else
487 match s.[i] with
488 '?' -> regexp2cont (Option r) (i+1)
489 | '*' -> regexp2cont (Star r) (i+1)
490 | '+' -> regexp2cont (Plus r) (i+1)
491 | _ -> (r, i)
492 and regexp3 i =
493 match s.[i] with
494 '\\' -> regexpbackslash (i+1)
495 | '[' -> let (c, j) = regexpclass0 (i+1) in (CharClass c, j)
496 | '^' -> (Bol, i+1)
497 | '$' -> (Eol, i+1)
498 | '.' -> (CharClass dotclass, i+1)
499 | c -> (Char c, i+1)
500 and regexpbackslash i =
501 if i >= len then (Char '\\', i) else
502 match s.[i] with
503 '|' | ')' ->
504 assert false
505 | '(' ->
506 let group_no = !group_counter in
507 if group_no < 32 then incr group_counter;
508 let (r, j) = regexp0 (i+1) in
509 if j + 1 < len && s.[j] = '\\' && s.[j+1] = ')' then
510 if group_no < 32
511 then (Group(group_no, r), j + 2)
512 else (r, j + 2)
513 else
514 failwith "\\( group not closed by \\)"
515 | '1' .. '9' as c ->
516 (Refgroup(Char.code c - 48), i + 1)
517 | 'b' ->
518 (Wordboundary, i + 1)
519 | c ->
520 (Char c, i + 1)
521 and regexpclass0 i =
522 if i < len && s.[i] = '^'
523 then let (c, j) = regexpclass1 (i+1) in (Charset.complement c, j)
524 else regexpclass1 i
525 and regexpclass1 i =
526 let c = Charset.make_empty() in
527 let j = regexpclass2 c i i in
528 (c, j)
529 and regexpclass2 c start i =
530 if i >= len then failwith "[ class not closed by ]";
531 if s.[i] = ']' && i > start then i+1 else begin
532 let c1 = s.[i] in
533 if i+2 < len && s.[i+1] = '-' && s.[i+2] <> ']' then begin
534 let c2 = s.[i+2] in
535 Charset.add_range c c1 c2;
536 regexpclass2 c start (i+3)
537 end else begin
538 Charset.add c c1;
539 regexpclass2 c start (i+1)
541 end in
543 let (r, j) = regexp0 0 in
544 if j = len then r else failwith "spurious \\) in regular expression"
546 (** Parsing and compilation *)
548 let regexp e = compile false (parse e)
550 let regexp_case_fold e = compile true (parse e)
552 let quote s =
553 let len = String.length s in
554 let buf = String.create (2 * len) in
555 let pos = ref 0 in
556 for i = 0 to len - 1 do
557 match s.[i] with
558 '[' | ']' | '*' | '.' | '\\' | '?' | '+' | '^' | '$' as c ->
559 buf.[!pos] <- '\\'; buf.[!pos + 1] <- c; pos := !pos + 2
560 | c ->
561 buf.[!pos] <- c; pos := !pos + 1
562 done;
563 String.sub buf 0 !pos
565 let regexp_string s = compile false (String s)
567 let regexp_string_case_fold s = compile true (String s)
569 (** Matching functions **)
571 external re_string_match: regexp -> string -> int -> int array
572 = "re_string_match"
573 external re_partial_match: regexp -> string -> int -> int array
574 = "re_partial_match"
575 external re_search_forward: regexp -> string -> int -> int array
576 = "re_search_forward"
577 external re_search_backward: regexp -> string -> int -> int array
578 = "re_search_backward"
580 let last_search_result = ref [||]
582 let string_match re s pos =
583 let res = re_string_match re s pos in
584 last_search_result := res;
585 Array.length res > 0
587 let string_partial_match re s pos =
588 let res = re_partial_match re s pos in
589 last_search_result := res;
590 Array.length res > 0
592 let search_forward re s pos =
593 let res = re_search_forward re s pos in
594 last_search_result := res;
595 if Array.length res = 0 then raise Not_found else res.(0)
597 let search_backward re s pos =
598 let res = re_search_backward re s pos in
599 last_search_result := res;
600 if Array.length res = 0 then raise Not_found else res.(0)
602 let group_beginning n =
603 let n2 = n + n in
604 if n < 0 || n2 >= Array.length !last_search_result then
605 invalid_arg "Str.group_beginning"
606 else
607 let pos = !last_search_result.(n2) in
608 if pos = -1 then raise Not_found else pos
610 let group_end n =
611 let n2 = n + n in
612 if n < 0 || n2 >= Array.length !last_search_result then
613 invalid_arg "Str.group_end"
614 else
615 let pos = !last_search_result.(n2 + 1) in
616 if pos = -1 then raise Not_found else pos
618 let matched_group n txt =
619 let n2 = n + n in
620 if n < 0 || n2 >= Array.length !last_search_result then
621 invalid_arg "Str.matched_group"
622 else
623 let b = !last_search_result.(n2)
624 and e = !last_search_result.(n2 + 1) in
625 if b = -1 then raise Not_found else String.sub txt b (e - b)
627 let match_beginning () = group_beginning 0
628 and match_end () = group_end 0
629 and matched_string txt = matched_group 0 txt
631 (** Replacement **)
633 external re_replacement_text: string -> int array -> string -> string
634 = "re_replacement_text"
636 let replace_matched repl matched =
637 re_replacement_text repl !last_search_result matched
639 let substitute_first expr repl_fun text =
641 let pos = search_forward expr text 0 in
642 String.concat "" [string_before text pos;
643 repl_fun text;
644 string_after text (match_end())]
645 with Not_found ->
646 text
648 let global_substitute expr repl_fun text =
649 let rec replace start last_was_empty =
651 let startpos = if last_was_empty then start + 1 else start in
652 if startpos > String.length text then raise Not_found;
653 let pos = search_forward expr text startpos in
654 let end_pos = match_end() in
655 let repl_text = repl_fun text in
656 String.sub text start (pos-start) ::
657 repl_text ::
658 replace end_pos (end_pos = pos)
659 with Not_found ->
660 [string_after text start] in
661 String.concat "" (replace 0 false)
663 let global_replace expr repl text =
664 global_substitute expr (replace_matched repl) text
665 and replace_first expr repl text =
666 substitute_first expr (replace_matched repl) text
668 (** Splitting *)
670 let search_forward_progress expr text start =
671 let pos = search_forward expr text start in
672 if match_end() > start then pos
673 else if start < String.length text then search_forward expr text (start + 1)
674 else raise Not_found
676 let bounded_split expr text num =
677 let start =
678 if string_match expr text 0 then match_end() else 0 in
679 let rec split start n =
680 if start >= String.length text then [] else
681 if n = 1 then [string_after text start] else
683 let pos = search_forward_progress expr text start in
684 String.sub text start (pos-start) :: split (match_end()) (n-1)
685 with Not_found ->
686 [string_after text start] in
687 split start num
689 let split expr text = bounded_split expr text 0
691 let bounded_split_delim expr text num =
692 let rec split start n =
693 if start > String.length text then [] else
694 if n = 1 then [string_after text start] else
696 let pos = search_forward_progress expr text start in
697 String.sub text start (pos-start) :: split (match_end()) (n-1)
698 with Not_found ->
699 [string_after text start] in
700 if text = "" then [] else split 0 num
702 let split_delim expr text = bounded_split_delim expr text 0
704 type split_result = Text of string | Delim of string
706 let bounded_full_split expr text num =
707 let rec split start n =
708 if start >= String.length text then [] else
709 if n = 1 then [Text(string_after text start)] else
711 let pos = search_forward_progress expr text start in
712 let s = matched_string text in
713 if pos > start then
714 Text(String.sub text start (pos-start)) ::
715 Delim(s) ::
716 split (match_end()) (n-1)
717 else
718 Delim(s) ::
719 split (match_end()) (n-1)
720 with Not_found ->
721 [Text(string_after text start)] in
722 split 0 num
724 let full_split expr text = bounded_full_split expr text 0