Merge commit 'ocaml3102'
[ocaml.git] / lex / lexgen.ml
blob5df5bcf10a69faaa38ba4c5f345161fd8fe3da12
1 (***********************************************************************)
2 (* *)
3 (* Objective Caml *)
4 (* *)
5 (* Xavier Leroy, projet Cristal, *)
6 (* Luc Maranget, projet Moscova, *)
7 (* INRIA Rocquencourt *)
8 (* *)
9 (* Copyright 1996 Institut National de Recherche en Informatique et *)
10 (* en Automatique. All rights reserved. This file is distributed *)
11 (* under the terms of the Q Public License version 1.0. *)
12 (* *)
13 (***********************************************************************)
15 (* $Id$ *)
17 (* Compiling a lexer definition *)
19 open Syntax
20 open Printf
22 exception Memory_overflow
24 (* Deep abstract syntax for regular expressions *)
26 type ident = string * Syntax.location
28 type tag_info = {id : string ; start : bool ; action : int}
30 type regexp =
31 Empty
32 | Chars of int * bool
33 | Action of int
34 | Tag of tag_info
35 | Seq of regexp * regexp
36 | Alt of regexp * regexp
37 | Star of regexp
39 type tag_base = Start | End | Mem of int
40 type tag_addr = Sum of (tag_base * int)
41 type ident_info =
42 | Ident_string of bool * tag_addr * tag_addr
43 | Ident_char of bool * tag_addr
44 type t_env = (ident * ident_info) list
46 type ('args,'action) lexer_entry =
47 { lex_name: string;
48 lex_regexp: regexp;
49 lex_mem_tags: int ;
50 lex_actions: (int * t_env * 'action) list }
53 type automata =
54 Perform of int * tag_action list
55 | Shift of automata_trans * (automata_move * memory_action list) array
57 and automata_trans =
58 No_remember
59 | Remember of int * tag_action list
61 and automata_move =
62 Backtrack
63 | Goto of int
65 and memory_action =
66 | Copy of int * int
67 | Set of int
69 and tag_action = SetTag of int * int | EraseTag of int
71 (* Representation of entry points *)
73 type ('args,'action) automata_entry =
74 { auto_name: string;
75 auto_args: 'args ;
76 auto_mem_size : int ;
77 auto_initial_state: int * memory_action list;
78 auto_actions: (int * t_env * 'action) list }
81 (* A lot of sets and map structures *)
83 module Ints = Set.Make(struct type t = int let compare = compare end)
85 let id_compare (id1,_) (id2,_) = String.compare id1 id2
87 let tag_compare t1 t2 = Pervasives.compare t1 t2
89 module Tags = Set.Make(struct type t = tag_info let compare = tag_compare end)
91 module TagMap =
92 Map.Make (struct type t = tag_info let compare = tag_compare end)
94 module IdSet =
95 Set.Make (struct type t = ident let compare = id_compare end)
97 module IdMap =
98 Map.Make (struct type t = ident let compare = id_compare end)
100 (*********************)
101 (* Variable cleaning *)
102 (*********************)
104 (* Silently eliminate nested variables *)
106 let rec do_remove_nested to_remove = function
107 | Bind (e,x) ->
108 if IdSet.mem x to_remove then
109 do_remove_nested to_remove e
110 else
111 Bind (do_remove_nested (IdSet.add x to_remove) e, x)
112 | Epsilon|Eof|Characters _ as e -> e
113 | Sequence (e1, e2) ->
114 Sequence
115 (do_remove_nested to_remove e1, do_remove_nested to_remove e2)
116 | Alternative (e1, e2) ->
117 Alternative
118 (do_remove_nested to_remove e1, do_remove_nested to_remove e2)
119 | Repetition e ->
120 Repetition (do_remove_nested to_remove e)
122 let remove_nested_as e = do_remove_nested IdSet.empty e
124 (*********************)
125 (* Variable analysis *)
126 (*********************)
129 Optional variables.
130 A variable is optional when matching of regexp does not
131 implies it binds.
132 The typical case is:
133 ("" | 'a' as x) -> optional
134 ("" as x | 'a' as x) -> non-optional
137 let stringset_delta s1 s2 =
138 IdSet.union
139 (IdSet.diff s1 s2)
140 (IdSet.diff s2 s1)
142 let rec find_all_vars = function
143 | Characters _|Epsilon|Eof ->
144 IdSet.empty
145 | Bind (e,x) ->
146 IdSet.add x (find_all_vars e)
147 | Sequence (e1,e2)|Alternative (e1,e2) ->
148 IdSet.union (find_all_vars e1) (find_all_vars e2)
149 | Repetition e -> find_all_vars e
152 let rec do_find_opt = function
153 | Characters _|Epsilon|Eof -> IdSet.empty, IdSet.empty
154 | Bind (e,x) ->
155 let opt,all = do_find_opt e in
156 opt, IdSet.add x all
157 | Sequence (e1,e2) ->
158 let opt1,all1 = do_find_opt e1
159 and opt2,all2 = do_find_opt e2 in
160 IdSet.union opt1 opt2, IdSet.union all1 all2
161 | Alternative (e1,e2) ->
162 let opt1,all1 = do_find_opt e1
163 and opt2,all2 = do_find_opt e2 in
164 IdSet.union
165 (IdSet.union opt1 opt2)
166 (stringset_delta all1 all2),
167 IdSet.union all1 all2
168 | Repetition e ->
169 let r = find_all_vars e in
172 let find_optional e =
173 let r,_ = do_find_opt e in r
176 Double variables
177 A variable is double when it can be bound more than once
178 in a single matching
179 The typical case is:
180 (e1 as x) (e2 as x)
184 let rec do_find_double = function
185 | Characters _|Epsilon|Eof -> IdSet.empty, IdSet.empty
186 | Bind (e,x) ->
187 let dbl,all = do_find_double e in
188 (if IdSet.mem x all then
189 IdSet.add x dbl
190 else
191 dbl),
192 IdSet.add x all
193 | Sequence (e1,e2) ->
194 let dbl1, all1 = do_find_double e1
195 and dbl2, all2 = do_find_double e2 in
196 IdSet.union
197 (IdSet.inter all1 all2)
198 (IdSet.union dbl1 dbl2),
199 IdSet.union all1 all2
200 | Alternative (e1,e2) ->
201 let dbl1, all1 = do_find_double e1
202 and dbl2, all2 = do_find_double e2 in
203 IdSet.union dbl1 dbl2,
204 IdSet.union all1 all2
205 | Repetition e ->
206 let r = find_all_vars e in
209 let find_double e = do_find_double e
212 Type of variables:
213 A variable is bound to a char when all its occurences
214 bind a pattern of length 1.
215 The typical case is:
216 (_ as x) -> char
219 let add_some x = function
220 | Some i -> Some (x+i)
221 | None -> None
223 let add_some_some x y = match x,y with
224 | Some i, Some j -> Some (i+j)
225 | _,_ -> None
227 let rec do_find_chars sz = function
228 | Epsilon|Eof -> IdSet.empty, IdSet.empty, sz
229 | Characters _ -> IdSet.empty, IdSet.empty, add_some 1 sz
230 | Bind (e,x) ->
231 let c,s,e_sz = do_find_chars (Some 0) e in
232 begin match e_sz with
233 | Some 1 ->
234 IdSet.add x c,s,add_some 1 sz
235 | _ ->
236 c, IdSet.add x s, add_some_some sz e_sz
238 | Sequence (e1,e2) ->
239 let c1,s1,sz1 = do_find_chars sz e1 in
240 let c2,s2,sz2 = do_find_chars sz1 e2 in
241 IdSet.union c1 c2,
242 IdSet.union s1 s2,
244 | Alternative (e1,e2) ->
245 let c1,s1,sz1 = do_find_chars sz e1
246 and c2,s2,sz2 = do_find_chars sz e2 in
247 IdSet.union c1 c2,
248 IdSet.union s1 s2,
249 (if sz1 = sz2 then sz1 else None)
250 | Repetition e -> do_find_chars None e
254 let find_chars e =
255 let c,s,_ = do_find_chars (Some 0) e in
256 IdSet.diff c s
258 (*******************************)
259 (* From shallow to deep syntax *)
260 (*******************************)
262 let chars = ref ([] : Cset.t list)
263 let chars_count = ref 0
266 let rec encode_regexp char_vars act = function
267 Epsilon -> Empty
268 | Characters cl ->
269 let n = !chars_count in
270 chars := cl :: !chars;
271 incr chars_count;
272 Chars(n,false)
273 | Eof ->
274 let n = !chars_count in
275 chars := Cset.eof :: !chars;
276 incr chars_count;
277 Chars(n,true)
278 | Sequence(r1,r2) ->
279 let r1 = encode_regexp char_vars act r1 in
280 let r2 = encode_regexp char_vars act r2 in
281 Seq (r1, r2)
282 | Alternative(r1,r2) ->
283 let r1 = encode_regexp char_vars act r1 in
284 let r2 = encode_regexp char_vars act r2 in
285 Alt(r1, r2)
286 | Repetition r ->
287 let r = encode_regexp char_vars act r in
288 Star r
289 | Bind (r,((name,_) as x)) ->
290 let r = encode_regexp char_vars act r in
291 if IdSet.mem x char_vars then
292 Seq (Tag {id=name ; start=true ; action=act},r)
293 else
294 Seq (Tag {id=name ; start=true ; action=act},
295 Seq (r, Tag {id=name ; start=false ; action=act}))
298 (* Optimisation,
299 Static optimization :
300 Replace tags by offsets relative to the beginning
301 or end of matched string.
302 Dynamic optimization:
303 Replace some non-optional, non-double tags by offsets w.r.t
304 a previous similar tag.
307 let incr_pos = function
308 | None -> None
309 | Some i -> Some (i+1)
311 let decr_pos = function
312 | None -> None
313 | Some i -> Some (i-1)
316 let opt = true
318 let mk_seq r1 r2 = match r1,r2 with
319 | Empty,_ -> r2
320 | _,Empty -> r1
321 | _,_ -> Seq (r1,r2)
323 let add_pos p i = match p with
324 | Some (Sum (a,n)) -> Some (Sum (a,n+i))
325 | None -> None
327 let mem_name name id_set =
328 IdSet.exists (fun (id_name,_) -> name = id_name) id_set
330 let opt_regexp all_vars char_vars optional_vars double_vars r =
332 (* From removed tags to their addresses *)
333 let env = Hashtbl.create 17 in
335 (* First static optimizations, from start position *)
336 let rec size_forward pos = function
337 | Empty|Chars (_,true)|Tag _ -> Some pos
338 | Chars (_,false) -> Some (pos+1)
339 | Seq (r1,r2) ->
340 begin match size_forward pos r1 with
341 | None -> None
342 | Some pos -> size_forward pos r2
344 | Alt (r1,r2) ->
345 let pos1 = size_forward pos r1
346 and pos2 = size_forward pos r2 in
347 if pos1=pos2 then pos1 else None
348 | Star _ -> None
349 | Action _ -> assert false in
351 let rec simple_forward pos r = match r with
352 | Tag n ->
353 if mem_name n.id double_vars then
354 r,Some pos
355 else begin
356 Hashtbl.add env (n.id,n.start) (Sum (Start, pos)) ;
357 Empty,Some pos
359 | Empty -> r, Some pos
360 | Chars (_,is_eof) ->
361 r,Some (if is_eof then pos else pos+1)
362 | Seq (r1,r2) ->
363 let r1,pos = simple_forward pos r1 in
364 begin match pos with
365 | None -> mk_seq r1 r2,None
366 | Some pos ->
367 let r2,pos = simple_forward pos r2 in
368 mk_seq r1 r2,pos
370 | Alt (r1,r2) ->
371 let pos1 = size_forward pos r1
372 and pos2 = size_forward pos r2 in
373 r,(if pos1=pos2 then pos1 else None)
374 | Star _ -> r,None
375 | Action _ -> assert false in
377 (* Then static optimizations, from end position *)
378 let rec size_backward pos = function
379 | Empty|Chars (_,true)|Tag _ -> Some pos
380 | Chars (_,false) -> Some (pos-1)
381 | Seq (r1,r2) ->
382 begin match size_backward pos r2 with
383 | None -> None
384 | Some pos -> size_backward pos r1
386 | Alt (r1,r2) ->
387 let pos1 = size_backward pos r1
388 and pos2 = size_backward pos r2 in
389 if pos1=pos2 then pos1 else None
390 | Star _ -> None
391 | Action _ -> assert false in
394 let rec simple_backward pos r = match r with
395 | Tag n ->
396 if mem_name n.id double_vars then
397 r,Some pos
398 else begin
399 Hashtbl.add env (n.id,n.start) (Sum (End, pos)) ;
400 Empty,Some pos
402 | Empty -> r,Some pos
403 | Chars (_,is_eof) ->
404 r,Some (if is_eof then pos else pos-1)
405 | Seq (r1,r2) ->
406 let r2,pos = simple_backward pos r2 in
407 begin match pos with
408 | None -> mk_seq r1 r2,None
409 | Some pos ->
410 let r1,pos = simple_backward pos r1 in
411 mk_seq r1 r2,pos
413 | Alt (r1,r2) ->
414 let pos1 = size_backward pos r1
415 and pos2 = size_backward pos r2 in
416 r,(if pos1=pos2 then pos1 else None)
417 | Star _ -> r,None
418 | Action _ -> assert false in
420 let r =
421 if opt then
422 let r,_ = simple_forward 0 r in
423 let r,_ = simple_backward 0 r in
425 else
426 r in
428 let loc_count = ref 0 in
429 let get_tag_addr t =
431 Hashtbl.find env t
432 with
433 | Not_found ->
434 let n = !loc_count in
435 incr loc_count ;
436 Hashtbl.add env t (Sum (Mem n,0)) ;
437 Sum (Mem n,0) in
439 let rec alloc_exp pos r = match r with
440 | Tag n ->
441 if mem_name n.id double_vars then
442 r,pos
443 else begin match pos with
444 | Some a ->
445 Hashtbl.add env (n.id,n.start) a ;
446 Empty,pos
447 | None ->
448 let a = get_tag_addr (n.id,n.start) in
449 r,Some a
452 | Empty -> r,pos
453 | Chars (_,is_eof) -> r,(if is_eof then pos else add_pos pos 1)
454 | Seq (r1,r2) ->
455 let r1,pos = alloc_exp pos r1 in
456 let r2,pos = alloc_exp pos r2 in
457 mk_seq r1 r2,pos
458 | Alt (_,_) ->
459 let off = size_forward 0 r in
460 begin match off with
461 | Some i -> r,add_pos pos i
462 | None -> r,None
464 | Star _ -> r,None
465 | Action _ -> assert false in
467 let r,_ = alloc_exp None r in
468 let m =
469 IdSet.fold
470 (fun ((name,_) as x) r ->
472 let v =
473 if IdSet.mem x char_vars then
474 Ident_char
475 (IdSet.mem x optional_vars, get_tag_addr (name,true))
476 else
477 Ident_string
478 (IdSet.mem x optional_vars,
479 get_tag_addr (name,true),
480 get_tag_addr (name,false)) in
481 (x,v)::r)
482 all_vars [] in
483 m,r, !loc_count
487 let encode_casedef casedef =
488 let r =
489 List.fold_left
490 (fun (reg,actions,count,ntags) (expr, act) ->
491 let expr = remove_nested_as expr in
492 let char_vars = find_chars expr in
493 let r = encode_regexp char_vars count expr
494 and opt_vars = find_optional expr
495 and double_vars,all_vars = find_double expr in
496 let m,r,loc_ntags =
497 opt_regexp all_vars char_vars opt_vars double_vars r in
498 Alt(reg, Seq(r, Action count)),
499 (count, m ,act) :: actions,
500 (succ count),
501 max loc_ntags ntags)
502 (Empty, [], 0, 0)
503 casedef in
506 let encode_lexdef def =
507 chars := [];
508 chars_count := 0;
509 let entry_list =
510 List.map
511 (fun {name=entry_name ; args=args ; shortest=shortest ; clauses= casedef} ->
512 let (re,actions,_,ntags) = encode_casedef casedef in
513 { lex_name = entry_name;
514 lex_regexp = re;
515 lex_mem_tags = ntags ;
516 lex_actions = List.rev actions },args,shortest)
517 def in
518 let chr = Array.of_list (List.rev !chars) in
519 chars := [];
520 (chr, entry_list)
522 (* To generate directly a NFA from a regular expression.
523 Confer Aho-Sethi-Ullman, dragon book, chap. 3
524 Extension to tagged automata.
525 Confer
526 Ville Larikari
527 ``NFAs with Tagged Transitions, their Conversion to Deterministic
528 Automata and Application to Regular Expressions''.
529 Symposium on String Processing and Information Retrieval (SPIRE 2000),
530 http://kouli.iki.fi/~vlaurika/spire2000-tnfa.ps
531 (See also)
532 http://kouli.iki.fi/~vlaurika/regex-submatch.ps.gz
535 type t_transition =
536 OnChars of int
537 | ToAction of int
539 type transition = t_transition * Tags.t
541 let trans_compare (t1,tags1) (t2,tags2) =
542 match Pervasives.compare t1 t2 with
543 | 0 -> Tags.compare tags1 tags2
544 | r -> r
547 module TransSet =
548 Set.Make(struct type t = transition let compare = trans_compare end)
550 let rec nullable = function
551 | Empty|Tag _ -> true
552 | Chars (_,_)|Action _ -> false
553 | Seq(r1,r2) -> nullable r1 && nullable r2
554 | Alt(r1,r2) -> nullable r1 || nullable r2
555 | Star r -> true
557 let rec emptymatch = function
558 | Empty | Chars (_,_) | Action _ -> Tags.empty
559 | Tag t -> Tags.add t Tags.empty
560 | Seq (r1,r2) -> Tags.union (emptymatch r1) (emptymatch r2)
561 | Alt(r1,r2) ->
562 if nullable r1 then
563 emptymatch r1
564 else
565 emptymatch r2
566 | Star r ->
567 if nullable r then
568 emptymatch r
569 else
570 Tags.empty
572 let addtags transs tags =
573 TransSet.fold
574 (fun (t,tags_t) r -> TransSet.add (t, Tags.union tags tags_t) r)
575 transs TransSet.empty
578 let rec firstpos = function
579 Empty|Tag _ -> TransSet.empty
580 | Chars (pos,_) -> TransSet.add (OnChars pos,Tags.empty) TransSet.empty
581 | Action act -> TransSet.add (ToAction act,Tags.empty) TransSet.empty
582 | Seq(r1,r2) ->
583 if nullable r1 then
584 TransSet.union (firstpos r1) (addtags (firstpos r2) (emptymatch r1))
585 else
586 firstpos r1
587 | Alt(r1,r2) -> TransSet.union (firstpos r1) (firstpos r2)
588 | Star r -> firstpos r
591 (* Berry-sethi followpos *)
592 let followpos size entry_list =
593 let v = Array.create size TransSet.empty in
594 let rec fill s = function
595 | Empty|Action _|Tag _ -> ()
596 | Chars (n,_) -> v.(n) <- s
597 | Alt (r1,r2) ->
598 fill s r1 ; fill s r2
599 | Seq (r1,r2) ->
600 fill
601 (if nullable r2 then
602 TransSet.union (firstpos r2) (addtags s (emptymatch r2))
603 else
604 (firstpos r2))
605 r1 ;
606 fill s r2
607 | Star r ->
608 fill (TransSet.union (firstpos r) s) r in
609 List.iter (fun (entry,_,_) -> fill TransSet.empty entry.lex_regexp) entry_list ;
612 (************************)
613 (* The algorithm itself *)
614 (************************)
616 let no_action = max_int
618 module StateSet =
619 Set.Make (struct type t = t_transition let compare = Pervasives.compare end)
622 module MemMap =
623 Map.Make (struct type t = int let compare = Pervasives.compare end)
625 type 'a dfa_state =
626 {final : int * ('a * int TagMap.t) ;
627 others : ('a * int TagMap.t) MemMap.t}
630 let dtag oc t =
631 fprintf oc "%s<%s>" t.id (if t.start then "s" else "e")
633 let dmem_map dp ds m =
634 MemMap.iter
635 (fun k x ->
636 eprintf "%d -> " k ; dp x ; ds ())
639 and dtag_map dp ds m =
640 TagMap.iter
641 (fun t x ->
642 dtag stderr t ; eprintf " -> " ; dp x ; ds ())
645 let dstate {final=(act,(_,m)) ; others=o} =
646 if act <> no_action then begin
647 eprintf "final=%d " act ;
648 dtag_map (fun x -> eprintf "%d" x) (fun () -> prerr_string " ,") m ;
649 prerr_endline ""
650 end ;
651 dmem_map
652 (fun (_,m) ->
653 dtag_map (fun x -> eprintf "%d" x) (fun () -> prerr_string " ,") m)
654 (fun () -> prerr_endline "")
658 let dfa_state_empty =
659 {final=(no_action, (max_int,TagMap.empty)) ;
660 others=MemMap.empty}
662 and dfa_state_is_empty {final=(act,_) ; others=o} =
663 act = no_action &&
664 o = MemMap.empty
667 (* A key is an abstraction on a dfa state,
668 two states with the same key can be made the same by
669 copying some memory cells into others *)
672 module StateSetSet =
673 Set.Make (struct type t = StateSet.t let compare = StateSet.compare end)
675 type t_equiv = {tag:tag_info ; equiv:StateSetSet.t}
677 module MemKey =
678 Set.Make
679 (struct
680 type t = t_equiv
682 let compare e1 e2 = match Pervasives.compare e1.tag e2.tag with
683 | 0 -> StateSetSet.compare e1.equiv e2.equiv
684 | r -> r
685 end)
687 type dfa_key = {kstate : StateSet.t ; kmem : MemKey.t}
689 (* Map a state to its key *)
690 let env_to_class m =
691 let env1 =
692 MemMap.fold
693 (fun _ (tag,s) r ->
695 let ss = TagMap.find tag r in
696 let r = TagMap.remove tag r in
697 TagMap.add tag (StateSetSet.add s ss) r
698 with
699 | Not_found ->
700 TagMap.add tag (StateSetSet.add s StateSetSet.empty) r)
701 m TagMap.empty in
702 TagMap.fold
703 (fun tag ss r -> MemKey.add {tag=tag ; equiv=ss} r)
704 env1 MemKey.empty
707 (* trans is nfa_state, m is associated memory map *)
708 let inverse_mem_map trans m r =
709 TagMap.fold
710 (fun tag addr r ->
712 let otag,s = MemMap.find addr r in
713 assert (tag = otag) ;
714 let r = MemMap.remove addr r in
715 MemMap.add addr (tag,StateSet.add trans s) r
716 with
717 | Not_found ->
718 MemMap.add addr (tag,StateSet.add trans StateSet.empty) r)
721 let inverse_mem_map_other n (_,m) r = inverse_mem_map (OnChars n) m r
723 let get_key {final=(act,(_,m_act)) ; others=o} =
724 let env =
725 MemMap.fold inverse_mem_map_other
727 (if act = no_action then MemMap.empty
728 else inverse_mem_map (ToAction act) m_act MemMap.empty) in
729 let state_key =
730 MemMap.fold (fun n _ r -> StateSet.add (OnChars n) r) o
731 (if act=no_action then StateSet.empty
732 else StateSet.add (ToAction act) StateSet.empty) in
733 let mem_key = env_to_class env in
734 {kstate = state_key ; kmem = mem_key}
737 let key_compare k1 k2 = match StateSet.compare k1.kstate k2.kstate with
738 | 0 -> MemKey.compare k1.kmem k2.kmem
739 | r -> r
741 (* Association dfa_state -> state_num *)
743 module StateMap =
744 Map.Make(struct type t = dfa_key let compare = key_compare end)
746 let state_map = ref (StateMap.empty : int StateMap.t)
747 let todo = Stack.create()
748 let next_state_num = ref 0
749 let next_mem_cell = ref 0
750 let temp_pending = ref false
751 let tag_cells = Hashtbl.create 17
752 let state_table = Table.create dfa_state_empty
755 let reset_state_mem () =
756 state_map := StateMap.empty;
757 Stack.clear todo;
758 next_state_num := 0 ;
759 let _ = Table.trim state_table in
762 (* Allocation of memory cells *)
763 let reset_cell_mem ntags =
764 next_mem_cell := ntags ;
765 Hashtbl.clear tag_cells ;
766 temp_pending := false
768 let do_alloc_temp () =
769 temp_pending := true ;
770 let n = !next_mem_cell in
773 let do_alloc_cell used t =
774 let available =
775 try Hashtbl.find tag_cells t with Not_found -> Ints.empty in
777 Ints.choose (Ints.diff available used)
778 with
779 | Not_found ->
780 temp_pending := false ;
781 let n = !next_mem_cell in
782 if n >= 255 then raise Memory_overflow ;
783 Hashtbl.replace tag_cells t (Ints.add n available) ;
784 incr next_mem_cell ;
787 let is_old_addr a = a >= 0
788 and is_new_addr a = a < 0
790 let old_in_map m r =
791 TagMap.fold
792 (fun _ addr r ->
793 if is_old_addr addr then
794 Ints.add addr r
795 else
799 let alloc_map used m mvs =
800 TagMap.fold
801 (fun tag a (r,mvs) ->
802 let a,mvs =
803 if is_new_addr a then
804 let a = do_alloc_cell used tag in
805 a,Ints.add a mvs
806 else a,mvs in
807 TagMap.add tag a r,mvs)
808 m (TagMap.empty,mvs)
810 let create_new_state {final=(act,(_,m_act)) ; others=o} =
811 let used =
812 MemMap.fold (fun _ (_,m) r -> old_in_map m r)
813 o (old_in_map m_act Ints.empty) in
815 let new_m_act,mvs = alloc_map used m_act Ints.empty in
816 let new_o,mvs =
817 MemMap.fold (fun k (x,m) (r,mvs) ->
818 let m,mvs = alloc_map used m mvs in
819 MemMap.add k (x,m) r,mvs)
820 o (MemMap.empty,mvs) in
821 {final=(act,(0,new_m_act)) ; others=new_o},
822 Ints.fold (fun x r -> Set x::r) mvs []
824 type new_addr_gen = {mutable count : int ; mutable env : int TagMap.t}
826 let create_new_addr_gen () = {count = -1 ; env = TagMap.empty}
828 let alloc_new_addr tag r =
830 TagMap.find tag r.env
831 with
832 | Not_found ->
833 let a = r.count in
834 r.count <- a-1 ;
835 r.env <- TagMap.add tag a r.env ;
839 let create_mem_map tags gen =
840 Tags.fold
841 (fun tag r -> TagMap.add tag (alloc_new_addr tag gen) r)
842 tags TagMap.empty
844 let create_init_state pos =
845 let gen = create_new_addr_gen () in
846 let st =
847 TransSet.fold
848 (fun (t,tags) st ->
849 match t with
850 | ToAction n ->
851 let on,otags = st.final in
852 if n < on then
853 {st with final = (n, (0,create_mem_map tags gen))}
854 else
856 | OnChars n ->
858 let _ = MemMap.find n st.others in assert false
859 with
860 | Not_found ->
861 {st with others =
862 MemMap.add n (0,create_mem_map tags gen) st.others})
863 pos dfa_state_empty in
867 let get_map t st = match t with
868 | ToAction _ -> let _,(_,m) = st.final in m
869 | OnChars n ->
870 let (_,m) = MemMap.find n st.others in
873 let dest = function | Copy (d,_) | Set d -> d
874 and orig = function | Copy (_,o) -> o | Set _ -> -1
876 let pmv oc mv = fprintf oc "%d <- %d" (dest mv) (orig mv)
877 let pmvs oc mvs =
878 List.iter (fun mv -> fprintf oc "%a " pmv mv) mvs ;
879 output_char oc '\n' ; flush oc
882 (* Topological sort << a la louche >> *)
883 let sort_mvs mvs =
884 let rec do_rec r mvs = match mvs with
885 | [] -> r
886 | _ ->
887 let dests =
888 List.fold_left
889 (fun r mv -> Ints.add (dest mv) r)
890 Ints.empty mvs in
891 let rem,here =
892 List.partition
893 (fun mv -> Ints.mem (orig mv) dests)
894 mvs in
895 match here with
896 | [] ->
897 begin match rem with
898 | Copy (d,_)::_ ->
899 let d' = do_alloc_temp () in
900 Copy (d',d)::
901 do_rec r
902 (List.map
903 (fun mv ->
904 if orig mv = d then
905 Copy (dest mv,d')
906 else
908 rem)
909 | _ -> assert false
911 | _ -> do_rec (here@r) rem in
912 do_rec [] mvs
914 let move_to mem_key src tgt =
915 let mvs =
916 MemKey.fold
917 (fun {tag=tag ; equiv=m} r ->
918 StateSetSet.fold
919 (fun s r ->
921 let t = StateSet.choose s in
922 let src = TagMap.find tag (get_map t src)
923 and tgt = TagMap.find tag (get_map t tgt) in
924 if src <> tgt then begin
925 if is_new_addr src then
926 Set tgt::r
927 else
928 Copy (tgt, src)::r
929 end else
931 with
932 | Not_found -> assert false)
933 m r)
934 mem_key [] in
935 (* Moves are topologically sorted *)
936 sort_mvs mvs
939 let get_state st =
940 let key = get_key st in
942 let num = StateMap.find key !state_map in
943 num,move_to key.kmem st (Table.get state_table num)
944 with Not_found ->
945 let num = !next_state_num in
946 incr next_state_num;
947 let st,mvs = create_new_state st in
948 Table.emit state_table st ;
949 state_map := StateMap.add key num !state_map;
950 Stack.push (st, num) todo;
951 num,mvs
953 let map_on_all_states f old_res =
954 let res = ref old_res in
955 begin try
956 while true do
957 let (st, i) = Stack.pop todo in
958 let r = f st in
959 res := (r, i) :: !res
960 done
961 with Stack.Empty -> ()
962 end;
963 !res
965 let goto_state st =
967 dfa_state_is_empty st
968 then
969 Backtrack,[]
970 else
971 let n,moves = get_state st in
972 Goto n,moves
974 (****************************)
975 (* compute reachable states *)
976 (****************************)
978 let add_tags_to_map gen tags m =
979 Tags.fold
980 (fun tag m ->
981 let m = TagMap.remove tag m in
982 TagMap.add tag (alloc_new_addr tag gen) m)
983 tags m
985 let apply_transition gen r pri m = function
986 | ToAction n,tags ->
987 let on,(opri,_) = r.final in
988 if n < on || (on=n && pri < opri) then
989 let m = add_tags_to_map gen tags m in
990 {r with final=n,(pri,m)}
991 else r
992 | OnChars n,tags ->
994 let (opri,_) = MemMap.find n r.others in
995 if pri < opri then
996 let m = add_tags_to_map gen tags m in
997 {r with others=MemMap.add n (pri,m) (MemMap.remove n r.others)}
998 else
1000 with
1001 | Not_found ->
1002 let m = add_tags_to_map gen tags m in
1003 {r with others=MemMap.add n (pri,m) r.others}
1005 (* add transitions ts to new state r
1006 transitions in ts start from state pri and memory map m
1008 let apply_transitions gen r pri m ts =
1009 TransSet.fold
1010 (fun t r -> apply_transition gen r pri m t)
1011 ts r
1014 (* For a given nfa_state pos, refine char partition *)
1015 let rec split_env gen follow pos m s = function
1016 | [] -> (* Can occur ! because of non-matching regexp ([^'\000'-'\255']) *)
1018 | (s1,st1) as p::rem ->
1019 let here = Cset.inter s s1 in
1020 if Cset.is_empty here then
1021 p::split_env gen follow pos m s rem
1022 else
1023 let rest = Cset.diff s here in
1024 let rem =
1025 if Cset.is_empty rest then
1027 else
1028 split_env gen follow pos m rest rem
1029 and new_st = apply_transitions gen st1 pos m follow in
1030 let stay = Cset.diff s1 here in
1031 if Cset.is_empty stay then
1032 (here, new_st)::rem
1033 else
1034 (stay, st1)::(here, new_st)::rem
1037 (* For all nfa_state pos in a dfa state st *)
1038 let comp_shift gen chars follow st =
1039 MemMap.fold
1040 (fun pos (_,m) env -> split_env gen follow.(pos) pos m chars.(pos) env)
1041 st [Cset.all_chars_eof,dfa_state_empty]
1044 let reachs chars follow st =
1045 let gen = create_new_addr_gen () in
1046 (* build a association list (char set -> new state) *)
1047 let env = comp_shift gen chars follow st in
1048 (* change it into (char set -> new state_num) *)
1049 let env =
1050 List.map
1051 (fun (s,dfa_state) -> s,goto_state dfa_state) env in
1052 (* finally build the char indexed array -> new state num *)
1053 let shift = Cset.env_to_array env in
1054 shift
1057 let get_tag_mem n env t =
1059 TagMap.find t env.(n)
1060 with
1061 | Not_found -> assert false
1063 let do_tag_actions n env m =
1065 let used,r =
1066 TagMap.fold (fun t m (used,r) ->
1067 let a = get_tag_mem n env t in
1068 Ints.add a used,SetTag (a,m)::r) m (Ints.empty,[]) in
1069 let _,r =
1070 TagMap.fold
1071 (fun tag m (used,r) ->
1072 if not (Ints.mem m used) && tag.start then
1073 Ints.add m used, EraseTag m::r
1074 else
1075 used,r)
1076 env.(n) (used,r) in
1080 let translate_state shortest_match tags chars follow st =
1081 let (n,(_,m)) = st.final in
1082 if MemMap.empty = st.others then
1083 Perform (n,do_tag_actions n tags m)
1084 else if shortest_match then begin
1085 if n=no_action then
1086 Shift (No_remember,reachs chars follow st.others)
1087 else
1088 Perform(n, do_tag_actions n tags m)
1089 end else begin
1090 Shift (
1091 (if n = no_action then
1092 No_remember
1093 else
1094 Remember (n,do_tag_actions n tags m)),
1095 reachs chars follow st.others)
1099 let dtags chan tags =
1100 Tags.iter
1101 (fun t -> fprintf chan " %a" dtag t)
1102 tags
1104 let dtransset s =
1105 TransSet.iter
1106 (fun trans -> match trans with
1107 | OnChars i,tags ->
1108 eprintf " (-> %d,%a)" i dtags tags
1109 | ToAction i,tags ->
1110 eprintf " ([%d],%a)" i dtags tags)
1113 let dfollow t =
1114 eprintf "follow=[" ;
1115 for i = 0 to Array.length t-1 do
1116 eprintf "%d:" i ;
1117 dtransset t.(i)
1118 done ;
1119 prerr_endline "]"
1122 let make_tag_entry id start act a r = match a with
1123 | Sum (Mem m,0) ->
1124 TagMap.add {id=id ; start=start ; action=act} m r
1125 | _ -> r
1127 let extract_tags l =
1128 let envs = Array.create (List.length l) TagMap.empty in
1129 List.iter
1130 (fun (act,m,_) ->
1131 envs.(act) <-
1132 List.fold_right
1133 (fun ((name,_),v) r -> match v with
1134 | Ident_char (_,t) -> make_tag_entry name true act t r
1135 | Ident_string (_,t1,t2) ->
1136 make_tag_entry name true act t1
1137 (make_tag_entry name false act t2 r))
1138 m TagMap.empty)
1140 envs
1143 let make_dfa lexdef =
1144 let (chars, entry_list) = encode_lexdef lexdef in
1145 let follow = followpos (Array.length chars) entry_list in
1147 dfollow follow ;
1149 reset_state_mem () ;
1150 let r_states = ref [] in
1151 let initial_states =
1152 List.map
1153 (fun (le,args,shortest) ->
1154 let tags = extract_tags le.lex_actions in
1155 reset_cell_mem le.lex_mem_tags ;
1156 let pos_set = firstpos le.lex_regexp in
1158 prerr_string "trans={" ; dtransset pos_set ; prerr_endline "}" ;
1160 let init_state = create_init_state pos_set in
1161 let init_num = get_state init_state in
1162 r_states :=
1163 map_on_all_states
1164 (translate_state shortest tags chars follow) !r_states ;
1165 { auto_name = le.lex_name;
1166 auto_args = args ;
1167 auto_mem_size =
1168 (if !temp_pending then !next_mem_cell+1 else !next_mem_cell) ;
1169 auto_initial_state = init_num ;
1170 auto_actions = le.lex_actions })
1171 entry_list in
1172 let states = !r_states in
1174 prerr_endline "** states **" ;
1175 for i = 0 to !next_state_num-1 do
1176 eprintf "+++ %d +++\n" i ;
1177 dstate (Table.get state_table i) ;
1178 prerr_endline ""
1179 done ;
1180 eprintf "%d states\n" !next_state_num ;
1182 let actions = Array.create !next_state_num (Perform (0,[])) in
1183 List.iter (fun (act, i) -> actions.(i) <- act) states;
1184 reset_state_mem () ;
1185 reset_cell_mem 0 ;
1186 (initial_states, actions)