1 (**************************************************************************)
2 (* *)
3 (* OCaml *)
4 (* *)
5 (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *)
6 (* Luc Maranget, projet Moscova, INRIA Rocquencourt *)
7 (* *)
8 (* Copyright 1996 Institut National de Recherche en Informatique et *)
9 (* en Automatique. *)
10 (* *)
11 (* All rights reserved. This file is distributed under the terms of *)
12 (* the GNU Lesser General Public License version 2.1, with the *)
13 (* special exception on linking described in the file LICENSE. *)
14 (* *)
15 (**************************************************************************)
16
17 (* Compiling a lexer definition *)
18
19 open Syntax
20 (*open Printf*)
21
22 exception Memory_overflow
23
24 (* Deep abstract syntax for regular expressions *)
25
26 type ident = string * Syntax.location
27
28 type tag_info = {id : string ; start : bool ; action : int}
29
30 type regexp =
31 Empty
32 | Chars of int * bool
33 | Action of int
34 | Tag of tag_info
35 | Seq of regexp * regexp
36 | Alt of regexp * regexp
37 | Star of regexp
38
39 type tag_base = Start | End | Mem of int
40 type tag_addr = Sum of (tag_base * int)
41 type ident_info =
42 | Ident_string of bool * tag_addr * tag_addr
43 | Ident_char of bool * tag_addr
44 type t_env = (ident * ident_info) list
45
46 type ('args,'action) lexer_entry =
47 { lex_name: string;
48 lex_regexp: regexp;
49 lex_mem_tags: int ;
50 lex_actions: (int * t_env * 'action) list }
51
52
53 type automata =
54 Perform of int * tag_action list
55 | Shift of automata_trans * (automata_move * memory_action list) array
56
57 and automata_trans =
58 No_remember
59 | Remember of int * tag_action list
60
61 and automata_move =
62 Backtrack
63 | Goto of int
64
65 and memory_action =
66 | Copy of int * int
67 | Set of int
68
69 and tag_action = SetTag of int * int | EraseTag of int
70
71 (* Representation of entry points *)
72
73 type ('args,'action) automata_entry =
74 { auto_name: string;
75 auto_args: 'args ;
76 auto_mem_size : int ;
77 auto_initial_state: int * memory_action list;
78 auto_actions: (int * t_env * 'action) list }
79
80
81 (* A lot of sets and map structures *)
82
83 module Ints =
84 Set.Make(struct type t = int let compare (x:t) y = compare x y end)
85
86 let id_compare (id1,_) (id2,_) = String.compare id1 id2
87
88 let tag_compare t1 t2 = Stdlib.compare t1 t2
89
90 module Tags = Set.Make(struct type t = tag_info let compare = tag_compare end)
91
92 module TagMap =
93 Map.Make (struct type t = tag_info let compare = tag_compare end)
94
95 module IdSet =
96 Set.Make (struct type t = ident let compare = id_compare end)
97
98 (*********************)
99 (* Variable cleaning *)
100 (*********************)
101
102 (* Silently eliminate nested variables *)
103
104 let rec do_remove_nested to_remove = function
105 | Bind (e,x) ->
106 if IdSet.mem x to_remove then
107 do_remove_nested to_remove e
108 else
109 Bind (do_remove_nested (IdSet.add x to_remove) e, x)
110 | Epsilon|Eof|Characters _ as e -> e
111 | Sequence (e1, e2) ->
112 Sequence
113 (do_remove_nested to_remove e1, do_remove_nested to_remove e2)
114 | Alternative (e1, e2) ->
115 Alternative
116 (do_remove_nested to_remove e1, do_remove_nested to_remove e2)
117 | Repetition e ->
118 Repetition (do_remove_nested to_remove e)
119
120 let remove_nested_as e = do_remove_nested IdSet.empty e
121
122 (*********************)
123 (* Variable analysis *)
124 (*********************)
125
126 (*
127 Optional variables.
128 A variable is optional when matching of regexp does not
129 implies it binds.
130 The typical case is:
131 ("" | 'a' as x) -> optional
132 ("" as x | 'a' as x) -> non-optional
133 *)
134
135 let stringset_delta s1 s2 =
136 IdSet.union
137 (IdSet.diff s1 s2)
138 (IdSet.diff s2 s1)
139
140 let rec find_all_vars = function
141 | Characters _|Epsilon|Eof ->
142 IdSet.empty
143 | Bind (e,x) ->
144 IdSet.add x (find_all_vars e)
145 | Sequence (e1,e2)|Alternative (e1,e2) ->
146 IdSet.union (find_all_vars e1) (find_all_vars e2)
147 | Repetition e -> find_all_vars e
148
149
150 let rec do_find_opt = function
151 | Characters _|Epsilon|Eof -> IdSet.empty, IdSet.empty
152 | Bind (e,x) ->
153 let opt,all = do_find_opt e in
154 opt, IdSet.add x all
155 | Sequence (e1,e2) ->
156 let opt1,all1 = do_find_opt e1
157 and opt2,all2 = do_find_opt e2 in
158 IdSet.union opt1 opt2, IdSet.union all1 all2
159 | Alternative (e1,e2) ->
160 let opt1,all1 = do_find_opt e1
161 and opt2,all2 = do_find_opt e2 in
162 IdSet.union
163 (IdSet.union opt1 opt2)
164 (stringset_delta all1 all2),
165 IdSet.union all1 all2
166 | Repetition e ->
167 let r = find_all_vars e in
168 r,r
169
170 let find_optional e =
171 let r,_ = do_find_opt e in r
172
173 (*
174 Double variables
175 A variable is double when it can be bound more than once
176 in a single matching
177 The typical case is:
178 (e1 as x) (e2 as x)
179
180 *)
181
182 let rec do_find_double = function
183 | Characters _|Epsilon|Eof -> IdSet.empty, IdSet.empty
184 | Bind (e,x) ->
185 let dbl,all = do_find_double e in
186 (if IdSet.mem x all then
187 IdSet.add x dbl
188 else
189 dbl),
190 IdSet.add x all
191 | Sequence (e1,e2) ->
192 let dbl1, all1 = do_find_double e1
193 and dbl2, all2 = do_find_double e2 in
194 IdSet.union
195 (IdSet.inter all1 all2)
196 (IdSet.union dbl1 dbl2),
197 IdSet.union all1 all2
198 | Alternative (e1,e2) ->
199 let dbl1, all1 = do_find_double e1
200 and dbl2, all2 = do_find_double e2 in
201 IdSet.union dbl1 dbl2,
202 IdSet.union all1 all2
203 | Repetition e ->
204 let r = find_all_vars e in
205 r,r
206
207 let find_double e = do_find_double e
208
209 (*
210 Type of variables:
211 A variable is bound to a char when all its occurrences
212 bind a pattern of length 1.
213 The typical case is:
214 (_ as x) -> char
215 *)
216
217 let add_some x = function
218 | Some i -> Some (x+i)
219 | None -> None
220
221 let add_some_some x y = match x,y with
222 | Some i, Some j -> Some (i+j)
223 | _,_ -> None
224
225 let rec do_find_chars sz = function
226 | Epsilon|Eof -> IdSet.empty, IdSet.empty, sz
227 | Characters _ -> IdSet.empty, IdSet.empty, add_some 1 sz
228 | Bind (e,x) ->
229 let c,s,e_sz = do_find_chars (Some 0) e in
230 begin match e_sz with
231 | Some 1 ->
232 IdSet.add x c,s,add_some 1 sz
233 | _ ->
234 c, IdSet.add x s, add_some_some sz e_sz
235 end
236 | Sequence (e1,e2) ->
237 let c1,s1,sz1 = do_find_chars sz e1 in
238 let c2,s2,sz2 = do_find_chars sz1 e2 in
239 IdSet.union c1 c2,
240 IdSet.union s1 s2,
241 sz2
242 | Alternative (e1,e2) ->
243 let c1,s1,sz1 = do_find_chars sz e1
244 and c2,s2,sz2 = do_find_chars sz e2 in
245 IdSet.union c1 c2,
246 IdSet.union s1 s2,
247 (if sz1 = sz2 then sz1 else None)
248 | Repetition e -> do_find_chars None e
249
250
251
252 let find_chars e =
253 let c,s,_ = do_find_chars (Some 0) e in
254 IdSet.diff c s
255
256 (*******************************)
257 (* From shallow to deep syntax *)
258 (*******************************)
259
260 let chars = ref ([] : Cset.t list)
261 let chars_count = ref 0
262
263
264 let rec encode_regexp char_vars act = function
265 Epsilon -> Empty
266 | Characters cl ->
267 let n = !chars_count in
268 chars := cl :: !chars;
269 incr chars_count;
270 Chars(n,false)
271 | Eof ->
272 let n = !chars_count in
273 chars := Cset.eof :: !chars;
274 incr chars_count;
275 Chars(n,true)
276 | Sequence(r1,r2) ->
277 let r1 = encode_regexp char_vars act r1 in
278 let r2 = encode_regexp char_vars act r2 in
279 Seq (r1, r2)
280 | Alternative(r1,r2) ->
281 let r1 = encode_regexp char_vars act r1 in
282 let r2 = encode_regexp char_vars act r2 in
283 Alt(r1, r2)
284 | Repetition r ->
285 let r = encode_regexp char_vars act r in
286 Star r
287 | Bind (r,((name,_) as x)) ->
288 let r = encode_regexp char_vars act r in
289 if IdSet.mem x char_vars then
290 Seq (Tag {id=name ; start=true ; action=act},r)
291 else
292 Seq (Tag {id=name ; start=true ; action=act},
293 Seq (r, Tag {id=name ; start=false ; action=act}))
294
295
296 (* Optimisation,
297 Static optimization :
298 Replace tags by offsets relative to the beginning
299 or end of matched string.
300 Dynamic optimization:
301 Replace some non-optional, non-double tags by offsets w.r.t
302 a previous similar tag.
303 *)
304
305 let opt = true
306
307 let mk_seq r1 r2 = match r1,r2 with
308 | Empty,_ -> r2
309 | _,Empty -> r1
310 | _,_ -> Seq (r1,r2)
311
312 let add_pos p i = match p with
313 | Some (Sum (a,n)) -> Some (Sum (a,n+i))
314 | None -> None
315
316 let mem_name name id_set =
317 IdSet.exists (fun (id_name,_) -> name = id_name) id_set
318
319 let opt_regexp all_vars char_vars optional_vars double_vars r =
320
321 (* From removed tags to their addresses *)
322 let env = Hashtbl.create 17 in
323
324 (* First static optimizations, from start position *)
325 let rec size_forward pos = function
326 | Empty|Chars (_,true)|Tag _ -> Some pos
327 | Chars (_,false) -> Some (pos+1)
328 | Seq (r1,r2) ->
329 begin match size_forward pos r1 with
330 | None -> None
331 | Some pos -> size_forward pos r2
332 end
333 | Alt (r1,r2) ->
334 let pos1 = size_forward pos r1
335 and pos2 = size_forward pos r2 in
336 if pos1=pos2 then pos1 else None
337 | Star _ -> None
338 | Action _ -> assert false in
339
340 let rec simple_forward pos r = match r with
341 | Tag n ->
342 if mem_name n.id double_vars then
343 r,Some pos
344 else begin
345 Hashtbl.add env (n.id,n.start) (Sum (Start, pos)) ;
346 Empty,Some pos
347 end
348 | Empty -> r, Some pos
349 | Chars (_,is_eof) ->
350 r,Some (if is_eof then pos else pos+1)
351 | Seq (r1,r2) ->
352 let r1,pos = simple_forward pos r1 in
353 begin match pos with
354 | None -> mk_seq r1 r2,None
355 | Some pos ->
356 let r2,pos = simple_forward pos r2 in
357 mk_seq r1 r2,pos
358 end
359 | Alt (r1,r2) ->
360 let pos1 = size_forward pos r1
361 and pos2 = size_forward pos r2 in
362 r,(if pos1=pos2 then pos1 else None)
363 | Star _ -> r,None
364 | Action _ -> assert false in
365
366 (* Then static optimizations, from end position *)
367 let rec size_backward pos = function
368 | Empty|Chars (_,true)|Tag _ -> Some pos
369 | Chars (_,false) -> Some (pos-1)
370 | Seq (r1,r2) ->
371 begin match size_backward pos r2 with
372 | None -> None
373 | Some pos -> size_backward pos r1
374 end
375 | Alt (r1,r2) ->
376 let pos1 = size_backward pos r1
377 and pos2 = size_backward pos r2 in
378 if pos1=pos2 then pos1 else None
379 | Star _ -> None
380 | Action _ -> assert false in
381
382
383 let rec simple_backward pos r = match r with
384 | Tag n ->
385 if mem_name n.id double_vars then
386 r,Some pos
387 else begin
388 Hashtbl.add env (n.id,n.start) (Sum (End, pos)) ;
389 Empty,Some pos
390 end
391 | Empty -> r,Some pos
392 | Chars (_,is_eof) ->
393 r,Some (if is_eof then pos else pos-1)
394 | Seq (r1,r2) ->
395 let r2,pos = simple_backward pos r2 in
396 begin match pos with
397 | None -> mk_seq r1 r2,None
398 | Some pos ->
399 let r1,pos = simple_backward pos r1 in
400 mk_seq r1 r2,pos
401 end
402 | Alt (r1,r2) ->
403 let pos1 = size_backward pos r1
404 and pos2 = size_backward pos r2 in
405 r,(if pos1=pos2 then pos1 else None)
406 | Star _ -> r,None
407 | Action _ -> assert false in
408
409 let r =
410 if opt then
411 let r,_ = simple_forward 0 r in
412 let r,_ = simple_backward 0 r in
413 r
414 else
415 r in
416
417 let loc_count = ref 0 in
418 let get_tag_addr t =
419 try
420 Hashtbl.find env t
421 with
422 | Not_found ->
423 let n = !loc_count in
424 incr loc_count ;
425 Hashtbl.add env t (Sum (Mem n,0)) ;
426 Sum (Mem n,0) in
427
428 let rec alloc_exp pos r = match r with
429 | Tag n ->
430 if mem_name n.id double_vars then
431 r,pos
432 else begin match pos with
433 | Some a ->
434 Hashtbl.add env (n.id,n.start) a ;
435 Empty,pos
436 | None ->
437 let a = get_tag_addr (n.id,n.start) in
438 r,Some a
439 end
440
441 | Empty -> r,pos
442 | Chars (_,is_eof) -> r,(if is_eof then pos else add_pos pos 1)
443 | Seq (r1,r2) ->
444 let r1,pos = alloc_exp pos r1 in
445 let r2,pos = alloc_exp pos r2 in
446 mk_seq r1 r2,pos
447 | Alt (_,_) ->
448 let off = size_forward 0 r in
449 begin match off with
450 | Some i -> r,add_pos pos i
451 | None -> r,None
452 end
453 | Star _ -> r,None
454 | Action _ -> assert false in
455
456 let r,_ = alloc_exp None r in
457 let m =
458 IdSet.fold
459 (fun ((name,_) as x) r ->
460
461 let v =
462 if IdSet.mem x char_vars then
463 Ident_char
464 (IdSet.mem x optional_vars, get_tag_addr (name,true))
465 else
466 Ident_string
467 (IdSet.mem x optional_vars,
468 get_tag_addr (name,true),
469 get_tag_addr (name,false)) in
470 (x,v)::r)
471 all_vars [] in
472 m,r, !loc_count
473
474
475
476 let encode_casedef casedef =
477 let r =
478 List.fold_left
479 (fun (reg,actions,count,ntags) (expr, act) ->
480 let expr = remove_nested_as expr in
481 let char_vars = find_chars expr in
482 let r = encode_regexp char_vars count expr
483 and opt_vars = find_optional expr
484 and double_vars,all_vars = find_double expr in
485 let m,r,loc_ntags =
486 opt_regexp all_vars char_vars opt_vars double_vars r in
487 Alt(reg, Seq(r, Action count)),
488 (count, m ,act) :: actions,
489 (succ count),
490 max loc_ntags ntags)
491 (Empty, [], 0, 0)
492 casedef in
493 r
494
495 let encode_lexdef def =
496 chars := [];
497 chars_count := 0;
498 let entry_list =
499 List.map
500 (fun {name=entry_name; args=args; shortest=shortest; clauses=casedef} ->
501 let (re,actions,_,ntags) = encode_casedef casedef in
502 { lex_name = entry_name;
503 lex_regexp = re;
504 lex_mem_tags = ntags ;
505 lex_actions = List.rev actions },args,shortest)
506 def in
507 let chr = Array.of_list (List.rev !chars) in
508 chars := [];
509 (chr, entry_list)
510
511 (* To generate directly a NFA from a regular expression.
512 Confer Aho-Sethi-Ullman, dragon book, chap. 3
513 Extension to tagged automata.
514 Confer
515 Ville Larikari
516 'NFAs with Tagged Transitions, their Conversion to Deterministic
517 Automata and Application to Regular Expressions'.
518 Symposium on String Processing and Information Retrieval (SPIRE 2000),
519 http://kouli.iki.fi/~vlaurika/spire2000-tnfa.ps
520 (See also)
521 http://kouli.iki.fi/~vlaurika/regex-submatch.ps.gz
522 *)
523
524 type t_transition =
525 OnChars of int
526 | ToAction of int
527
528 type transition = t_transition * Tags.t
529
530 let trans_compare (t1,tags1) (t2,tags2) =
531 match Stdlib.compare t1 t2 with
532 | 0 -> Tags.compare tags1 tags2
533 | r -> r
534
535
536 module TransSet =
537 Set.Make(struct type t = transition let compare = trans_compare end)
538
539 let rec nullable = function
540 | Empty|Tag _ -> true
541 | Chars (_,_)|Action _ -> false
542 | Seq(r1,r2) -> nullable r1 && nullable r2
543 | Alt(r1,r2) -> nullable r1 || nullable r2
544 | Star _ -> true
545
546 let rec emptymatch = function
547 | Empty | Chars (_,_) | Action _ -> Tags.empty
548 | Tag t -> Tags.add t Tags.empty
549 | Seq (r1,r2) -> Tags.union (emptymatch r1) (emptymatch r2)
550 | Alt(r1,r2) ->
551 if nullable r1 then
552 emptymatch r1
553 else
554 emptymatch r2
555 | Star r ->
556 if nullable r then
557 emptymatch r
558 else
559 Tags.empty
560
561 let addtags transs tags =
562 TransSet.fold
563 (fun (t,tags_t) r -> TransSet.add (t, Tags.union tags tags_t) r)
564 transs TransSet.empty
565
566
567 let rec firstpos = function
568 Empty|Tag _ -> TransSet.empty
569 | Chars (pos,_) -> TransSet.add (OnChars pos,Tags.empty) TransSet.empty
570 | Action act -> TransSet.add (ToAction act,Tags.empty) TransSet.empty
571 | Seq(r1,r2) ->
572 if nullable r1 then
573 TransSet.union (firstpos r1) (addtags (firstpos r2) (emptymatch r1))
574 else
575 firstpos r1
576 | Alt(r1,r2) -> TransSet.union (firstpos r1) (firstpos r2)
577 | Star r -> firstpos r
578
579
580 (* Berry-Sethi followpos *)
581 let followpos size entry_list =
582 let v = Array.make size TransSet.empty in
583 let rec fill s = function
584 | Empty|Action _|Tag _ -> ()
585 | Chars (n,_) -> v.(n) <- s
586 | Alt (r1,r2) ->
587 fill s r1 ; fill s r2
588 | Seq (r1,r2) ->
589 fill
590 (if nullable r2 then
591 TransSet.union (firstpos r2) (addtags s (emptymatch r2))
592 else
593 (firstpos r2))
594 r1 ;
595 fill s r2
596 | Star r ->
597 fill (TransSet.union (firstpos r) s) r in
598 List.iter (fun (entry,_,_) -> fill TransSet.empty entry.lex_regexp)
599 entry_list;
600 v
601
602 (************************)
603 (* The algorithm itself *)
604 (************************)
605
606 let no_action = max_int
607
608 module StateSet =
609 Set.Make (struct type t = t_transition let compare = Stdlib.compare end)
610
611
612 module MemMap =
613 Map.Make (struct type t = int
614 let compare (x:t) y = Stdlib.compare x y end)
615
616 type 'a dfa_state =
617 {final : int * ('a * int TagMap.t) ;
618 others : ('a * int TagMap.t) MemMap.t}
619
620
621 (*
622 let dtag oc t =
623 fprintf oc "%s<%s>" t.id (if t.start then "s" else "e")
624
625 let dmem_map dp ds m =
626 MemMap.iter
627 (fun k x ->
628 eprintf "%d -> " k ; dp x ; ds ())
629 m
630
631 and dtag_map dp ds m =
632 TagMap.iter
633 (fun t x ->
634 dtag stderr t ; eprintf " -> " ; dp x ; ds ())
635 m
636
637 let dstate {final=(act,(_,m)) ; others=o} =
638 if act <> no_action then begin
639 eprintf "final=%d " act ;
640 dtag_map (fun x -> eprintf "%d" x) (fun () -> prerr_string " ,") m ;
641 prerr_endline ""
642 end ;
643 dmem_map
644 (fun (_,m) ->
645 dtag_map (fun x -> eprintf "%d" x) (fun () -> prerr_string " ,") m)
646 (fun () -> prerr_endline "")
647 o
648 *)
649
650
651 let dfa_state_empty =
652 {final=(no_action, (max_int,TagMap.empty)) ;
653 others=MemMap.empty}
654
655 and dfa_state_is_empty {final=(act,_) ; others=o} =
656 act = no_action &&
657 o = MemMap.empty
658
659
660 (* A key is an abstraction on a dfa state,
661 two states with the same key can be made the same by
662 copying some memory cells into others *)
663
664
665 module StateSetSet =
666 Set.Make (struct type t = StateSet.t let compare = StateSet.compare end)
667
668 type t_equiv = {tag:tag_info ; equiv:StateSetSet.t}
669
670 module MemKey =
671 Set.Make
672 (struct
673 type t = t_equiv
674
675 let compare e1 e2 = match Stdlib.compare e1.tag e2.tag with
676 | 0 -> StateSetSet.compare e1.equiv e2.equiv
677 | r -> r
678 end)
679
680 type dfa_key = {kstate : StateSet.t ; kmem : MemKey.t}
681
682 (* Map a state to its key *)
683 let env_to_class m =
684 let env1 =
685 MemMap.fold
686 (fun _ (tag,s) r ->
687 TagMap.update tag (function
688 | None -> Some (StateSetSet.singleton s)
689 | Some ss -> Some (StateSetSet.add s ss)
690 ) r)
691 m TagMap.empty in
692 TagMap.fold
693 (fun tag ss r -> MemKey.add {tag=tag ; equiv=ss} r)
694 env1 MemKey.empty
695
696
697 (* trans is nfa_state, m is associated memory map *)
698 let inverse_mem_map trans m r =
699 TagMap.fold
700 (fun tag addr r ->
701 MemMap.update addr (function
702 | None -> Some (tag, StateSet.singleton trans)
703 | Some (otag, s) ->
704 assert (tag = otag);
705 Some (tag, StateSet.add trans s)
706 ) r)
707 m r
708
709 let inverse_mem_map_other n (_,m) r = inverse_mem_map (OnChars n) m r
710
711 let get_key {final=(act,(_,m_act)) ; others=o} =
712 let env =
713 MemMap.fold inverse_mem_map_other
714 o
715 (if act = no_action then MemMap.empty
716 else inverse_mem_map (ToAction act) m_act MemMap.empty) in
717 let state_key =
718 MemMap.fold (fun n _ r -> StateSet.add (OnChars n) r) o
719 (if act=no_action then StateSet.empty
720 else StateSet.add (ToAction act) StateSet.empty) in
721 let mem_key = env_to_class env in
722 {kstate = state_key ; kmem = mem_key}
723
724
725 let key_compare k1 k2 = match StateSet.compare k1.kstate k2.kstate with
726 | 0 -> MemKey.compare k1.kmem k2.kmem
727 | r -> r
728
729 (* Association dfa_state -> state_num *)
730
731 module StateMap =
732 Map.Make(struct type t = dfa_key let compare = key_compare end)
733
734 let state_map = ref (StateMap.empty : int StateMap.t)
735 let todo = Stack.create()
736 let next_state_num = ref 0
737 let next_mem_cell = ref 0
738 let temp_pending = ref false
739 let tag_cells = Hashtbl.create 17
740 let state_table = Table.create dfa_state_empty
741
742
743 (* Initial reset of state *)
744 let reset_state () =
745 Stack.clear todo;
746 next_state_num := 0 ;
747 let _ = Table.trim state_table in
748 ()
749
750 (* Reset state before processing a given automata.
751 We clear both the memory mapping and
752 the state mapping, as state sharing between different
753 automata may lead to incorrect estimation of the cell memory size
754 BUG ID 0004517 *)
755
756
757 let reset_state_partial ntags =
758 next_mem_cell := ntags ;
759 Hashtbl.clear tag_cells ;
760 temp_pending := false ;
761 state_map := StateMap.empty
762
763 let do_alloc_temp () =
764 temp_pending := true ;
765 let n = !next_mem_cell in
766 n
767
768 let do_alloc_cell used t =
769 let available =
770 try Hashtbl.find tag_cells t with Not_found -> Ints.empty in
771 try
772 Ints.choose (Ints.diff available used)
773 with
774 | Not_found ->
775 temp_pending := false ;
776 let n = !next_mem_cell in
777 if n >= 255 then raise Memory_overflow ;
778 Hashtbl.replace tag_cells t (Ints.add n available) ;
779 incr next_mem_cell ;
780 n
781
782 let is_old_addr a = a >= 0
783 and is_new_addr a = a < 0
784
785 let old_in_map m r =
786 TagMap.fold
787 (fun _ addr r ->
788 if is_old_addr addr then
789 Ints.add addr r
790 else
791 r)
792 m r
793
794 let alloc_map used m mvs =
795 TagMap.fold
796 (fun tag a (r,mvs) ->
797 let a,mvs =
798 if is_new_addr a then
799 let a = do_alloc_cell used tag in
800 a,Ints.add a mvs
801 else a,mvs in
802 TagMap.add tag a r,mvs)
803 m (TagMap.empty,mvs)
804
805 let create_new_state {final=(act,(_,m_act)) ; others=o} =
806 let used =
807 MemMap.fold (fun _ (_,m) r -> old_in_map m r)
808 o (old_in_map m_act Ints.empty) in
809
810 let new_m_act,mvs = alloc_map used m_act Ints.empty in
811 let new_o,mvs =
812 MemMap.fold (fun k (x,m) (r,mvs) ->
813 let m,mvs = alloc_map used m mvs in
814 MemMap.add k (x,m) r,mvs)
815 o (MemMap.empty,mvs) in
816 {final=(act,(0,new_m_act)) ; others=new_o},
817 Ints.fold (fun x r -> Set x::r) mvs []
818
819 type new_addr_gen = {mutable count : int ; mutable env : int TagMap.t}
820
821 let create_new_addr_gen () = {count = -1 ; env = TagMap.empty}
822
823 let alloc_new_addr tag r =
824 try
825 TagMap.find tag r.env
826 with
827 | Not_found ->
828 let a = r.count in
829 r.count <- a-1 ;
830 r.env <- TagMap.add tag a r.env ;
831 a
832
833
834 let create_mem_map tags gen =
835 Tags.fold
836 (fun tag r -> TagMap.add tag (alloc_new_addr tag gen) r)
837 tags TagMap.empty
838
839 let create_init_state pos =
840 let gen = create_new_addr_gen () in
841 let st =
842 TransSet.fold
843 (fun (t,tags) st ->
844 match t with
845 | ToAction n ->
846 let on,_otags = st.final in
847 if n < on then
848 {st with final = (n, (0,create_mem_map tags gen))}
849 else
850 st
851 | OnChars n ->
852 try
853 let _ = MemMap.find n st.others in assert false
854 with
855 | Not_found ->
856 {st with others =
857 MemMap.add n (0,create_mem_map tags gen) st.others})
858 pos dfa_state_empty in
859 st
860
861
862 let get_map t st = match t with
863 | ToAction _ -> let _,(_,m) = st.final in m
864 | OnChars n ->
865 let (_,m) = MemMap.find n st.others in
866 m
867
868 let dest = function | Copy (d,_) | Set d -> d
869 and orig = function | Copy (_,o) -> o | Set _ -> -1
870
871 (*
872 let pmv oc mv = fprintf oc "%d <- %d" (dest mv) (orig mv)
873 let pmvs oc mvs =
874 List.iter (fun mv -> fprintf oc "%a " pmv mv) mvs ;
875 output_char oc '\n' ; flush oc
876 *)
877
878
879 (* Topological sort << a la louche >> *)
880 let sort_mvs mvs =
881 let rec do_rec r mvs = match mvs with
882 | [] -> r
883 | _ ->
884 let dests =
885 List.fold_left
886 (fun r mv -> Ints.add (dest mv) r)
887 Ints.empty mvs in
888 let rem,here =
889 List.partition
890 (fun mv -> Ints.mem (orig mv) dests)
891 mvs in
892 match here with
893 | [] ->
894 begin match rem with
895 | Copy (d,_)::_ ->
896 let d' = do_alloc_temp () in
897 Copy (d',d)::
898 do_rec r
899 (List.map
900 (fun mv ->
901 if orig mv = d then
902 Copy (dest mv,d')
903 else
904 mv)
905 rem)
906 | _ -> assert false
907 end
908 | _ -> do_rec (here@r) rem in
909 do_rec [] mvs
910
911 let move_to mem_key src tgt =
912 let mvs =
913 MemKey.fold
914 (fun {tag=tag ; equiv=m} r ->
915 StateSetSet.fold
916 (fun s r ->
917 try
918 let t = StateSet.choose s in
919 let src = TagMap.find tag (get_map t src)
920 and tgt = TagMap.find tag (get_map t tgt) in
921 if src <> tgt then begin
922 if is_new_addr src then
923 Set tgt::r
924 else
925 Copy (tgt, src)::r
926 end else
927 r
928 with
929 | Not_found -> assert false)
930 m r)
931 mem_key [] in
932 (* Moves are topologically sorted *)
933 sort_mvs mvs
934
935
936 let get_state st =
937 let key = get_key st in
938 try
939 let num = StateMap.find key !state_map in
940 num,move_to key.kmem st (Table.get state_table num)
941 with Not_found ->
942 let num = !next_state_num in
943 incr next_state_num;
944 let st,mvs = create_new_state st in
945 Table.emit state_table st ;
946 state_map := StateMap.add key num !state_map;
947 Stack.push (st, num) todo;
948 num,mvs
949
950 let map_on_all_states f old_res =
951 let res = ref old_res in
952 begin try
953 while true do
954 let (st, i) = Stack.pop todo in
955 let r = f st in
956 res := (r, i) :: !res
957 done
958 with Stack.Empty -> ()
959 end;
960 !res
961
962 let goto_state st =
963 if
964 dfa_state_is_empty st
965 then
966 Backtrack,[]
967 else
968 let n,moves = get_state st in
969 Goto n,moves
970
971 (****************************)
972 (* compute reachable states *)
973 (****************************)
974
975 let add_tags_to_map gen tags m =
976 Tags.fold
977 (fun tag m ->
978 let m = TagMap.remove tag m in
979 TagMap.add tag (alloc_new_addr tag gen) m)
980 tags m
981
982 let apply_transition gen r pri m = function
983 | ToAction n,tags ->
984 let on,(opri,_) = r.final in
985 if n < on || (on=n && pri < opri) then
986 let m = add_tags_to_map gen tags m in
987 {r with final=n,(pri,m)}
988 else r
989 | OnChars n,tags ->
990 try
991 let (opri,_) = MemMap.find n r.others in
992 if pri < opri then
993 let m = add_tags_to_map gen tags m in
994 {r with others=MemMap.add n (pri,m) (MemMap.remove n r.others)}
995 else
996 r
997 with
998 | Not_found ->
999 let m = add_tags_to_map gen tags m in
1000 {r with others=MemMap.add n (pri,m) r.others}
1001
1002 (* add transitions ts to new state r
1003 transitions in ts start from state pri and memory map m
1004 *)
1005 let apply_transitions gen r pri m ts =
1006 TransSet.fold
1007 (fun t r -> apply_transition gen r pri m t)
1008 ts r
1009
1010
1011 (* For a given nfa_state pos, refine char partition *)
1012 let rec split_env gen follow pos m s = function
1013 | [] -> (* Can occur ! because of non-matching regexp ([^'\000'-'\255']) *)
1014 []
1015 | (s1,st1) as p::rem ->
1016 let here = Cset.inter s s1 in
1017 if Cset.is_empty here then
1018 p::split_env gen follow pos m s rem
1019 else
1020 let rest = Cset.diff s here in
1021 let rem =
1022 if Cset.is_empty rest then
1023 rem
1024 else
1025 split_env gen follow pos m rest rem
1026 and new_st = apply_transitions gen st1 pos m follow in
1027 let stay = Cset.diff s1 here in
1028 if Cset.is_empty stay then
1029 (here, new_st)::rem
1030 else
1031 (stay, st1)::(here, new_st)::rem
1032
1033
1034 (* For all nfa_state pos in a dfa state st *)
1035 let comp_shift gen chars follow st =
1036 MemMap.fold
1037 (fun pos (_,m) env -> split_env gen follow.(pos) pos m chars.(pos) env)
1038 st [Cset.all_chars_eof,dfa_state_empty]
1039
1040
1041 let reachs chars follow st =
1042 let gen = create_new_addr_gen () in
1043 (* build an association list (char set -> new state) *)
1044 let env = comp_shift gen chars follow st in
1045 (* change it into (char set -> new state_num) *)
1046 let env =
1047 List.map
1048 (fun (s,dfa_state) -> s,goto_state dfa_state) env in
1049 (* finally build the char indexed array -> new state num *)
1050 let shift = Cset.env_to_array env in
1051 shift
1052
1053
1054 let get_tag_mem n env t =
1055 try
1056 TagMap.find t env.(n)
1057 with
1058 | Not_found -> assert false
1059
1060 let do_tag_actions n env m =
1061
1062 let used,r =
1063 TagMap.fold (fun t m (used,r) ->
1064 let a = get_tag_mem n env t in
1065 Ints.add a used,SetTag (a,m)::r) m (Ints.empty,[]) in
1066 let _,r =
1067 TagMap.fold
1068 (fun tag m (used,r) ->
1069 if not (Ints.mem m used) && tag.start then
1070 Ints.add m used, EraseTag m::r
1071 else
1072 used,r)
1073 env.(n) (used,r) in
1074 r
1075
1076
1077 let translate_state shortest_match tags chars follow st =
1078 let (n,(_,m)) = st.final in
1079 if MemMap.empty = st.others then
1080 Perform (n,do_tag_actions n tags m)
1081 else if shortest_match then begin
1082 if n=no_action then
1083 Shift (No_remember,reachs chars follow st.others)
1084 else
1085 Perform(n, do_tag_actions n tags m)
1086 end else begin
1087 Shift (
1088 (if n = no_action then
1089 No_remember
1090 else
1091 Remember (n,do_tag_actions n tags m)),
1092 reachs chars follow st.others)
1093 end
1094
1095 (*
1096 let dtags chan tags =
1097 Tags.iter
1098 (fun t -> fprintf chan " %a" dtag t)
1099 tags
1100
1101 let dtransset s =
1102 TransSet.iter
1103 (fun trans -> match trans with
1104 | OnChars i,tags ->
1105 eprintf " (-> %d,%a)" i dtags tags
1106 | ToAction i,tags ->
1107 eprintf " ([%d],%a)" i dtags tags)
1108 s
1109
1110 let dfollow t =
1111 eprintf "follow=[" ;
1112 for i = 0 to Array.length t-1 do
1113 eprintf "%d:" i ;
1114 dtransset t.(i)
1115 done ;
1116 prerr_endline "]"
1117 *)
1118
1119
1120 let make_tag_entry id start act a r = match a with
1121 | Sum (Mem m,0) ->
1122 TagMap.add {id=id ; start=start ; action=act} m r
1123 | _ -> r
1124
1125 let extract_tags l =
1126 let envs = Array.make (List.length l) TagMap.empty in
1127 List.iter
1128 (fun (act,m,_) ->
1129 envs.(act) <-
1130 List.fold_right
1131 (fun ((name,_),v) r -> match v with
1132 | Ident_char (_,t) -> make_tag_entry name true act t r
1133 | Ident_string (_,t1,t2) ->
1134 make_tag_entry name true act t1
1135 (make_tag_entry name false act t2 r))
1136 m TagMap.empty)
1137 l ;
1138 envs
1139
1140
1141 let make_dfa lexdef =
1142 let (chars, entry_list) = encode_lexdef lexdef in
1143 let follow = followpos (Array.length chars) entry_list in
1144 (*
1145 dfollow follow ;
1146 *)
1147 reset_state () ;
1148 let r_states = ref [] in
1149 let initial_states =
1150 List.map
1151 (fun (le,args,shortest) ->
1152 let tags = extract_tags le.lex_actions in
1153 reset_state_partial le.lex_mem_tags ;
1154 let pos_set = firstpos le.lex_regexp in
1155 (*
1156 prerr_string "trans={" ; dtransset pos_set ; prerr_endline "}" ;
1157 *)
1158 let init_state = create_init_state pos_set in
1159 let init_num = get_state init_state in
1160 r_states :=
1161 map_on_all_states
1162 (translate_state shortest tags chars follow) !r_states ;
1163 { auto_name = le.lex_name;
1164 auto_args = args ;
1165 auto_mem_size =
1166 (if !temp_pending then !next_mem_cell+1 else !next_mem_cell) ;
1167 auto_initial_state = init_num ;
1168 auto_actions = le.lex_actions })
1169 entry_list in
1170 let states = !r_states in
1171 (*
1172 prerr_endline "** states **" ;
1173 for i = 0 to !next_state_num-1 do
1174 eprintf "+++ %d +++\n" i ;
1175 dstate (Table.get state_table i) ;
1176 prerr_endline ""
1177 done ;
1178 eprintf "%d states\n" !next_state_num ;
1179 *)
1180 let actions = Array.make !next_state_num (Perform (0,[])) in
1181 List.iter (fun (act, i) -> actions.(i) <- act) states;
1182 (* Useless state reset, so as to restrict GC roots *)
1183 reset_state () ;
1184 reset_state_partial 0 ;
1185 (initial_states, actions)
1186